add code

add second loop
changed w&b report link (#6387 )
2026-02-14 14:55:26 +08:00 · 2024-01-04 16:22:47 +05:30 · 2024-01-04 14:53:33 +05:30 · 2023-12-29 19:49:11 +05:30 · 2023-12-29 09:38:50 +05:30 · 2023-12-29 09:33:49 +05:30
449 changed files with 51065 additions and 8695 deletions
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -0,0 +1,52 @@
+name: Benchmarking tests
+
+on:
+  schedule:
+    - cron: "30 1 1,15 * *" # every 2 weeks on the 1st and the 15th of every month at 1:30 AM
+
+env:
+  DIFFUSERS_IS_CI: yes
+  HF_HOME: /mnt/cache
+  OMP_NUM_THREADS: 8
+  MKL_NUM_THREADS: 8
+
+jobs:
+  torch_pipelines_cuda_benchmark_tests:
+    name: Torch Core Pipelines CUDA Benchmarking Tests
+    strategy:
+      fail-fast: false
+      max-parallel: 1
+    runs-on: [single-gpu, nvidia-gpu, a10, ci]
+    container:
+      image: diffusers/diffusers-pytorch-cuda
+      options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0
+    steps:
+      - name: Checkout diffusers
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 2
+      - name: NVIDIA-SMI
+        run: |
+          nvidia-smi
+      - name: Install dependencies
+        run: |
+          apt-get update && apt-get install libsndfile1-dev libgl1 -y
+          python -m pip install -e .[quality,test]
+          python -m pip install pandas
+      - name: Environment
+        run: |
+          python utils/print_env.py
+      - name: Diffusers Benchmarking
+        env:
+            HUGGING_FACE_HUB_TOKEN: ${{ secrets.DIFFUSERS_BOT_TOKEN }}
+            BASE_PATH: benchmark_outputs
+        run: |
+          export TOTAL_GPU_MEMORY=$(python -c "import torch; print(torch.cuda.get_device_properties(0).total_memory / (1024**3))")
+          cd benchmarks && mkdir ${BASE_PATH} && python run_all.py && python push_results.py
+
+      - name: Test suite reports artifacts
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v2
+        with:
+          name: benchmark_test_reports
+          path: benchmarks/benchmark_outputs
--- a/.github/workflows/delete_doc_comment.yml
+++ b/.github/workflows/delete_doc_comment.yml
@@ -1,14 +0,0 @@
-name: Delete doc comment
-
-on:
-  workflow_run:
-    workflows: ["Delete doc comment trigger"]
-    types:
-      - completed
-
-
-jobs:
-  delete:
-    uses: huggingface/doc-builder/.github/workflows/delete_doc_comment.yml@main
-    secrets:
-      comment_bot_token: ${{ secrets.COMMENT_BOT_TOKEN }}
--- a/.github/workflows/delete_doc_comment_trigger.yml
+++ b/.github/workflows/delete_doc_comment_trigger.yml
@@ -1,12 +0,0 @@
-name: Delete doc comment trigger
-
-on:
-  pull_request:
-    types: [ closed ]
-
-
-jobs:
-  delete:
-    uses: huggingface/doc-builder/.github/workflows/delete_doc_comment_trigger.yml@main
-    with:
-      pr_number: ${{ github.event.number }}
--- a/.github/workflows/pr_test_fetcher.yml
+++ b/.github/workflows/pr_test_fetcher.yml
@@ -0,0 +1,170 @@
+name: Fast tests for PRs - Test Fetcher
+
+on: workflow_dispatch
+
+env:
+  DIFFUSERS_IS_CI: yes
+  OMP_NUM_THREADS: 4
+  MKL_NUM_THREADS: 4
+  PYTEST_TIMEOUT: 60
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
+jobs:
+  setup_pr_tests:
+    name: Setup PR Tests
+    runs-on: docker-cpu
+    container:
+      image: diffusers/diffusers-pytorch-cpu
+      options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/
+    defaults:
+      run:
+        shell: bash
+    outputs:
+      matrix: ${{ steps.set_matrix.outputs.matrix }}
+      test_map: ${{ steps.set_matrix.outputs.test_map }}
+    steps:
+    - name: Checkout diffusers
+      uses: actions/checkout@v3
+      with:
+        fetch-depth: 0
+    - name: Install dependencies
+      run: |
+        apt-get update && apt-get install libsndfile1-dev libgl1 -y
+        python -m pip install -e .[quality,test]
+    - name: Environment
+      run: |
+        python utils/print_env.py
+        echo $(git --version)
+    - name: Fetch Tests
+      run: |
+        python utils/tests_fetcher.py | tee test_preparation.txt
+    - name: Report fetched tests
+      uses: actions/upload-artifact@v3
+      with:
+        name: test_fetched
+        path: test_preparation.txt
+    - id: set_matrix
+      name: Create Test Matrix
+      # The `keys` is used as GitHub actions matrix for jobs, i.e. `models`, `pipelines`, etc.
+      # The `test_map` is used to get the actual identified test files under each key.
+      # If no test to run (so no `test_map.json` file), create a dummy map (empty matrix will fail)
+      run: |
+        if [ -f test_map.json ]; then
+            keys=$(python3 -c 'import json; fp = open("test_map.json"); test_map = json.load(fp); fp.close(); d = list(test_map.keys()); print(json.dumps(d))')
+            test_map=$(python3 -c 'import json; fp = open("test_map.json"); test_map = json.load(fp); fp.close(); print(json.dumps(test_map))')
+        else
+            keys=$(python3 -c 'keys = ["dummy"]; print(keys)')
+            test_map=$(python3 -c 'test_map = {"dummy": []}; print(test_map)')
+        fi
+        echo $keys
+        echo $test_map
+        echo "matrix=$keys" >> $GITHUB_OUTPUT
+        echo "test_map=$test_map" >> $GITHUB_OUTPUT
+
+  run_pr_tests:
+    name: Run PR Tests
+    needs: setup_pr_tests
+    if: contains(fromJson(needs.setup_pr_tests.outputs.matrix), 'dummy') != true
+    strategy:
+      fail-fast: false
+      max-parallel: 2
+      matrix:
+        modules: ${{ fromJson(needs.setup_pr_tests.outputs.matrix) }}
+    runs-on: docker-cpu
+    container:
+      image: diffusers/diffusers-pytorch-cpu
+      options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/
+    defaults:
+      run:
+        shell: bash
+    steps:
+    - name: Checkout diffusers
+      uses: actions/checkout@v3
+      with:
+        fetch-depth: 2
+
+    - name: Install dependencies
+      run: |
+        apt-get update && apt-get install libsndfile1-dev libgl1 -y
+        python -m pip install -e .[quality,test]
+        python -m pip install accelerate
+
+    - name: Environment
+      run: |
+        python utils/print_env.py
+
+    - name: Run all selected tests on CPU
+      run: |
+        python -m pytest -n 2 --dist=loadfile -v --make-reports=${{ matrix.modules }}_tests_cpu ${{ fromJson(needs.setup_pr_tests.outputs.test_map)[matrix.modules] }}
+
+    - name: Failure short reports
+      if: ${{ failure() }}
+      continue-on-error: true
+      run: |
+        cat reports/${{ matrix.modules }}_tests_cpu_stats.txt
+        cat reports/${{ matrix.modules }}_tests_cpu_failures_short.txt
+
+    - name: Test suite reports artifacts
+      if: ${{ always() }}
+      uses: actions/upload-artifact@v3
+      with:
+          name: ${{ matrix.modules }}_test_reports
+          path: reports
+
+  run_staging_tests:
+    strategy:
+      fail-fast: false
+      matrix:
+        config:
+          - name: Hub tests for models, schedulers, and pipelines
+            framework: hub_tests_pytorch
+            runner: docker-cpu
+            image: diffusers/diffusers-pytorch-cpu
+            report: torch_hub
+
+    name: ${{ matrix.config.name }}
+    runs-on: ${{ matrix.config.runner }}
+    container:
+      image: ${{ matrix.config.image }}
+      options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/
+
+    defaults:
+      run:
+        shell: bash
+
+    steps:
+    - name: Checkout diffusers
+      uses: actions/checkout@v3
+      with:
+        fetch-depth: 2
+
+    - name: Install dependencies
+      run: |
+        apt-get update && apt-get install libsndfile1-dev libgl1 -y
+        python -m pip install -e .[quality,test]
+
+    - name: Environment
+      run: |
+        python utils/print_env.py
+
+    - name: Run Hub tests for models, schedulers, and pipelines on a staging env
+      if: ${{ matrix.config.framework == 'hub_tests_pytorch' }}
+      run: |
+        HUGGINGFACE_CO_STAGING=true python -m pytest \
+          -m "is_staging_test" \
+          --make-reports=tests_${{ matrix.config.report }} \
+          tests
+
+    - name: Failure short reports
+      if: ${{ failure() }}
+      run: cat reports/tests_${{ matrix.config.report }}_failures_short.txt
+
+    - name: Test suite reports artifacts
+      if: ${{ always() }}
+      uses: actions/upload-artifact@v2
+      with:
+        name: pr_${{ matrix.config.report }}_test_reports
+        path: reports
--- a/.github/workflows/pr_tests.yml
+++ b/.github/workflows/pr_tests.yml
@@ -113,9 +113,10 @@ jobs:
    - name: Run example PyTorch CPU tests
      if: ${{ matrix.config.framework == 'pytorch_examples' }}
      run: |
+        python -m pip install peft
        python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \
          --make-reports=tests_${{ matrix.config.report }} \
-          examples/test_examples.py
+          examples

    - name: Failure short reports
      if: ${{ failure() }}
--- a/.github/workflows/push_tests_fast.yml
+++ b/.github/workflows/push_tests_fast.yml
@@ -5,6 +5,10 @@ on:
    branches:
      - main

+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
 env:
  DIFFUSERS_IS_CI: yes
  HF_HOME: /mnt/cache
@@ -94,9 +98,10 @@ jobs:
    - name: Run example PyTorch CPU tests
      if: ${{ matrix.config.framework == 'pytorch_examples' }}
      run: |
+        python -m pip install peft
        python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \
          --make-reports=tests_${{ matrix.config.report }} \
-          examples/test_examples.py 
+          examples

    - name: Failure short reports
      if: ${{ failure() }}
--- a/.github/workflows/push_tests_mps.yml
+++ b/.github/workflows/push_tests_mps.yml
@@ -13,6 +13,10 @@ env:
  PYTEST_TIMEOUT: 600
  RUN_SLOW: no

+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
 jobs:
  run_fast_tests_apple_m1:
    name: Fast PyTorch MPS tests on MacOS
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -355,7 +355,7 @@ You will need basic `git` proficiency to be able to contribute to
 manual. Type `git --help` in a shell and enjoy. If you prefer books, [Pro
 Git](https://git-scm.com/book/en/v2) is a very good reference.

-Follow these steps to start contributing ([supported Python versions](https://github.com/huggingface/diffusers/blob/main/setup.py#L244)):
+Follow these steps to start contributing ([supported Python versions](https://github.com/huggingface/diffusers/blob/main/setup.py#L265)):

 1. Fork the [repository](https://github.com/huggingface/diffusers) by
 clicking on the 'Fork' button on the repository's page. This creates a copy of the code
--- a/4
+++ b/4
@@ -3,7 +3,7 @@
 # make sure to test the local checkout in scripts and not the pre-installed one (don't use quotes!)
 export PYTHONPATH = src

-check_dirs := examples scripts src tests utils
+check_dirs := examples scripts src tests utils benchmarks

 modified_only_fixup:
 	$(eval modified_py_files := $(shell python utils/get_modified_files.py $(check_dirs)))
@@ -41,7 +41,7 @@ repo-consistency:

 quality:
 	ruff check $(check_dirs) setup.py
-	ruff format --check $(check_dirs) setup.py 
+	ruff format --check $(check_dirs) setup.py
 	python utils/check_doc_toc.py

 # Format source code automatically and check is there are any problems left that need manual fixing
--- a/PHILOSOPHY.md
+++ b/PHILOSOPHY.md
@@ -82,7 +82,7 @@ Models are designed as configurable toolboxes that are natural extensions of [Py
 The following design principles are followed:
 - Models correspond to **a type of model architecture**. *E.g.* the [`UNet2DConditionModel`] class is used for all UNet variations that expect 2D image inputs and are conditioned on some context.
 - All models can be found in [`src/diffusers/models`](https://github.com/huggingface/diffusers/tree/main/src/diffusers/models) and every model architecture shall be defined in its file, e.g. [`unet_2d_condition.py`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/unet_2d_condition.py), [`transformer_2d.py`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/transformer_2d.py), etc...
- Models **do not** follow the single-file policy and should make use of smaller model building blocks, such as [`attention.py`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention.py), [`resnet.py`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/resnet.py), [`embeddings.py`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/embeddings.py), etc... **Note**: This is in stark contrast to Transformers' modelling files and shows that models do not really follow the single-file policy.
+- Models **do not** follow the single-file policy and should make use of smaller model building blocks, such as [`attention.py`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention.py), [`resnet.py`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/resnet.py), [`embeddings.py`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/embeddings.py), etc... **Note**: This is in stark contrast to Transformers' modeling files and shows that models do not really follow the single-file policy.
 - Models intend to expose complexity, just like PyTorch's `Module` class, and give clear error messages.
 - Models all inherit from `ModelMixin` and `ConfigMixin`.
 - Models can be optimized for performance when it doesn’t demand major code changes, keep backward compatibility, and give significant memory or compute gain.
--- a/README.md
+++ b/README.md
@@ -77,7 +77,7 @@ Please refer to the [How to use Stable Diffusion in Apple Silicon](https://huggi

 ## Quickstart

-Generating outputs is super easy with 🤗 Diffusers. To generate an image from text, use the `from_pretrained` method to load any pretrained diffusion model (browse the [Hub](https://huggingface.co/models?library=diffusers&sort=downloads) for 15000+ checkpoints):
+Generating outputs is super easy with 🤗 Diffusers. To generate an image from text, use the `from_pretrained` method to load any pretrained diffusion model (browse the [Hub](https://huggingface.co/models?library=diffusers&sort=downloads) for 16000+ checkpoints):

 ```python
 from diffusers import DiffusionPipeline
@@ -219,7 +219,7 @@ Also, say 👋 in our public Discord channel <a href="https://discord.gg/G7tWnz9
 - https://github.com/deep-floyd/IF
 - https://github.com/bentoml/BentoML
 - https://github.com/bmaltais/kohya_ss
- +6000 other amazing GitHub repositories 💪
+- +7000 other amazing GitHub repositories 💪

 Thank you for using us ❤️.

--- a/benchmarks/base_classes.py
+++ b/benchmarks/base_classes.py
@@ -0,0 +1,316 @@
+import os
+import sys
+
+import torch
+
+from diffusers import (
+    AutoPipelineForImage2Image,
+    AutoPipelineForInpainting,
+    AutoPipelineForText2Image,
+    ControlNetModel,
+    LCMScheduler,
+    StableDiffusionAdapterPipeline,
+    StableDiffusionControlNetPipeline,
+    StableDiffusionXLAdapterPipeline,
+    StableDiffusionXLControlNetPipeline,
+    T2IAdapter,
+    WuerstchenCombinedPipeline,
+)
+from diffusers.utils import load_image
+
+
+sys.path.append(".")
+
+from utils import (  # noqa: E402
+    BASE_PATH,
+    PROMPT,
+    BenchmarkInfo,
+    benchmark_fn,
+    bytes_to_giga_bytes,
+    flush,
+    generate_csv_dict,
+    write_to_csv,
+)
+
+
+RESOLUTION_MAPPING = {
+    "runwayml/stable-diffusion-v1-5": (512, 512),
+    "lllyasviel/sd-controlnet-canny": (512, 512),
+    "diffusers/controlnet-canny-sdxl-1.0": (1024, 1024),
+    "TencentARC/t2iadapter_canny_sd14v1": (512, 512),
+    "TencentARC/t2i-adapter-canny-sdxl-1.0": (1024, 1024),
+    "stabilityai/stable-diffusion-2-1": (768, 768),
+    "stabilityai/stable-diffusion-xl-base-1.0": (1024, 1024),
+    "stabilityai/stable-diffusion-xl-refiner-1.0": (1024, 1024),
+    "stabilityai/sdxl-turbo": (512, 512),
+}
+
+
+class BaseBenchmak:
+    pipeline_class = None
+
+    def __init__(self, args):
+        super().__init__()
+
+    def run_inference(self, args):
+        raise NotImplementedError
+
+    def benchmark(self, args):
+        raise NotImplementedError
+
+    def get_result_filepath(self, args):
+        pipeline_class_name = str(self.pipe.__class__.__name__)
+        name = (
+            args.ckpt.replace("/", "_")
+            + "_"
+            + pipeline_class_name
+            + f"-bs@{args.batch_size}-steps@{args.num_inference_steps}-mco@{args.model_cpu_offload}-compile@{args.run_compile}.csv"
+        )
+        filepath = os.path.join(BASE_PATH, name)
+        return filepath
+
+
+class TextToImageBenchmark(BaseBenchmak):
+    pipeline_class = AutoPipelineForText2Image
+
+    def __init__(self, args):
+        pipe = self.pipeline_class.from_pretrained(args.ckpt, torch_dtype=torch.float16)
+        pipe = pipe.to("cuda")
+
+        if args.run_compile:
+            if not isinstance(pipe, WuerstchenCombinedPipeline):
+                pipe.unet.to(memory_format=torch.channels_last)
+                print("Run torch compile")
+                pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
+
+                if hasattr(pipe, "movq") and getattr(pipe, "movq", None) is not None:
+                    pipe.movq.to(memory_format=torch.channels_last)
+                    pipe.movq = torch.compile(pipe.movq, mode="reduce-overhead", fullgraph=True)
+            else:
+                print("Run torch compile")
+                pipe.decoder = torch.compile(pipe.decoder, mode="reduce-overhead", fullgraph=True)
+                pipe.vqgan = torch.compile(pipe.vqgan, mode="reduce-overhead", fullgraph=True)
+
+        pipe.set_progress_bar_config(disable=True)
+        self.pipe = pipe
+
+    def run_inference(self, pipe, args):
+        _ = pipe(
+            prompt=PROMPT,
+            num_inference_steps=args.num_inference_steps,
+            num_images_per_prompt=args.batch_size,
+        )
+
+    def benchmark(self, args):
+        flush()
+
+        print(f"[INFO] {self.pipe.__class__.__name__}: Running benchmark with: {vars(args)}\n")
+
+        time = benchmark_fn(self.run_inference, self.pipe, args)  # in seconds.
+        memory = bytes_to_giga_bytes(torch.cuda.max_memory_allocated())  # in GBs.
+        benchmark_info = BenchmarkInfo(time=time, memory=memory)
+
+        pipeline_class_name = str(self.pipe.__class__.__name__)
+        flush()
+        csv_dict = generate_csv_dict(
+            pipeline_cls=pipeline_class_name, ckpt=args.ckpt, args=args, benchmark_info=benchmark_info
+        )
+        filepath = self.get_result_filepath(args)
+        write_to_csv(filepath, csv_dict)
+        print(f"Logs written to: {filepath}")
+        flush()
+
+
+class TurboTextToImageBenchmark(TextToImageBenchmark):
+    def __init__(self, args):
+        super().__init__(args)
+
+    def run_inference(self, pipe, args):
+        _ = pipe(
+            prompt=PROMPT,
+            num_inference_steps=args.num_inference_steps,
+            num_images_per_prompt=args.batch_size,
+            guidance_scale=0.0,
+        )
+
+
+class LCMLoRATextToImageBenchmark(TextToImageBenchmark):
+    lora_id = "latent-consistency/lcm-lora-sdxl"
+
+    def __init__(self, args):
+        super().__init__(args)
+        self.pipe.load_lora_weights(self.lora_id)
+        self.pipe.fuse_lora()
+        self.pipe.scheduler = LCMScheduler.from_config(self.pipe.scheduler.config)
+
+    def get_result_filepath(self, args):
+        pipeline_class_name = str(self.pipe.__class__.__name__)
+        name = (
+            self.lora_id.replace("/", "_")
+            + "_"
+            + pipeline_class_name
+            + f"-bs@{args.batch_size}-steps@{args.num_inference_steps}-mco@{args.model_cpu_offload}-compile@{args.run_compile}.csv"
+        )
+        filepath = os.path.join(BASE_PATH, name)
+        return filepath
+
+    def run_inference(self, pipe, args):
+        _ = pipe(
+            prompt=PROMPT,
+            num_inference_steps=args.num_inference_steps,
+            num_images_per_prompt=args.batch_size,
+            guidance_scale=1.0,
+        )
+
+    def benchmark(self, args):
+        flush()
+
+        print(f"[INFO] {self.pipe.__class__.__name__}: Running benchmark with: {vars(args)}\n")
+
+        time = benchmark_fn(self.run_inference, self.pipe, args)  # in seconds.
+        memory = bytes_to_giga_bytes(torch.cuda.max_memory_allocated())  # in GBs.
+        benchmark_info = BenchmarkInfo(time=time, memory=memory)
+
+        pipeline_class_name = str(self.pipe.__class__.__name__)
+        flush()
+        csv_dict = generate_csv_dict(
+            pipeline_cls=pipeline_class_name, ckpt=self.lora_id, args=args, benchmark_info=benchmark_info
+        )
+        filepath = self.get_result_filepath(args)
+        write_to_csv(filepath, csv_dict)
+        print(f"Logs written to: {filepath}")
+        flush()
+
+
+class ImageToImageBenchmark(TextToImageBenchmark):
+    pipeline_class = AutoPipelineForImage2Image
+    url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/benchmarking/1665_Girl_with_a_Pearl_Earring.jpg"
+    image = load_image(url).convert("RGB")
+
+    def __init__(self, args):
+        super().__init__(args)
+        self.image = self.image.resize(RESOLUTION_MAPPING[args.ckpt])
+
+    def run_inference(self, pipe, args):
+        _ = pipe(
+            prompt=PROMPT,
+            image=self.image,
+            num_inference_steps=args.num_inference_steps,
+            num_images_per_prompt=args.batch_size,
+        )
+
+
+class TurboImageToImageBenchmark(ImageToImageBenchmark):
+    def __init__(self, args):
+        super().__init__(args)
+
+    def run_inference(self, pipe, args):
+        _ = pipe(
+            prompt=PROMPT,
+            image=self.image,
+            num_inference_steps=args.num_inference_steps,
+            num_images_per_prompt=args.batch_size,
+            guidance_scale=0.0,
+            strength=0.5,
+        )
+
+
+class InpaintingBenchmark(ImageToImageBenchmark):
+    pipeline_class = AutoPipelineForInpainting
+    mask_url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/benchmarking/overture-creations-5sI6fQgYIuo_mask.png"
+    mask = load_image(mask_url).convert("RGB")
+
+    def __init__(self, args):
+        super().__init__(args)
+        self.image = self.image.resize(RESOLUTION_MAPPING[args.ckpt])
+        self.mask = self.mask.resize(RESOLUTION_MAPPING[args.ckpt])
+
+    def run_inference(self, pipe, args):
+        _ = pipe(
+            prompt=PROMPT,
+            image=self.image,
+            mask_image=self.mask,
+            num_inference_steps=args.num_inference_steps,
+            num_images_per_prompt=args.batch_size,
+        )
+
+
+class ControlNetBenchmark(TextToImageBenchmark):
+    pipeline_class = StableDiffusionControlNetPipeline
+    aux_network_class = ControlNetModel
+    root_ckpt = "runwayml/stable-diffusion-v1-5"
+
+    url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/benchmarking/canny_image_condition.png"
+    image = load_image(url).convert("RGB")
+
+    def __init__(self, args):
+        aux_network = self.aux_network_class.from_pretrained(args.ckpt, torch_dtype=torch.float16)
+        pipe = self.pipeline_class.from_pretrained(self.root_ckpt, controlnet=aux_network, torch_dtype=torch.float16)
+        pipe = pipe.to("cuda")
+
+        pipe.set_progress_bar_config(disable=True)
+        self.pipe = pipe
+
+        if args.run_compile:
+            pipe.unet.to(memory_format=torch.channels_last)
+            pipe.controlnet.to(memory_format=torch.channels_last)
+
+            print("Run torch compile")
+            pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
+            pipe.controlnet = torch.compile(pipe.controlnet, mode="reduce-overhead", fullgraph=True)
+
+        self.image = self.image.resize(RESOLUTION_MAPPING[args.ckpt])
+
+    def run_inference(self, pipe, args):
+        _ = pipe(
+            prompt=PROMPT,
+            image=self.image,
+            num_inference_steps=args.num_inference_steps,
+            num_images_per_prompt=args.batch_size,
+        )
+
+
+class ControlNetSDXLBenchmark(ControlNetBenchmark):
+    pipeline_class = StableDiffusionXLControlNetPipeline
+    root_ckpt = "stabilityai/stable-diffusion-xl-base-1.0"
+
+    def __init__(self, args):
+        super().__init__(args)
+
+
+class T2IAdapterBenchmark(ControlNetBenchmark):
+    pipeline_class = StableDiffusionAdapterPipeline
+    aux_network_class = T2IAdapter
+    root_ckpt = "CompVis/stable-diffusion-v1-4"
+
+    url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/benchmarking/canny_for_adapter.png"
+    image = load_image(url).convert("L")
+
+    def __init__(self, args):
+        aux_network = self.aux_network_class.from_pretrained(args.ckpt, torch_dtype=torch.float16)
+        pipe = self.pipeline_class.from_pretrained(self.root_ckpt, adapter=aux_network, torch_dtype=torch.float16)
+        pipe = pipe.to("cuda")
+
+        pipe.set_progress_bar_config(disable=True)
+        self.pipe = pipe
+
+        if args.run_compile:
+            pipe.unet.to(memory_format=torch.channels_last)
+            pipe.adapter.to(memory_format=torch.channels_last)
+
+            print("Run torch compile")
+            pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
+            pipe.adapter = torch.compile(pipe.adapter, mode="reduce-overhead", fullgraph=True)
+
+        self.image = self.image.resize(RESOLUTION_MAPPING[args.ckpt])
+
+
+class T2IAdapterSDXLBenchmark(T2IAdapterBenchmark):
+    pipeline_class = StableDiffusionXLAdapterPipeline
+    root_ckpt = "stabilityai/stable-diffusion-xl-base-1.0"
+
+    url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/benchmarking/canny_for_adapter_sdxl.png"
+    image = load_image(url)
+
+    def __init__(self, args):
+        super().__init__(args)
--- a/benchmarks/benchmark_controlnet.py
+++ b/benchmarks/benchmark_controlnet.py
@@ -0,0 +1,26 @@
+import argparse
+import sys
+
+
+sys.path.append(".")
+from base_classes import ControlNetBenchmark, ControlNetSDXLBenchmark  # noqa: E402
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--ckpt",
+        type=str,
+        default="lllyasviel/sd-controlnet-canny",
+        choices=["lllyasviel/sd-controlnet-canny", "diffusers/controlnet-canny-sdxl-1.0"],
+    )
+    parser.add_argument("--batch_size", type=int, default=1)
+    parser.add_argument("--num_inference_steps", type=int, default=50)
+    parser.add_argument("--model_cpu_offload", action="store_true")
+    parser.add_argument("--run_compile", action="store_true")
+    args = parser.parse_args()
+
+    benchmark_pipe = (
+        ControlNetBenchmark(args) if args.ckpt == "lllyasviel/sd-controlnet-canny" else ControlNetSDXLBenchmark(args)
+    )
+    benchmark_pipe.benchmark(args)
--- a/benchmarks/benchmark_sd_img.py
+++ b/benchmarks/benchmark_sd_img.py
@@ -0,0 +1,29 @@
+import argparse
+import sys
+
+
+sys.path.append(".")
+from base_classes import ImageToImageBenchmark, TurboImageToImageBenchmark  # noqa: E402
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--ckpt",
+        type=str,
+        default="runwayml/stable-diffusion-v1-5",
+        choices=[
+            "runwayml/stable-diffusion-v1-5",
+            "stabilityai/stable-diffusion-2-1",
+            "stabilityai/stable-diffusion-xl-refiner-1.0",
+            "stabilityai/sdxl-turbo",
+        ],
+    )
+    parser.add_argument("--batch_size", type=int, default=1)
+    parser.add_argument("--num_inference_steps", type=int, default=50)
+    parser.add_argument("--model_cpu_offload", action="store_true")
+    parser.add_argument("--run_compile", action="store_true")
+    args = parser.parse_args()
+
+    benchmark_pipe = ImageToImageBenchmark(args) if "turbo" not in args.ckpt else TurboImageToImageBenchmark(args)
+    benchmark_pipe.benchmark(args)
--- a/benchmarks/benchmark_sd_inpainting.py
+++ b/benchmarks/benchmark_sd_inpainting.py
@@ -0,0 +1,28 @@
+import argparse
+import sys
+
+
+sys.path.append(".")
+from base_classes import InpaintingBenchmark  # noqa: E402
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--ckpt",
+        type=str,
+        default="runwayml/stable-diffusion-v1-5",
+        choices=[
+            "runwayml/stable-diffusion-v1-5",
+            "stabilityai/stable-diffusion-2-1",
+            "stabilityai/stable-diffusion-xl-base-1.0",
+        ],
+    )
+    parser.add_argument("--batch_size", type=int, default=1)
+    parser.add_argument("--num_inference_steps", type=int, default=50)
+    parser.add_argument("--model_cpu_offload", action="store_true")
+    parser.add_argument("--run_compile", action="store_true")
+    args = parser.parse_args()
+
+    benchmark_pipe = InpaintingBenchmark(args)
+    benchmark_pipe.benchmark(args)
--- a/benchmarks/benchmark_t2i_adapter.py
+++ b/benchmarks/benchmark_t2i_adapter.py
@@ -0,0 +1,28 @@
+import argparse
+import sys
+
+
+sys.path.append(".")
+from base_classes import T2IAdapterBenchmark, T2IAdapterSDXLBenchmark  # noqa: E402
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--ckpt",
+        type=str,
+        default="TencentARC/t2iadapter_canny_sd14v1",
+        choices=["TencentARC/t2iadapter_canny_sd14v1", "TencentARC/t2i-adapter-canny-sdxl-1.0"],
+    )
+    parser.add_argument("--batch_size", type=int, default=1)
+    parser.add_argument("--num_inference_steps", type=int, default=50)
+    parser.add_argument("--model_cpu_offload", action="store_true")
+    parser.add_argument("--run_compile", action="store_true")
+    args = parser.parse_args()
+
+    benchmark_pipe = (
+        T2IAdapterBenchmark(args)
+        if args.ckpt == "TencentARC/t2iadapter_canny_sd14v1"
+        else T2IAdapterSDXLBenchmark(args)
+    )
+    benchmark_pipe.benchmark(args)
--- a/benchmarks/benchmark_t2i_lcm_lora.py
+++ b/benchmarks/benchmark_t2i_lcm_lora.py
@@ -0,0 +1,23 @@
+import argparse
+import sys
+
+
+sys.path.append(".")
+from base_classes import LCMLoRATextToImageBenchmark  # noqa: E402
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--ckpt",
+        type=str,
+        default="stabilityai/stable-diffusion-xl-base-1.0",
+    )
+    parser.add_argument("--batch_size", type=int, default=1)
+    parser.add_argument("--num_inference_steps", type=int, default=4)
+    parser.add_argument("--model_cpu_offload", action="store_true")
+    parser.add_argument("--run_compile", action="store_true")
+    args = parser.parse_args()
+
+    benchmark_pipe = LCMLoRATextToImageBenchmark(args)
+    benchmark_pipe.benchmark(args)
--- a/benchmarks/benchmark_text_to_image.py
+++ b/benchmarks/benchmark_text_to_image.py
@@ -0,0 +1,40 @@
+import argparse
+import sys
+
+
+sys.path.append(".")
+from base_classes import TextToImageBenchmark, TurboTextToImageBenchmark  # noqa: E402
+
+
+ALL_T2I_CKPTS = [
+    "runwayml/stable-diffusion-v1-5",
+    "segmind/SSD-1B",
+    "stabilityai/stable-diffusion-xl-base-1.0",
+    "kandinsky-community/kandinsky-2-2-decoder",
+    "warp-ai/wuerstchen",
+    "stabilityai/sdxl-turbo",
+]
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--ckpt",
+        type=str,
+        default="runwayml/stable-diffusion-v1-5",
+        choices=ALL_T2I_CKPTS,
+    )
+    parser.add_argument("--batch_size", type=int, default=1)
+    parser.add_argument("--num_inference_steps", type=int, default=50)
+    parser.add_argument("--model_cpu_offload", action="store_true")
+    parser.add_argument("--run_compile", action="store_true")
+    args = parser.parse_args()
+
+    benchmark_cls = None
+    if "turbo" in args.ckpt:
+        benchmark_cls = TurboTextToImageBenchmark
+    else:
+        benchmark_cls = TextToImageBenchmark
+
+    benchmark_pipe = benchmark_cls(args)
+    benchmark_pipe.benchmark(args)
--- a/benchmarks/push_results.py
+++ b/benchmarks/push_results.py
@@ -0,0 +1,72 @@
+import glob
+import sys
+
+import pandas as pd
+from huggingface_hub import hf_hub_download, upload_file
+from huggingface_hub.utils._errors import EntryNotFoundError
+
+
+sys.path.append(".")
+from utils import BASE_PATH, FINAL_CSV_FILE, GITHUB_SHA, REPO_ID, collate_csv  # noqa: E402
+
+
+def has_previous_benchmark() -> str:
+    csv_path = None
+    try:
+        csv_path = hf_hub_download(repo_id=REPO_ID, repo_type="dataset", filename=FINAL_CSV_FILE)
+    except EntryNotFoundError:
+        csv_path = None
+    return csv_path
+
+
+def filter_float(value):
+    if isinstance(value, str):
+        return float(value.split()[0])
+    return value
+
+
+def push_to_hf_dataset():
+    all_csvs = sorted(glob.glob(f"{BASE_PATH}/*.csv"))
+    collate_csv(all_csvs, FINAL_CSV_FILE)
+
+    # If there's an existing benchmark file, we should report the changes.
+    csv_path = has_previous_benchmark()
+    if csv_path is not None:
+        current_results = pd.read_csv(FINAL_CSV_FILE)
+        previous_results = pd.read_csv(csv_path)
+
+        numeric_columns = current_results.select_dtypes(include=["float64", "int64"]).columns
+        numeric_columns = [
+            c for c in numeric_columns if c not in ["batch_size", "num_inference_steps", "actual_gpu_memory (gbs)"]
+        ]
+
+        for column in numeric_columns:
+            previous_results[column] = previous_results[column].map(lambda x: filter_float(x))
+
+            # Calculate the percentage change
+            current_results[column] = current_results[column].astype(float)
+            previous_results[column] = previous_results[column].astype(float)
+            percent_change = ((current_results[column] - previous_results[column]) / previous_results[column]) * 100
+
+            # Format the values with '+' or '-' sign and append to original values
+            current_results[column] = current_results[column].map(str) + percent_change.map(
+                lambda x: f" ({'+' if x > 0 else ''}{x:.2f}%)"
+            )
+            # There might be newly added rows. So, filter out the NaNs.
+            current_results[column] = current_results[column].map(lambda x: x.replace(" (nan%)", ""))
+
+        # Overwrite the current result file.
+        current_results.to_csv(FINAL_CSV_FILE, index=False)
+
+    commit_message = f"upload from sha: {GITHUB_SHA}" if GITHUB_SHA is not None else "upload benchmark results"
+    upload_file(
+        repo_id=REPO_ID,
+        path_in_repo=FINAL_CSV_FILE,
+        path_or_fileobj=FINAL_CSV_FILE,
+        repo_type="dataset",
+        commit_message=commit_message,
+    )
+
+
+if __name__ == "__main__":
+    push_to_hf_dataset()
--- a/benchmarks/run_all.py
+++ b/benchmarks/run_all.py
@@ -0,0 +1,97 @@
+import glob
+import subprocess
+import sys
+from typing import List
+
+
+sys.path.append(".")
+from benchmark_text_to_image import ALL_T2I_CKPTS  # noqa: E402
+
+
+PATTERN = "benchmark_*.py"
+
+
+class SubprocessCallException(Exception):
+    pass
+
+
+# Taken from `test_examples_utils.py`
+def run_command(command: List[str], return_stdout=False):
+    """
+    Runs `command` with `subprocess.check_output` and will potentially return the `stdout`. Will also properly capture
+    if an error occurred while running `command`
+    """
+    try:
+        output = subprocess.check_output(command, stderr=subprocess.STDOUT)
+        if return_stdout:
+            if hasattr(output, "decode"):
+                output = output.decode("utf-8")
+            return output
+    except subprocess.CalledProcessError as e:
+        raise SubprocessCallException(
+            f"Command `{' '.join(command)}` failed with the following error:\n\n{e.output.decode()}"
+        ) from e
+
+
+def main():
+    python_files = glob.glob(PATTERN)
+
+    for file in python_files:
+        print(f"****** Running file: {file} ******")
+
+        # Run with canonical settings.
+        if file != "benchmark_text_to_image.py":
+            command = f"python {file}"
+            run_command(command.split())
+
+            command += " --run_compile"
+            run_command(command.split())
+
+    # Run variants.
+    for file in python_files:
+        if file == "benchmark_text_to_image.py":
+            for ckpt in ALL_T2I_CKPTS:
+                command = f"python {file} --ckpt {ckpt}"
+
+                if "turbo" in ckpt:
+                    command += " --num_inference_steps 1"
+
+                run_command(command.split())
+
+                command += " --run_compile"
+                run_command(command.split())
+
+        elif file == "benchmark_sd_img.py":
+            for ckpt in ["stabilityai/stable-diffusion-xl-refiner-1.0", "stabilityai/sdxl-turbo"]:
+                command = f"python {file} --ckpt {ckpt}"
+
+                if ckpt == "stabilityai/sdxl-turbo":
+                    command += " --num_inference_steps 2"
+
+                run_command(command.split())
+                command += " --run_compile"
+                run_command(command.split())
+
+        elif file == "benchmark_sd_inpainting.py":
+            sdxl_ckpt = "stabilityai/stable-diffusion-xl-base-1.0"
+            command = f"python {file} --ckpt {sdxl_ckpt}"
+            run_command(command.split())
+
+            command += " --run_compile"
+            run_command(command.split())
+
+        elif file in ["benchmark_controlnet.py", "benchmark_t2i_adapter.py"]:
+            sdxl_ckpt = (
+                "diffusers/controlnet-canny-sdxl-1.0"
+                if "controlnet" in file
+                else "TencentARC/t2i-adapter-canny-sdxl-1.0"
+            )
+            command = f"python {file} --ckpt {sdxl_ckpt}"
+            run_command(command.split())
+
+            command += " --run_compile"
+            run_command(command.split())
+
+
+if __name__ == "__main__":
+    main()
--- a/benchmarks/utils.py
+++ b/benchmarks/utils.py
@@ -0,0 +1,98 @@
+import argparse
+import csv
+import gc
+import os
+from dataclasses import dataclass
+from typing import Dict, List, Union
+
+import torch
+import torch.utils.benchmark as benchmark
+
+
+GITHUB_SHA = os.getenv("GITHUB_SHA", None)
+BENCHMARK_FIELDS = [
+    "pipeline_cls",
+    "ckpt_id",
+    "batch_size",
+    "num_inference_steps",
+    "model_cpu_offload",
+    "run_compile",
+    "time (secs)",
+    "memory (gbs)",
+    "actual_gpu_memory (gbs)",
+    "github_sha",
+]
+
+PROMPT = "ghibli style, a fantasy landscape with castles"
+BASE_PATH = os.getenv("BASE_PATH", ".")
+TOTAL_GPU_MEMORY = float(os.getenv("TOTAL_GPU_MEMORY", torch.cuda.get_device_properties(0).total_memory / (1024**3)))
+
+REPO_ID = "diffusers/benchmarks"
+FINAL_CSV_FILE = "collated_results.csv"
+
+
+@dataclass
+class BenchmarkInfo:
+    time: float
+    memory: float
+
+
+def flush():
+    """Wipes off memory."""
+    gc.collect()
+    torch.cuda.empty_cache()
+    torch.cuda.reset_max_memory_allocated()
+    torch.cuda.reset_peak_memory_stats()
+
+
+def bytes_to_giga_bytes(bytes):
+    return f"{(bytes / 1024 / 1024 / 1024):.3f}"
+
+
+def benchmark_fn(f, *args, **kwargs):
+    t0 = benchmark.Timer(
+        stmt="f(*args, **kwargs)",
+        globals={"args": args, "kwargs": kwargs, "f": f},
+        num_threads=torch.get_num_threads(),
+    )
+    return f"{(t0.blocked_autorange().mean):.3f}"
+
+
+def generate_csv_dict(
+    pipeline_cls: str, ckpt: str, args: argparse.Namespace, benchmark_info: BenchmarkInfo
+) -> Dict[str, Union[str, bool, float]]:
+    """Packs benchmarking data into a dictionary for latter serialization."""
+    data_dict = {
+        "pipeline_cls": pipeline_cls,
+        "ckpt_id": ckpt,
+        "batch_size": args.batch_size,
+        "num_inference_steps": args.num_inference_steps,
+        "model_cpu_offload": args.model_cpu_offload,
+        "run_compile": args.run_compile,
+        "time (secs)": benchmark_info.time,
+        "memory (gbs)": benchmark_info.memory,
+        "actual_gpu_memory (gbs)": f"{(TOTAL_GPU_MEMORY):.3f}",
+        "github_sha": GITHUB_SHA,
+    }
+    return data_dict
+
+
+def write_to_csv(file_name: str, data_dict: Dict[str, Union[str, bool, float]]):
+    """Serializes a dictionary into a CSV file."""
+    with open(file_name, mode="w", newline="") as csvfile:
+        writer = csv.DictWriter(csvfile, fieldnames=BENCHMARK_FIELDS)
+        writer.writeheader()
+        writer.writerow(data_dict)
+
+
+def collate_csv(input_files: List[str], output_file: str):
+    """Collates multiple identically structured CSVs into a single CSV file."""
+    with open(output_file, mode="w", newline="") as outfile:
+        writer = csv.DictWriter(outfile, fieldnames=BENCHMARK_FIELDS)
+        writer.writeheader()
+
+        for file in input_files:
+            with open(file, mode="r") as infile:
+                reader = csv.DictReader(infile)
+                for row in reader:
+                    writer.writerow(row)
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@@ -19,6 +19,8 @@
    title: Train a diffusion model
  - local: tutorials/using_peft_for_inference
    title: Inference with PEFT
+  - local: tutorials/fast_diffusion
+    title: Accelerate inference of text-to-image diffusion models
  title: Tutorials
 - sections:
  - sections:
@@ -72,6 +74,8 @@
      title: Overview
    - local: using-diffusers/sdxl
      title: Stable Diffusion XL
+    - local: using-diffusers/sdxl_turbo
+      title: SDXL Turbo
    - local: using-diffusers/kandinsky
      title: Kandinsky
    - local: using-diffusers/controlnet
@@ -94,6 +98,8 @@
      title: Latent Consistency Model-LoRA
    - local: using-diffusers/inference_with_lcm
      title: Latent Consistency Model
+    - local: using-diffusers/svd
+      title: Stable Video Diffusion
    title: Specific pipeline examples
  - sections:
    - local: training/overview
@@ -129,6 +135,8 @@
        title: LoRA
      - local: training/custom_diffusion
        title: Custom Diffusion
+      - local: training/lcm_distill
+        title: Latent Consistency Distillation
      - local: training/ddpo
        title: Reinforcement learning training with DDPO
      title: Methods
@@ -192,6 +200,8 @@
      title: Outputs
    title: Main Classes
  - sections:
+    - local: api/loaders/ip_adapter
+      title: IP-Adapter
    - local: api/loaders/lora
      title: LoRA
    - local: api/loaders/single_file
@@ -236,14 +246,12 @@
  - sections:
    - local: api/pipelines/overview
      title: Overview
-    - local: api/pipelines/alt_diffusion
-      title: AltDiffusion
+    - local: api/pipelines/amused
+      title: aMUSEd
    - local: api/pipelines/animatediff
      title: AnimateDiff
    - local: api/pipelines/attend_and_excite
      title: Attend-and-Excite
-    - local: api/pipelines/audio_diffusion
-      title: Audio Diffusion
    - local: api/pipelines/audioldm
      title: AudioLDM
    - local: api/pipelines/audioldm2
@@ -258,8 +266,6 @@
      title: ControlNet
    - local: api/pipelines/controlnet_sdxl
      title: ControlNet with Stable Diffusion XL
-    - local: api/pipelines/cycle_diffusion
-      title: Cycle Diffusion
    - local: api/pipelines/dance_diffusion
      title: Dance Diffusion
    - local: api/pipelines/ddim
@@ -278,6 +284,8 @@
      title: Kandinsky 2.1
    - local: api/pipelines/kandinsky_v22
      title: Kandinsky 2.2
+    - local: api/pipelines/kandinsky3
+      title: Kandinsky 3
    - local: api/pipelines/latent_consistency_models
      title: Latent Consistency Models
    - local: api/pipelines/latent_diffusion
@@ -288,26 +296,14 @@
      title: MusicLDM
    - local: api/pipelines/paint_by_example
      title: Paint by Example
-    - local: api/pipelines/paradigms
-      title: Parallel Sampling of Diffusion Models
-    - local: api/pipelines/pix2pix_zero
-      title: Pix2Pix Zero
    - local: api/pipelines/pixart
      title: PixArt-α
-    - local: api/pipelines/pndm
-      title: PNDM
-    - local: api/pipelines/repaint
-      title: RePaint
-    - local: api/pipelines/score_sde_ve
-      title: Score SDE VE
    - local: api/pipelines/self_attention_guidance
      title: Self-Attention Guidance
    - local: api/pipelines/semantic_stable_diffusion
      title: Semantic Guidance
    - local: api/pipelines/shap_e
      title: Shap-E
-    - local: api/pipelines/spectrogram_diffusion
-      title: Spectrogram Diffusion
    - sections:
      - local: api/pipelines/stable_diffusion/overview
        title: Overview
@@ -327,12 +323,14 @@
        title: Stable Diffusion 2
      - local: api/pipelines/stable_diffusion/stable_diffusion_xl
        title: Stable Diffusion XL
+      - local: api/pipelines/stable_diffusion/sdxl_turbo
+        title: SDXL Turbo
      - local: api/pipelines/stable_diffusion/latent_upscale
        title: Latent upscaler
      - local: api/pipelines/stable_diffusion/upscale
        title: Super-resolution
      - local: api/pipelines/stable_diffusion/ldm3d_diffusion
-        title: LDM3D Text-to-(RGB, Depth)
+        title: LDM3D Text-to-(RGB, Depth), Text-to-(RGB-pano, Depth-pano), LDM3D Upscaler
      - local: api/pipelines/stable_diffusion/adapter
        title: Stable Diffusion T2I-Adapter
      - local: api/pipelines/stable_diffusion/gligen
@@ -340,26 +338,16 @@
      title: Stable Diffusion
    - local: api/pipelines/stable_unclip
      title: Stable unCLIP
-    - local: api/pipelines/stochastic_karras_ve
-      title: Stochastic Karras VE
-    - local: api/pipelines/model_editing
-      title: Text-to-image model editing
    - local: api/pipelines/text_to_video
      title: Text-to-video
    - local: api/pipelines/text_to_video_zero
      title: Text2Video-Zero
    - local: api/pipelines/unclip
      title: unCLIP
-    - local: api/pipelines/latent_diffusion_uncond
-      title: Unconditional Latent Diffusion
    - local: api/pipelines/unidiffuser
      title: UniDiffuser
    - local: api/pipelines/value_guided_sampling
      title: Value-guided sampling
-    - local: api/pipelines/versatile_diffusion
-      title: Versatile Diffusion
-    - local: api/pipelines/vq_diffusion
-      title: VQ Diffusion
    - local: api/pipelines/wuerstchen
      title: Wuerstchen
    title: Pipelines
--- a/docs/source/en/api/attnprocessor.md
+++ b/docs/source/en/api/attnprocessor.md
@@ -20,6 +20,9 @@ An attention processor is a class for applying different types of attention mech
 ## AttnProcessor2_0
 [[autodoc]] models.attention_processor.AttnProcessor2_0

+## FusedAttnProcessor2_0
+[[autodoc]] models.attention_processor.FusedAttnProcessor2_0
+
 ## LoRAAttnProcessor
 [[autodoc]] models.attention_processor.LoRAAttnProcessor

--- a/docs/source/en/api/loaders/ip_adapter.md
+++ b/docs/source/en/api/loaders/ip_adapter.md
@@ -0,0 +1,25 @@
+<!--Copyright 2023 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# IP-Adapter
+
+[IP-Adapter](https://hf.co/papers/2308.06721) is a lightweight adapter that enables prompting a diffusion model with an image. This method decouples the cross-attention layers of the image and text features. The image features are generated from an image encoder. Files generated from IP-Adapter are only ~100MBs.
+
+<Tip>
+
+Learn how to load an IP-Adapter checkpoint and image in the [IP-Adapter](../../using-diffusers/loading_adapters#ip-adapter) loading guide.
+
+</Tip>
+
+## IPAdapterMixin
+
+[[autodoc]] loaders.ip_adapter.IPAdapterMixin
--- a/docs/source/en/api/models/asymmetricautoencoderkl.md
+++ b/docs/source/en/api/models/asymmetricautoencoderkl.md
@@ -49,12 +49,12 @@ make_image_grid([original_image, mask_image, image], rows=1, cols=3)

 ## AsymmetricAutoencoderKL

-[[autodoc]] models.autoencoder_asym_kl.AsymmetricAutoencoderKL
+[[autodoc]] models.autoencoders.autoencoder_asym_kl.AsymmetricAutoencoderKL

 ## AutoencoderKLOutput

-[[autodoc]] models.autoencoder_kl.AutoencoderKLOutput
+[[autodoc]] models.autoencoders.autoencoder_kl.AutoencoderKLOutput

 ## DecoderOutput

-[[autodoc]] models.vae.DecoderOutput
+[[autodoc]] models.autoencoders.vae.DecoderOutput
--- a/docs/source/en/api/models/autoencoder_tiny.md
+++ b/docs/source/en/api/models/autoencoder_tiny.md
@@ -54,4 +54,4 @@ image

 ## AutoencoderTinyOutput

-[[autodoc]] models.autoencoder_tiny.AutoencoderTinyOutput
+[[autodoc]] models.autoencoders.autoencoder_tiny.AutoencoderTinyOutput
--- a/docs/source/en/api/models/autoencoderkl.md
+++ b/docs/source/en/api/models/autoencoderkl.md
@@ -36,11 +36,11 @@ model = AutoencoderKL.from_single_file(url)

 ## AutoencoderKLOutput

-[[autodoc]] models.autoencoder_kl.AutoencoderKLOutput
+[[autodoc]] models.autoencoders.autoencoder_kl.AutoencoderKLOutput

 ## DecoderOutput

-[[autodoc]] models.vae.DecoderOutput
+[[autodoc]] models.autoencoders.vae.DecoderOutput

 ## FlaxAutoencoderKL

--- a/docs/source/en/api/pipelines/alt_diffusion.md
+++ b/docs/source/en/api/pipelines/alt_diffusion.md
@@ -1,47 +0,0 @@
-<!--Copyright 2023 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# AltDiffusion
-
-AltDiffusion was proposed in [AltCLIP: Altering the Language Encoder in CLIP for Extended Language Capabilities](https://huggingface.co/papers/2211.06679) by Zhongzhi Chen, Guang Liu, Bo-Wen Zhang, Fulong Ye, Qinghong Yang, Ledell Wu.
-
-The abstract from the paper is:
-
-*In this work, we present a conceptually simple and effective method to train a strong bilingual/multilingual multimodal representation model. Starting from the pre-trained multimodal representation model CLIP released by OpenAI, we altered its text encoder with a pre-trained multilingual text encoder XLM-R, and aligned both languages and image representations by a two-stage training schema consisting of teacher learning and contrastive learning. We validate our method through evaluations of a wide range of tasks. We set new state-of-the-art performances on a bunch of tasks including ImageNet-CN, Flicker30k-CN, COCO-CN and XTD. Further, we obtain very close performances with CLIP on almost all tasks, suggesting that one can simply alter the text encoder in CLIP for extended capabilities such as multilingual understanding. Our models and code are available at [this https URL](https://github.com/FlagAI-Open/FlagAI).*
-
-## Tips
-
-`AltDiffusion` is conceptually the same as [Stable Diffusion](./stable_diffusion/overview).
-
-<Tip>
-
-Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-components-across-pipelines) section to learn how to efficiently load the same components into multiple pipelines.
-
-</Tip>
-
-## AltDiffusionPipeline
-
-[[autodoc]] AltDiffusionPipeline
-	- all
-	- __call__
-
-## AltDiffusionImg2ImgPipeline
-
-[[autodoc]] AltDiffusionImg2ImgPipeline
-	- all
-	- __call__
-
-## AltDiffusionPipelineOutput
-
-[[autodoc]] pipelines.alt_diffusion.AltDiffusionPipelineOutput
-	- all
-	- __call__
--- a/docs/source/en/api/pipelines/amused.md
+++ b/docs/source/en/api/pipelines/amused.md
@@ -0,0 +1,42 @@
+<!--Copyright 2023 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# aMUSEd
+
+Amused is a lightweight text to image model based off of the [muse](https://arxiv.org/pdf/2301.00704.pdf) architecture. Amused is particularly useful in applications that require a lightweight and fast model such as generating many images quickly at once.
+
+Amused is a vqvae token based transformer that can generate an image in fewer forward passes than many diffusion models. In contrast with muse, it uses the smaller text encoder CLIP-L/14 instead of t5-xxl. Due to its small parameter count and few forward pass generation process, amused can generate many images quickly. This benefit is seen particularly at larger batch sizes. 
+
+| Model | Params |
+|-------|--------|
+| [amused-256](https://huggingface.co/amused/amused-256) | 603M |
+| [amused-512](https://huggingface.co/amused/amused-512) | 608M |
+
+## AmusedPipeline
+
+[[autodoc]] AmusedPipeline
+	- __call__
+	- all
+	- enable_xformers_memory_efficient_attention
+	- disable_xformers_memory_efficient_attention
+
+[[autodoc]] AmusedImg2ImgPipeline
+	- __call__
+	- all
+	- enable_xformers_memory_efficient_attention
+	- disable_xformers_memory_efficient_attention
+
+[[autodoc]] AmusedInpaintPipeline
+	- __call__
+	- all
+	- enable_xformers_memory_efficient_attention
+	- disable_xformers_memory_efficient_attention
--- a/docs/source/en/api/pipelines/animatediff.md
+++ b/docs/source/en/api/pipelines/animatediff.md
@@ -38,16 +38,21 @@ The following example demonstrates how to use a *MotionAdapter* checkpoint with

 ```python
 import torch
-from diffusers import MotionAdapter, AnimateDiffPipeline, DDIMScheduler
+from diffusers import AnimateDiffPipeline, DDIMScheduler, MotionAdapter
 from diffusers.utils import export_to_gif

 # Load the motion adapter
-adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2")
+adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2", torch_dtype=torch.float16)
 # load SD 1.5 based finetuned model
 model_id = "SG161222/Realistic_Vision_V5.1_noVAE"
-pipe = AnimateDiffPipeline.from_pretrained(model_id, motion_adapter=adapter)
+pipe = AnimateDiffPipeline.from_pretrained(model_id, motion_adapter=adapter, torch_dtype=torch.float16)
 scheduler = DDIMScheduler.from_pretrained(
-    model_id, subfolder="scheduler", clip_sample=False, timestep_spacing="linspace", steps_offset=1
+    model_id,
+    subfolder="scheduler",
+    clip_sample=False,
+    timestep_spacing="linspace",
+    beta_schedule="linear",
+    steps_offset=1,
 )
 pipe.scheduler = scheduler

@@ -70,6 +75,7 @@ output = pipe(
 )
 frames = output.frames[0]
 export_to_gif(frames, "animation.gif")
+
 ```

 Here are some sample outputs:
@@ -88,7 +94,7 @@ Here are some sample outputs:

 <Tip>

-AnimateDiff tends to work better with finetuned Stable Diffusion models. If you plan on using a scheduler that can clip samples, make sure to disable it by setting `clip_sample=False` in the scheduler as this can also have an adverse effect on generated samples.
+AnimateDiff tends to work better with finetuned Stable Diffusion models. If you plan on using a scheduler that can clip samples, make sure to disable it by setting `clip_sample=False` in the scheduler as this can also have an adverse effect on generated samples. Additionally, the AnimateDiff checkpoints can be sensitive to the beta schedule of the scheduler. We recommend setting this to `linear`.

 </Tip>

@@ -98,18 +104,25 @@ Motion LoRAs are a collection of LoRAs that work with the `guoyww/animatediff-mo

 ```python
 import torch
-from diffusers import MotionAdapter, AnimateDiffPipeline, DDIMScheduler
+from diffusers import AnimateDiffPipeline, DDIMScheduler, MotionAdapter
 from diffusers.utils import export_to_gif

 # Load the motion adapter
-adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2")
+adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2", torch_dtype=torch.float16)
 # load SD 1.5 based finetuned model
 model_id = "SG161222/Realistic_Vision_V5.1_noVAE"
-pipe = AnimateDiffPipeline.from_pretrained(model_id, motion_adapter=adapter)
-pipe.load_lora_weights("guoyww/animatediff-motion-lora-zoom-out", adapter_name="zoom-out")
+pipe = AnimateDiffPipeline.from_pretrained(model_id, motion_adapter=adapter, torch_dtype=torch.float16)
+pipe.load_lora_weights(
+    "guoyww/animatediff-motion-lora-zoom-out", adapter_name="zoom-out"
+)

 scheduler = DDIMScheduler.from_pretrained(
-    model_id, subfolder="scheduler", clip_sample=False, timestep_spacing="linspace", steps_offset=1
+    model_id,
+    subfolder="scheduler",
+    clip_sample=False,
+    beta_schedule="linear",
+    timestep_spacing="linspace",
+    steps_offset=1,
 )
 pipe.scheduler = scheduler

@@ -132,6 +145,7 @@ output = pipe(
 )
 frames = output.frames[0]
 export_to_gif(frames, "animation.gif")
+
 ```

 <table>
@@ -160,21 +174,30 @@ Then you can use the following code to combine Motion LoRAs.

 ```python
 import torch
-from diffusers import MotionAdapter, AnimateDiffPipeline, DDIMScheduler
+from diffusers import AnimateDiffPipeline, DDIMScheduler, MotionAdapter
 from diffusers.utils import export_to_gif

 # Load the motion adapter
-adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2")
+adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2", torch_dtype=torch.float16)
 # load SD 1.5 based finetuned model
 model_id = "SG161222/Realistic_Vision_V5.1_noVAE"
-pipe = AnimateDiffPipeline.from_pretrained(model_id, motion_adapter=adapter)
+pipe = AnimateDiffPipeline.from_pretrained(model_id, motion_adapter=adapter, torch_dtype=torch.float16)

-pipe.load_lora_weights("diffusers/animatediff-motion-lora-zoom-out", adapter_name="zoom-out")
-pipe.load_lora_weights("diffusers/animatediff-motion-lora-pan-left", adapter_name="pan-left")
+pipe.load_lora_weights(
+    "diffusers/animatediff-motion-lora-zoom-out", adapter_name="zoom-out",
+)
+pipe.load_lora_weights(
+    "diffusers/animatediff-motion-lora-pan-left", adapter_name="pan-left",
+)
 pipe.set_adapters(["zoom-out", "pan-left"], adapter_weights=[1.0, 1.0])

 scheduler = DDIMScheduler.from_pretrained(
-    model_id, subfolder="scheduler", clip_sample=False, timestep_spacing="linspace", steps_offset=1
+    model_id,
+    subfolder="scheduler",
+    clip_sample=False,
+    timestep_spacing="linspace",
+    beta_schedule="linear",
+    steps_offset=1,
 )
 pipe.scheduler = scheduler

@@ -197,6 +220,7 @@ output = pipe(
 )
 frames = output.frames[0]
 export_to_gif(frames, "animation.gif")
+
 ```

 <table>
--- a/docs/source/en/api/pipelines/audio_diffusion.md
+++ b/docs/source/en/api/pipelines/audio_diffusion.md
@@ -1,35 +0,0 @@
-<!--Copyright 2023 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# Audio Diffusion
-
-[Audio Diffusion](https://github.com/teticio/audio-diffusion) is by Robert Dargavel Smith, and it leverages the recent advances in image generation from diffusion models by converting audio samples to and from Mel spectrogram images.
-
-<Tip>
-
-Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-components-across-pipelines) section to learn how to efficiently load the same components into multiple pipelines.
-
-</Tip>
-
-## AudioDiffusionPipeline
-[[autodoc]] AudioDiffusionPipeline
-	- all
-	- __call__
-
-## AudioPipelineOutput
-[[autodoc]] pipelines.AudioPipelineOutput
-
-## ImagePipelineOutput
-[[autodoc]] pipelines.ImagePipelineOutput
-
-## Mel
-[[autodoc]] Mel
--- a/docs/source/en/api/pipelines/cycle_diffusion.md
+++ b/docs/source/en/api/pipelines/cycle_diffusion.md
@@ -1,33 +0,0 @@
-<!--Copyright 2023 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# Cycle Diffusion
-
-Cycle Diffusion is a text guided image-to-image generation model proposed in [Unifying Diffusion Models' Latent Space, with Applications to CycleDiffusion and Guidance](https://huggingface.co/papers/2210.05559) by Chen Henry Wu, Fernando De la Torre.
-
-The abstract from the paper is:
-
-*Diffusion models have achieved unprecedented performance in generative modeling. The commonly-adopted formulation of the latent code of diffusion models is a sequence of gradually denoised samples, as opposed to the simpler (e.g., Gaussian) latent space of GANs, VAEs, and normalizing flows. This paper provides an alternative, Gaussian formulation of the latent space of various diffusion models, as well as an invertible DPM-Encoder that maps images into the latent space. While our formulation is purely based on the definition of diffusion models, we demonstrate several intriguing consequences. (1) Empirically, we observe that a common latent space emerges from two diffusion models trained independently on related domains. In light of this finding, we propose CycleDiffusion, which uses DPM-Encoder for unpaired image-to-image translation. Furthermore, applying CycleDiffusion to text-to-image diffusion models, we show that large-scale text-to-image diffusion models can be used as zero-shot image-to-image editors. (2) One can guide pre-trained diffusion models and GANs by controlling the latent codes in a unified, plug-and-play formulation based on energy-based models. Using the CLIP model and a face recognition model as guidance, we demonstrate that diffusion models have better coverage of low-density sub-populations and individuals than GANs. The code is publicly available at [this https URL](https://github.com/ChenWu98/cycle-diffusion).*
-
-<Tip>
-
-Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-components-across-pipelines) section to learn how to efficiently load the same components into multiple pipelines.
-
-</Tip>
-
-## CycleDiffusionPipeline
-[[autodoc]] CycleDiffusionPipeline
-	- all
-	- __call__
-
-## StableDiffusionPiplineOutput
-[[autodoc]] pipelines.stable_diffusion.StableDiffusionPipelineOutput
--- a/docs/source/en/api/pipelines/kandinsky3.md
+++ b/docs/source/en/api/pipelines/kandinsky3.md
@@ -0,0 +1,49 @@
+<!--Copyright 2023 The HuggingFace Team. All rights reserved.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Kandinsky 3
+
+Kandinsky 3 is created by [Vladimir Arkhipkin](https://github.com/oriBetelgeuse),[Anastasia Maltseva](https://github.com/NastyaMittseva),[Igor Pavlov](https://github.com/boomb0om),[Andrei Filatov](https://github.com/anvilarth),[Arseniy Shakhmatov](https://github.com/cene555),[Andrey Kuznetsov](https://github.com/kuznetsoffandrey),[Denis Dimitrov](https://github.com/denndimitrov), [Zein Shaheen](https://github.com/zeinsh)
+
+The description from it's Github page: 
+
+*Kandinsky 3.0 is an open-source text-to-image diffusion model built upon the Kandinsky2-x model family. In comparison to its predecessors, enhancements have been made to the text understanding and visual quality of the model, achieved by increasing the size of the text encoder and Diffusion U-Net models, respectively.*
+
+Its architecture includes 3 main components:
+1. [FLAN-UL2](https://huggingface.co/google/flan-ul2), which is an encoder decoder model based on the T5 architecture. 
+2. New U-Net architecture featuring BigGAN-deep blocks doubles depth while maintaining the same number of parameters.
+3. Sber-MoVQGAN is a decoder proven to have superior results in image restoration.
+
+
+
+The original codebase can be found at [ai-forever/Kandinsky-3](https://github.com/ai-forever/Kandinsky-3).
+
+<Tip>
+
+Check out the [Kandinsky Community](https://huggingface.co/kandinsky-community) organization on the Hub for the official model checkpoints for tasks like text-to-image, image-to-image, and inpainting.
+
+</Tip>
+
+<Tip>
+
+Make sure to check out the schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-components-across-pipelines) section to learn how to efficiently load the same components into multiple pipelines.
+
+</Tip>
+
+## Kandinsky3Pipeline
+
+[[autodoc]] Kandinsky3Pipeline
+	- all
+	- __call__
+
+## Kandinsky3Img2ImgPipeline
+
+[[autodoc]] Kandinsky3Img2ImgPipeline
+	- all
+	- __call__
--- a/docs/source/en/api/pipelines/latent_diffusion_uncond.md
+++ b/docs/source/en/api/pipelines/latent_diffusion_uncond.md
@@ -1,35 +0,0 @@
-<!--Copyright 2023 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# Unconditional Latent Diffusion
-
-Unconditional Latent Diffusion was proposed in [High-Resolution Image Synthesis with Latent Diffusion Models](https://huggingface.co/papers/2112.10752) by Robin Rombach, Andreas Blattmann, Dominik Lorenz, Patrick Esser, Björn Ommer.
-
-The abstract from the paper is:
-
-*By decomposing the image formation process into a sequential application of denoising autoencoders, diffusion models (DMs) achieve state-of-the-art synthesis results on image data and beyond. Additionally, their formulation allows for a guiding mechanism to control the image generation process without retraining. However, since these models typically operate directly in pixel space, optimization of powerful DMs often consumes hundreds of GPU days and inference is expensive due to sequential evaluations. To enable DM training on limited computational resources while retaining their quality and flexibility, we apply them in the latent space of powerful pretrained autoencoders. In contrast to previous work, training diffusion models on such a representation allows for the first time to reach a near-optimal point between complexity reduction and detail preservation, greatly boosting visual fidelity. By introducing cross-attention layers into the model architecture, we turn diffusion models into powerful and flexible generators for general conditioning inputs such as text or bounding boxes and high-resolution synthesis becomes possible in a convolutional manner. Our latent diffusion models (LDMs) achieve a new state of the art for image inpainting and highly competitive performance on various tasks, including unconditional image generation, semantic scene synthesis, and super-resolution, while significantly reducing computational requirements compared to pixel-based DMs.*
-
-The original codebase can be found at [CompVis/latent-diffusion](https://github.com/CompVis/latent-diffusion).
-
-<Tip>
-
-Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-components-across-pipelines) section to learn how to efficiently load the same components into multiple pipelines.
-
-</Tip>
-
-## LDMPipeline
-[[autodoc]] LDMPipeline
-	- all
-	- __call__
-
-## ImagePipelineOutput
-[[autodoc]] pipelines.ImagePipelineOutput
--- a/docs/source/en/api/pipelines/model_editing.md
+++ b/docs/source/en/api/pipelines/model_editing.md
@@ -1,35 +0,0 @@
-<!--Copyright 2023 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# Text-to-image model editing
-
-[Editing Implicit Assumptions in Text-to-Image Diffusion Models](https://huggingface.co/papers/2303.08084) is by Hadas Orgad, Bahjat Kawar, and Yonatan Belinkov. This pipeline enables editing diffusion model weights, such that its assumptions of a given concept are changed. The resulting change is expected to take effect in all prompt generations related to the edited concept.
-
-The abstract from the paper is:
-
-*Text-to-image diffusion models often make implicit assumptions about the world when generating images. While some assumptions are useful (e.g., the sky is blue), they can also be outdated, incorrect, or reflective of social biases present in the training data. Thus, there is a need to control these assumptions without requiring explicit user input or costly re-training. In this work, we aim to edit a given implicit assumption in a pre-trained diffusion model. Our Text-to-Image Model Editing method, TIME for short, receives a pair of inputs: a "source" under-specified prompt for which the model makes an implicit assumption (e.g., "a pack of roses"), and a "destination" prompt that describes the same setting, but with a specified desired attribute (e.g., "a pack of blue roses"). TIME then updates the model's cross-attention layers, as these layers assign visual meaning to textual tokens. We edit the projection matrices in these layers such that the source prompt is projected close to the destination prompt. Our method is highly efficient, as it modifies a mere 2.2% of the model's parameters in under one second. To evaluate model editing approaches, we introduce TIMED (TIME Dataset), containing 147 source and destination prompt pairs from various domains. Our experiments (using Stable Diffusion) show that TIME is successful in model editing, generalizes well for related prompts unseen during editing, and imposes minimal effect on unrelated generations.*
-
-You can find additional information about model editing on the [project page](https://time-diffusion.github.io/), [original codebase](https://github.com/bahjat-kawar/time-diffusion), and try it out in a [demo](https://huggingface.co/spaces/bahjat-kawar/time-diffusion).
-
-<Tip>
-
-Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-components-across-pipelines) section to learn how to efficiently load the same components into multiple pipelines.
-
-</Tip>
-
-## StableDiffusionModelEditingPipeline
-[[autodoc]] StableDiffusionModelEditingPipeline
-	- __call__
-	- all
-
-## StableDiffusionPipelineOutput
-[[autodoc]] pipelines.stable_diffusion.StableDiffusionPipelineOutput
--- a/docs/source/en/api/pipelines/overview.md
+++ b/docs/source/en/api/pipelines/overview.md
@@ -40,6 +40,8 @@ The table below lists all the pipelines currently available in 🤗 Diffusers an
 | [Consistency Models](consistency_models) | unconditional image generation |
 | [ControlNet](controlnet) | text2image, image2image, inpainting |
 | [ControlNet with Stable Diffusion XL](controlnet_sdxl) | text2image |
+| [ControlNet-XS](controlnetxs) | text2image |
+| [ControlNet-XS with Stable Diffusion XL](controlnetxs_sdxl) | text2image |
 | [Cycle Diffusion](cycle_diffusion) | image2image |
 | [Dance Diffusion](dance_diffusion) | unconditional audio generation |
 | [DDIM](ddim) | unconditional image generation |
@@ -51,9 +53,10 @@ The table below lists all the pipelines currently available in 🤗 Diffusers an
 | [InstructPix2Pix](pix2pix) | image editing |
 | [Kandinsky 2.1](kandinsky) | text2image, image2image, inpainting, interpolation |
 | [Kandinsky 2.2](kandinsky_v22) | text2image, image2image, inpainting |
+| [Kandinsky 3](kandinsky3) | text2image, image2image |
 | [Latent Consistency Models](latent_consistency_models) | text2image |
 | [Latent Diffusion](latent_diffusion) | text2image, super-resolution |
-| [LDM3D](stable_diffusion/ldm3d_diffusion) | text2image, text-to-3D |
+| [LDM3D](stable_diffusion/ldm3d_diffusion) | text2image, text-to-3D, text-to-pano, upscaling |
 | [MultiDiffusion](panorama) | text2image |
 | [MusicLDM](musicldm) | text2audio |
 | [Paint by Example](paint_by_example) | inpainting |
@@ -70,6 +73,7 @@ The table below lists all the pipelines currently available in 🤗 Diffusers an
 | [Stable Diffusion](stable_diffusion/overview) | text2image, image2image, depth2image, inpainting, image variation, latent upscaler, super-resolution |
 | [Stable Diffusion Model Editing](model_editing) | model editing |
 | [Stable Diffusion XL](stable_diffusion/stable_diffusion_xl) | text2image, image2image, inpainting |
+| [Stable Diffusion XL Turbo](stable_diffusion/sdxl_turbo) | text2image, image2image, inpainting |
 | [Stable unCLIP](stable_unclip) | text2image, image variation |
 | [Stochastic Karras VE](stochastic_karras_ve) | unconditional image generation |
 | [T2I-Adapter](stable_diffusion/adapter) | text2image |
--- a/docs/source/en/api/pipelines/paradigms.md
+++ b/docs/source/en/api/pipelines/paradigms.md
@@ -1,51 +0,0 @@
-<!--Copyright 2023 ParaDiGMS authors and The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# Parallel Sampling of Diffusion Models
-
-[Parallel Sampling of Diffusion Models](https://huggingface.co/papers/2305.16317) is by Andy Shih, Suneel Belkhale, Stefano Ermon, Dorsa Sadigh, Nima Anari.
-
-The abstract from the paper is:
-
-*Diffusion models are powerful generative models but suffer from slow sampling, often taking 1000 sequential denoising steps for one sample. As a result, considerable efforts have been directed toward reducing the number of denoising steps, but these methods hurt sample quality. Instead of reducing the number of denoising steps (trading quality for speed), in this paper we explore an orthogonal approach: can we run the denoising steps in parallel (trading compute for speed)? In spite of the sequential nature of the denoising steps, we show that surprisingly it is possible to parallelize sampling via Picard iterations, by guessing the solution of future denoising steps and iteratively refining until convergence. With this insight, we present ParaDiGMS, a novel method to accelerate the sampling of pretrained diffusion models by denoising multiple steps in parallel. ParaDiGMS is the first diffusion sampling method that enables trading compute for speed and is even compatible with existing fast sampling techniques such as DDIM and DPMSolver. Using ParaDiGMS, we improve sampling speed by 2-4x across a range of robotics and image generation models, giving state-of-the-art sampling speeds of 0.2s on 100-step DiffusionPolicy and 14.6s on 1000-step StableDiffusion-v2 with no measurable degradation of task reward, FID score, or CLIP score.*
-
-The original codebase can be found at [AndyShih12/paradigms](https://github.com/AndyShih12/paradigms), and the pipeline was contributed by [AndyShih12](https://github.com/AndyShih12). ❤️
-
-## Tips
-
-This pipeline improves sampling speed by running denoising steps in parallel, at the cost of increased total FLOPs.
-Therefore, it is better to call this pipeline when running on multiple GPUs. Otherwise, without enough GPU bandwidth
-sampling may be even slower than sequential sampling.
-
-The two parameters to play with are `parallel` (batch size) and `tolerance`.
- If it fits in memory, for a 1000-step DDPM you can aim for a batch size of around 100 (for example, 8 GPUs and `batch_per_device=12` to get `parallel=96`). A higher batch size may not fit in memory, and lower batch size gives less parallelism.
- For tolerance, using a higher tolerance may get better speedups but can risk sample quality degradation. If there is quality degradation with the default tolerance, then use a lower tolerance like `0.001`.
-
-For a 1000-step DDPM on 8 A100 GPUs, you can expect around a 3x speedup from [`StableDiffusionParadigmsPipeline`] compared to the [`StableDiffusionPipeline`]
-by setting `parallel=80` and `tolerance=0.1`.
-
-🤗 Diffusers offers [distributed inference support](../../training/distributed_inference) for generating multiple prompts
-in parallel on multiple GPUs. But [`StableDiffusionParadigmsPipeline`] is designed for speeding up sampling of a single prompt by using multiple GPUs.
-
-<Tip>
-
-Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-components-across-pipelines) section to learn how to efficiently load the same components into multiple pipelines.
-
-</Tip>
-
-## StableDiffusionParadigmsPipeline
-[[autodoc]] StableDiffusionParadigmsPipeline
-	- __call__
-	- all
-
-## StableDiffusionPipelineOutput
-[[autodoc]] pipelines.stable_diffusion.StableDiffusionPipelineOutput
--- a/docs/source/en/api/pipelines/pix2pix_zero.md
+++ b/docs/source/en/api/pipelines/pix2pix_zero.md
@@ -1,289 +0,0 @@
-<!--Copyright 2023 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# Pix2Pix Zero
-
-[Zero-shot Image-to-Image Translation](https://huggingface.co/papers/2302.03027) is by Gaurav Parmar, Krishna Kumar Singh, Richard Zhang, Yijun Li, Jingwan Lu, and Jun-Yan Zhu.
-
-The abstract from the paper is:
-
-*Large-scale text-to-image generative models have shown their remarkable ability to synthesize diverse and high-quality images. However, it is still challenging to directly apply these models for editing real images for two reasons. First, it is hard for users to come up with a perfect text prompt that accurately describes every visual detail in the input image. Second, while existing models can introduce desirable changes in certain regions, they often dramatically alter the input content and introduce unexpected changes in unwanted regions. In this work, we propose pix2pix-zero, an image-to-image translation method that can preserve the content of the original image without manual prompting. We first automatically discover editing directions that reflect desired edits in the text embedding space. To preserve the general content structure after editing, we further propose cross-attention guidance, which aims to retain the cross-attention maps of the input image throughout the diffusion process. In addition, our method does not need additional training for these edits and can directly use the existing pre-trained text-to-image diffusion model. We conduct extensive experiments and show that our method outperforms existing and concurrent works for both real and synthetic image editing.*
-
-You can find additional information about Pix2Pix Zero on the [project page](https://pix2pixzero.github.io/),  [original codebase](https://github.com/pix2pixzero/pix2pix-zero), and try it out in a [demo](https://huggingface.co/spaces/pix2pix-zero-library/pix2pix-zero-demo).
-
-## Tips
-
-* The pipeline can be conditioned on real input images. Check out the code examples below to know more.
-* The pipeline exposes two arguments namely `source_embeds` and `target_embeds`
-that let you control the direction of the semantic edits in the final image to be generated. Let's say,
-you wanted to translate from "cat" to "dog". In this case, the edit direction will be "cat -> dog". To reflect
-this in the pipeline, you simply have to set the embeddings related to the phrases including "cat" to
-`source_embeds` and "dog" to `target_embeds`. Refer to the code example below for more details.
-* When you're using this pipeline from a prompt, specify the _source_ concept in the prompt. Taking
-the above example, a valid input prompt would be: "a high resolution painting of a **cat** in the style of van gogh".
-* If you wanted to reverse the direction in the example above, i.e., "dog -> cat", then it's recommended to:
-    * Swap the `source_embeds` and `target_embeds`.
-    * Change the input prompt to include "dog".
-* To learn more about how the source and target embeddings are generated, refer to the [original paper](https://arxiv.org/abs/2302.03027). Below, we also provide some directions on how to generate the embeddings.
-* Note that the quality of the outputs generated with this pipeline is dependent on how good the `source_embeds` and `target_embeds` are. Please, refer to [this discussion](#generating-source-and-target-embeddings) for some suggestions on the topic.
-
-## Available Pipelines:
-
-| Pipeline | Tasks | Demo
-|---|---|:---:|
-| [StableDiffusionPix2PixZeroPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py) | *Text-Based Image Editing* | [🤗 Space](https://huggingface.co/spaces/pix2pix-zero-library/pix2pix-zero-demo) |
-
-<!-- TODO: add Colab -->
-
-## Usage example
-
-### Based on an image generated with the input prompt
-
-```python
-import requests
-import torch
-
-from diffusers import DDIMScheduler, StableDiffusionPix2PixZeroPipeline
-
-
-def download(embedding_url, local_filepath):
-    r = requests.get(embedding_url)
-    with open(local_filepath, "wb") as f:
-        f.write(r.content)
-
-
-model_ckpt = "CompVis/stable-diffusion-v1-4"
-pipeline = StableDiffusionPix2PixZeroPipeline.from_pretrained(
-    model_ckpt, conditions_input_image=False, torch_dtype=torch.float16
-)
-pipeline.scheduler = DDIMScheduler.from_config(pipeline.scheduler.config)
-pipeline.to("cuda")
-
-prompt = "a high resolution painting of a cat in the style of van gogh"
-src_embs_url = "https://github.com/pix2pixzero/pix2pix-zero/raw/main/assets/embeddings_sd_1.4/cat.pt"
-target_embs_url = "https://github.com/pix2pixzero/pix2pix-zero/raw/main/assets/embeddings_sd_1.4/dog.pt"
-
-for url in [src_embs_url, target_embs_url]:
-    download(url, url.split("/")[-1])
-
-src_embeds = torch.load(src_embs_url.split("/")[-1])
-target_embeds = torch.load(target_embs_url.split("/")[-1])
-
-image = pipeline(
-    prompt,
-    source_embeds=src_embeds,
-    target_embeds=target_embeds,
-    num_inference_steps=50,
-    cross_attention_guidance_amount=0.15,
-).images[0]
-image
-```
-
-### Based on an input image
-
-When the pipeline is conditioned on an input image, we first obtain an inverted
-noise from it using a `DDIMInverseScheduler` with the help of a generated caption. Then the inverted noise is used to start the generation process.
-
-First, let's load our pipeline:
-
-```py
-import torch
-from transformers import BlipForConditionalGeneration, BlipProcessor
-from diffusers import DDIMScheduler, DDIMInverseScheduler, StableDiffusionPix2PixZeroPipeline
-
-captioner_id = "Salesforce/blip-image-captioning-base"
-processor = BlipProcessor.from_pretrained(captioner_id)
-model = BlipForConditionalGeneration.from_pretrained(captioner_id, torch_dtype=torch.float16, low_cpu_mem_usage=True)
-
-sd_model_ckpt = "CompVis/stable-diffusion-v1-4"
-pipeline = StableDiffusionPix2PixZeroPipeline.from_pretrained(
-    sd_model_ckpt,
-    caption_generator=model,
-    caption_processor=processor,
-    torch_dtype=torch.float16,
-    safety_checker=None,
-)
-pipeline.scheduler = DDIMScheduler.from_config(pipeline.scheduler.config)
-pipeline.inverse_scheduler = DDIMInverseScheduler.from_config(pipeline.scheduler.config)
-pipeline.enable_model_cpu_offload()
-```
-
-Then, we load an input image for conditioning and obtain a suitable caption for it:
-
-```py
-from diffusers.utils import load_image
-
-img_url = "https://github.com/pix2pixzero/pix2pix-zero/raw/main/assets/test_images/cats/cat_6.png"
-raw_image = load_image(url).resize((512, 512))
-caption = pipeline.generate_caption(raw_image)
-caption
-```
-
-Then we employ the generated caption and the input image to get the inverted noise:
-
-```py
-generator = torch.manual_seed(0)
-inv_latents = pipeline.invert(caption, image=raw_image, generator=generator).latents
-```
-
-Now, generate the image with edit directions:
-
-```py
-# See the "Generating source and target embeddings" section below to
-# automate the generation of these captions with a pre-trained model like Flan-T5 as explained below.
-source_prompts = ["a cat sitting on the street", "a cat playing in the field", "a face of a cat"]
-target_prompts = ["a dog sitting on the street", "a dog playing in the field", "a face of a dog"]
-
-source_embeds = pipeline.get_embeds(source_prompts, batch_size=2)
-target_embeds = pipeline.get_embeds(target_prompts, batch_size=2)
-
-
-image = pipeline(
-    caption,
-    source_embeds=source_embeds,
-    target_embeds=target_embeds,
-    num_inference_steps=50,
-    cross_attention_guidance_amount=0.15,
-    generator=generator,
-    latents=inv_latents,
-    negative_prompt=caption,
-).images[0]
-image
-```
-
-## Generating source and target embeddings
-
-The authors originally used the [GPT-3 API](https://openai.com/api/) to generate the source and target captions for discovering
-edit directions. However, we can also leverage open source and public models for the same purpose.
-Below, we provide an end-to-end example with the [Flan-T5](https://huggingface.co/docs/transformers/model_doc/flan-t5) model
-for generating captions and [CLIP](https://huggingface.co/docs/transformers/model_doc/clip) for
-computing embeddings on the generated captions.
-
-**1. Load the generation model**:
-
-```py
-import torch
-from transformers import AutoTokenizer, T5ForConditionalGeneration
-
-tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-xl")
-model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-xl", device_map="auto", torch_dtype=torch.float16)
-```
-
-**2. Construct a starting prompt**:
-
-```py
-source_concept = "cat"
-target_concept = "dog"
-
-source_text = f"Provide a caption for images containing a {source_concept}. "
-"The captions should be in English and should be no longer than 150 characters."
-
-target_text = f"Provide a caption for images containing a {target_concept}. "
-"The captions should be in English and should be no longer than 150 characters."
-```
-
-Here, we're interested in the "cat -> dog" direction.
-
-**3. Generate captions**:
-
-We can use a utility like so for this purpose.
-
-```py
-def generate_captions(input_prompt):
-    input_ids = tokenizer(input_prompt, return_tensors="pt").input_ids.to("cuda")
-
-    outputs = model.generate(
-        input_ids, temperature=0.8, num_return_sequences=16, do_sample=True, max_new_tokens=128, top_k=10
-    )
-    return tokenizer.batch_decode(outputs, skip_special_tokens=True)
-```
-
-And then we just call it to generate our captions:
-
-```py
-source_captions = generate_captions(source_text)
-target_captions = generate_captions(target_concept)
-print(source_captions, target_captions, sep='\n')
-```
-
-We encourage you to play around with the different parameters supported by the
-`generate()` method ([documentation](https://huggingface.co/docs/transformers/main/en/main_classes/text_generation#transformers.generation_tf_utils.TFGenerationMixin.generate)) for the generation quality you are looking for.
-
-**4. Load the embedding model**:
-
-Here, we need to use the same text encoder model used by the subsequent Stable Diffusion model.
-
-```py
-from diffusers import StableDiffusionPix2PixZeroPipeline
-
-pipeline = StableDiffusionPix2PixZeroPipeline.from_pretrained(
-    "CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16
-)
-pipeline = pipeline.to("cuda")
-tokenizer = pipeline.tokenizer
-text_encoder = pipeline.text_encoder
-```
-
-**5. Compute embeddings**:
-
-```py
-import torch
-
-def embed_captions(sentences, tokenizer, text_encoder, device="cuda"):
-    with torch.no_grad():
-        embeddings = []
-        for sent in sentences:
-            text_inputs = tokenizer(
-                sent,
-                padding="max_length",
-                max_length=tokenizer.model_max_length,
-                truncation=True,
-                return_tensors="pt",
-            )
-            text_input_ids = text_inputs.input_ids
-            prompt_embeds = text_encoder(text_input_ids.to(device), attention_mask=None)[0]
-            embeddings.append(prompt_embeds)
-    return torch.concatenate(embeddings, dim=0).mean(dim=0).unsqueeze(0)
-
-source_embeddings = embed_captions(source_captions, tokenizer, text_encoder)
-target_embeddings = embed_captions(target_captions, tokenizer, text_encoder)
-```
-
-And you're done! [Here](https://colab.research.google.com/drive/1tz2C1EdfZYAPlzXXbTnf-5PRBiR8_R1F?usp=sharing) is a Colab Notebook that you can use to interact with the entire process.
-
-Now, you can use these embeddings directly while calling the pipeline:
-
-```py
-from diffusers import DDIMScheduler
-
-pipeline.scheduler = DDIMScheduler.from_config(pipeline.scheduler.config)
-
-image = pipeline(
-    prompt,
-    source_embeds=source_embeddings,
-    target_embeds=target_embeddings,
-    num_inference_steps=50,
-    cross_attention_guidance_amount=0.15,
-).images[0]
-image
-```
-
-<Tip>
-
-Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-components-across-pipelines) section to learn how to efficiently load the same components into multiple pipelines.
-
-</Tip>
-
-## StableDiffusionPix2PixZeroPipeline
-[[autodoc]] StableDiffusionPix2PixZeroPipeline
-	- __call__
-	- all
--- a/docs/source/en/api/pipelines/pixart.md
+++ b/docs/source/en/api/pipelines/pixart.md
@@ -35,6 +35,112 @@ Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers)

 </Tip>

+## Inference with under 8GB GPU VRAM
+
+Run the [`PixArtAlphaPipeline`] with under 8GB GPU VRAM by loading the text encoder in 8-bit precision. Let's walk through a full-fledged example. 
+
+First, install the [bitsandbytes](https://github.com/TimDettmers/bitsandbytes) library:
+
+```bash
+pip install -U bitsandbytes
+```
+
+Then load the text encoder in 8-bit:
+
+```python
+from transformers import T5EncoderModel
+from diffusers import PixArtAlphaPipeline
+import torch
+
+text_encoder = T5EncoderModel.from_pretrained(
+    "PixArt-alpha/PixArt-XL-2-1024-MS",
+    subfolder="text_encoder",
+    load_in_8bit=True,
+    device_map="auto",
+
+)
+pipe = PixArtAlphaPipeline.from_pretrained(
+    "PixArt-alpha/PixArt-XL-2-1024-MS",
+    text_encoder=text_encoder,
+    transformer=None,
+    device_map="auto"
+)
+```
+
+Now, use the `pipe` to encode a prompt:
+
+```python
+with torch.no_grad():
+    prompt = "cute cat"
+    prompt_embeds, prompt_attention_mask, negative_embeds, negative_prompt_attention_mask = pipe.encode_prompt(prompt)
+```
+
+Since text embeddings have been computed, remove the `text_encoder` and `pipe` from the memory, and free up som GPU VRAM:
+
+```python
+import gc 
+
+def flush():
+    gc.collect()
+    torch.cuda.empty_cache()
+
+del text_encoder
+del pipe
+flush()
+```
+
+Then compute the latents with the prompt embeddings as inputs:
+
+```python
+pipe = PixArtAlphaPipeline.from_pretrained(
+    "PixArt-alpha/PixArt-XL-2-1024-MS",
+    text_encoder=None,
+    torch_dtype=torch.float16,
+).to("cuda")
+
+latents = pipe(
+    negative_prompt=None, 
+    prompt_embeds=prompt_embeds,
+    negative_prompt_embeds=negative_embeds,
+    prompt_attention_mask=prompt_attention_mask,
+    negative_prompt_attention_mask=negative_prompt_attention_mask,
+    num_images_per_prompt=1,
+    output_type="latent",
+).images
+
+del pipe.transformer
+flush()
+```
+
+<Tip>
+
+Notice that while initializing `pipe`, you're setting `text_encoder` to `None` so that it's not loaded.
+
+</Tip>
+
+Once the latents are computed, pass it off to the VAE to decode into a real image:
+
+```python
+with torch.no_grad():
+    image = pipe.vae.decode(latents / pipe.vae.config.scaling_factor, return_dict=False)[0]
+image = pipe.image_processor.postprocess(image, output_type="pil")[0]
+image.save("cat.png")
+```
+
+By deleting components you aren't using and flushing the GPU VRAM, you should be able to run [`PixArtAlphaPipeline`] with under 8GB GPU VRAM.
+
+![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/pixart/8bits_cat.png)
+
+If you want a report of your memory-usage, run this [script](https://gist.github.com/sayakpaul/3ae0f847001d342af27018a96f467e4e).
+
+<Tip warning={true}>
+
+Text embeddings computed in 8-bit can impact the quality of the generated images because of the information loss in the representation space caused by the reduced precision. It's recommended to compare the outputs with and without 8-bit.
+
+</Tip>
+
+While loading the `text_encoder`, you set `load_in_8bit` to `True`. You could also specify `load_in_4bit` to bring your memory requirements down even further to under 7GB.
+
 ## PixArtAlphaPipeline

 [[autodoc]] PixArtAlphaPipeline
--- a/docs/source/en/api/pipelines/pndm.md
+++ b/docs/source/en/api/pipelines/pndm.md
@@ -1,35 +0,0 @@
-<!--Copyright 2023 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# PNDM
-
-[Pseudo Numerical Methods for Diffusion Models on Manifolds](https://huggingface.co/papers/2202.09778) (PNDM) is by Luping Liu, Yi Ren, Zhijie Lin and Zhou Zhao.
-
-The abstract from the paper is:
-
-*Denoising Diffusion Probabilistic Models (DDPMs) can generate high-quality samples such as image and audio samples. However, DDPMs require hundreds to thousands of iterations to produce final samples. Several prior works have successfully accelerated DDPMs through adjusting the variance schedule (e.g., Improved Denoising Diffusion Probabilistic Models) or the denoising equation (e.g., Denoising Diffusion Implicit Models (DDIMs)). However, these acceleration methods cannot maintain the quality of samples and even introduce new noise at a high speedup rate, which limit their practicability. To accelerate the inference process while keeping the sample quality, we provide a fresh perspective that DDPMs should be treated as solving differential equations on manifolds. Under such a perspective, we propose pseudo numerical methods for diffusion models (PNDMs). Specifically, we figure out how to solve differential equations on manifolds and show that DDIMs are simple cases of pseudo numerical methods. We change several classical numerical methods to corresponding pseudo numerical methods and find that the pseudo linear multi-step method is the best in most situations. According to our experiments, by directly using pre-trained models on Cifar10, CelebA and LSUN, PNDMs can generate higher quality synthetic images with only 50 steps compared with 1000-step DDIMs (20x speedup), significantly outperform DDIMs with 250 steps (by around 0.4 in FID) and have good generalization on different variance schedules.*
-
-The original codebase can be found at [luping-liu/PNDM](https://github.com/luping-liu/PNDM).
-
-<Tip>
-
-Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-components-across-pipelines) section to learn how to efficiently load the same components into multiple pipelines.
-
-</Tip>
-
-## PNDMPipeline
-[[autodoc]] PNDMPipeline
-	- all
-	- __call__
-
-## ImagePipelineOutput
-[[autodoc]] pipelines.ImagePipelineOutput
--- a/docs/source/en/api/pipelines/repaint.md
+++ b/docs/source/en/api/pipelines/repaint.md
@@ -1,37 +0,0 @@
-<!--Copyright 2023 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# RePaint
-
-[RePaint: Inpainting using Denoising Diffusion Probabilistic Models](https://huggingface.co/papers/2201.09865) is by Andreas Lugmayr, Martin Danelljan, Andres Romero, Fisher Yu, Radu Timofte, Luc Van Gool.
-
-The abstract from the paper is:
-
-*Free-form inpainting is the task of adding new content to an image in the regions specified by an arbitrary binary mask. Most existing approaches train for a certain distribution of masks, which limits their generalization capabilities to unseen mask types. Furthermore, training with pixel-wise and perceptual losses often leads to simple textural extensions towards the missing areas instead of semantically meaningful generation. In this work, we propose RePaint: A Denoising Diffusion Probabilistic Model (DDPM) based inpainting approach that is applicable to even extreme masks. We employ a pretrained unconditional DDPM as the generative prior. To condition the generation process, we only alter the reverse diffusion iterations by sampling the unmasked regions using the given image information. Since this technique does not modify or condition the original DDPM network itself, the model produces high-quality and diverse output images for any inpainting form. We validate our method for both faces and general-purpose image inpainting using standard and extreme masks.
-RePaint outperforms state-of-the-art Autoregressive, and GAN approaches for at least five out of six mask distributions.*
-
-The original codebase can be found at [andreas128/RePaint](https://github.com/andreas128/RePaint).
-
-<Tip>
-
-Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-components-across-pipelines) section to learn how to efficiently load the same components into multiple pipelines.
-
-</Tip>
-
-
-## RePaintPipeline
-[[autodoc]] RePaintPipeline
-	- all
-	- __call__
-
-## ImagePipelineOutput
-[[autodoc]] pipelines.ImagePipelineOutput
--- a/docs/source/en/api/pipelines/score_sde_ve.md
+++ b/docs/source/en/api/pipelines/score_sde_ve.md
@@ -1,35 +0,0 @@
-<!--Copyright 2023 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# Score SDE VE
-
-[Score-Based Generative Modeling through Stochastic Differential Equations](https://huggingface.co/papers/2011.13456) (Score SDE) is by Yang Song, Jascha Sohl-Dickstein, Diederik P. Kingma, Abhishek Kumar, Stefano Ermon and Ben Poole. This pipeline implements the variance expanding (VE) variant of the stochastic differential equation method.
-
-The abstract from the paper is:
-
-*Creating noise from data is easy; creating data from noise is generative modeling. We present a stochastic differential equation (SDE) that smoothly transforms a complex data distribution to a known prior distribution by slowly injecting noise, and a corresponding reverse-time SDE that transforms the prior distribution back into the data distribution by slowly removing the noise. Crucially, the reverse-time SDE depends only on the time-dependent gradient field (\aka, score) of the perturbed data distribution. By leveraging advances in score-based generative modeling, we can accurately estimate these scores with neural networks, and use numerical SDE solvers to generate samples. We show that this framework encapsulates previous approaches in score-based generative modeling and diffusion probabilistic modeling, allowing for new sampling procedures and new modeling capabilities. In particular, we introduce a predictor-corrector framework to correct errors in the evolution of the discretized reverse-time SDE. We also derive an equivalent neural ODE that samples from the same distribution as the SDE, but additionally enables exact likelihood computation, and improved sampling efficiency. In addition, we provide a new way to solve inverse problems with score-based models, as demonstrated with experiments on class-conditional generation, image inpainting, and colorization. Combined with multiple architectural improvements, we achieve record-breaking performance for unconditional image generation on CIFAR-10 with an Inception score of 9.89 and FID of 2.20, a competitive likelihood of 2.99 bits/dim, and demonstrate high fidelity generation of 1024 x 1024 images for the first time from a score-based generative model.*
-
-The original codebase can be found at [yang-song/score_sde_pytorch](https://github.com/yang-song/score_sde_pytorch).
-
-<Tip>
-
-Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-components-across-pipelines) section to learn how to efficiently load the same components into multiple pipelines.
-
-</Tip>
-
-## ScoreSdeVePipeline
-[[autodoc]] ScoreSdeVePipeline
-	- all
-	- __call__
-
-## ImagePipelineOutput
-[[autodoc]] pipelines.ImagePipelineOutput
--- a/docs/source/en/api/pipelines/spectrogram_diffusion.md
+++ b/docs/source/en/api/pipelines/spectrogram_diffusion.md
@@ -1,37 +0,0 @@
-<!--Copyright 2023 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# Spectrogram Diffusion
-
-[Spectrogram Diffusion](https://huggingface.co/papers/2206.05408) is by Curtis Hawthorne, Ian Simon, Adam Roberts, Neil Zeghidour, Josh Gardner, Ethan Manilow, and Jesse Engel.
-
-*An ideal music synthesizer should be both interactive and expressive, generating high-fidelity audio in realtime for arbitrary combinations of instruments and notes. Recent neural synthesizers have exhibited a tradeoff between domain-specific models that offer detailed control of only specific instruments, or raw waveform models that can train on any music but with minimal control and slow generation. In this work, we focus on a middle ground of neural synthesizers that can generate audio from MIDI sequences with arbitrary combinations of instruments in realtime. This enables training on a wide range of transcription datasets with a single model, which in turn offers note-level control of composition and instrumentation across a wide range of instruments. We use a simple two-stage process: MIDI to spectrograms with an encoder-decoder Transformer, then spectrograms to audio with a generative adversarial network (GAN) spectrogram inverter. We compare training the decoder as an autoregressive model and as a Denoising Diffusion Probabilistic Model (DDPM) and find that the DDPM approach is superior both qualitatively and as measured by audio reconstruction and Fréchet distance metrics. Given the interactivity and generality of this approach, we find this to be a promising first step towards interactive and expressive neural synthesis for arbitrary combinations of instruments and notes.*
-
-The original codebase can be found at [magenta/music-spectrogram-diffusion](https://github.com/magenta/music-spectrogram-diffusion).
-
-![img](https://storage.googleapis.com/music-synthesis-with-spectrogram-diffusion/architecture.png)
-
-As depicted above the model takes as input a MIDI file and tokenizes it into a sequence of 5 second intervals. Each tokenized interval then together with positional encodings is passed through the Note Encoder and its representation is concatenated with the previous window's generated spectrogram representation obtained via the Context Encoder. For the initial 5 second window this is set to zero. The resulting context is then used as conditioning to sample the denoised Spectrogram from the MIDI window and we concatenate this spectrogram to the final output as well as use it for the context of the next MIDI window. The process repeats till we have gone over all the MIDI inputs. Finally a MelGAN decoder converts the potentially long spectrogram to audio which is the final result of this pipeline.
-
-<Tip>
-
-Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-components-across-pipelines) section to learn how to efficiently load the same components into multiple pipelines.
-
-</Tip>
-
-## SpectrogramDiffusionPipeline
-[[autodoc]] SpectrogramDiffusionPipeline
-	- all
-	- __call__
-
-## AudioPipelineOutput
-[[autodoc]] pipelines.AudioPipelineOutput
--- a/docs/source/en/api/pipelines/stable_diffusion/ldm3d_diffusion.md
+++ b/docs/source/en/api/pipelines/stable_diffusion/ldm3d_diffusion.md
@@ -14,6 +14,11 @@ specific language governing permissions and limitations under the License.

 LDM3D was proposed in [LDM3D: Latent Diffusion Model for 3D](https://huggingface.co/papers/2305.10853) by Gabriela Ben Melech Stan, Diana Wofk, Scottie Fox, Alex Redden, Will Saxton, Jean Yu, Estelle Aflalo, Shao-Yen Tseng, Fabio Nonato, Matthias Muller, and Vasudev Lal. LDM3D generates an image and a depth map from a given text prompt unlike the existing text-to-image diffusion models such as [Stable Diffusion](./overview) which only generates an image. With almost the same number of parameters, LDM3D achieves to create a latent space that can compress both the RGB images and the depth maps. 

+Two checkpoints are available for use:
+- [ldm3d-original](https://huggingface.co/Intel/ldm3d). The original checkpoint used in the [paper](https://arxiv.org/pdf/2305.10853.pdf)
+- [ldm3d-4c](https://huggingface.co/Intel/ldm3d-4c). The new version of LDM3D using 4 channels inputs instead of 6-channels inputs and finetuned on higher resolution images. 
+
+
 The abstract from the paper is:

 *This research paper proposes a Latent Diffusion Model for 3D (LDM3D) that generates both image and depth map data from a given text prompt, allowing users to generate RGBD images from text prompts. The LDM3D model is fine-tuned on a dataset of tuples containing an RGB image, depth map and caption, and validated through extensive experiments. We also develop an application called DepthFusion, which uses the generated RGB images and depth maps to create immersive and interactive 360-degree-view experiences using TouchDesigner. This technology has the potential to transform a wide range of industries, from entertainment and gaming to architecture and design. Overall, this paper presents a significant contribution to the field of generative AI and computer vision, and showcases the potential of LDM3D and DepthFusion to revolutionize content creation and digital experiences. A short video summarizing the approach can be found at [this url](https://t.ly/tdi2).*
@@ -26,12 +31,25 @@ Make sure to check out the Stable Diffusion [Tips](overview#tips) section to lea

 ## StableDiffusionLDM3DPipeline

-[[autodoc]] StableDiffusionLDM3DPipeline
+[[autodoc]] pipelines.stable_diffusion_ldm3d.pipeline_stable_diffusion_ldm3d.StableDiffusionLDM3DPipeline
 	- all
 	- __call__

+
 ## LDM3DPipelineOutput

-[[autodoc]] pipelines.stable_diffusion.pipeline_stable_diffusion_ldm3d.LDM3DPipelineOutput
+[[autodoc]] pipelines.stable_diffusion_ldm3d.pipeline_stable_diffusion_ldm3d.LDM3DPipelineOutput
 	- all
 	- __call__
+
+# Upscaler
+
+[LDM3D-VR](https://arxiv.org/pdf/2311.03226.pdf) is an extended version of LDM3D. 
+
+The abstract from the paper is:
+*Latent diffusion models have proven to be state-of-the-art in the creation and manipulation of visual outputs. However, as far as we know, the generation of depth maps jointly with RGB is still limited. We introduce LDM3D-VR, a suite of diffusion models targeting virtual reality development that includes LDM3D-pano and LDM3D-SR. These models enable the generation of panoramic RGBD based on textual prompts and the upscaling of low-resolution inputs to high-resolution RGBD, respectively. Our models are fine-tuned from existing pretrained models on datasets containing panoramic/high-resolution RGB images, depth maps and captions. Both models are evaluated in comparison to existing related methods*
+
+Two checkpoints are available for use:
+- [ldm3d-pano](https://huggingface.co/Intel/ldm3d-pano). This checkpoint enables the generation of panoramic images and requires the StableDiffusionLDM3DPipeline pipeline to be used.
+- [ldm3d-sr](https://huggingface.co/Intel/ldm3d-sr). This checkpoint enables the upscaling of RGB and depth images. Can be used in cascade after the original LDM3D pipeline using the StableDiffusionUpscaleLDM3DPipeline from communauty pipeline.
+
--- a/docs/source/en/api/pipelines/stable_diffusion/overview.md
+++ b/docs/source/en/api/pipelines/stable_diffusion/overview.md
@@ -121,10 +121,16 @@ The table below summarizes the available Stable Diffusion pipelines, their suppo
            <td class="px-4 py-2 text-gray-700">
            <a href="./ldm3d_diffusion">StableDiffusionLDM3D</a>
            </td>
-            <td class="px-4 py-2 text-gray-700">text-to-rgb, text-to-depth</td>
+            <td class="px-4 py-2 text-gray-700">text-to-rgb, text-to-depth, text-to-pano</td>
            <td class="px-4 py-2"><a href="https://huggingface.co/spaces/r23/ldm3d-space"><img src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue"/></a>
            </td>
        </tr>
+        <tr>
+            <td class="px-4 py-2 text-gray-700">
+            <a href="./ldm3d_diffusion">StableDiffusionUpscaleLDM3D</a>
+            </td>
+            <td class="px-4 py-2 text-gray-700">ldm3d super-resolution</td>
+        </tr>
        </tbody>
    </table>
    </div>
--- a/docs/source/en/api/pipelines/stable_diffusion/sdxl_turbo.md
+++ b/docs/source/en/api/pipelines/stable_diffusion/sdxl_turbo.md
@@ -0,0 +1,35 @@
+<!--Copyright 2023 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# SDXL Turbo
+
+Stable Diffusion XL (SDXL) Turbo was proposed in [Adversarial Diffusion Distillation](https://stability.ai/research/adversarial-diffusion-distillation) by Axel Sauer, Dominik Lorenz, Andreas Blattmann, and Robin Rombach.
+
+The abstract from the paper is:
+
+*We introduce Adversarial Diffusion Distillation (ADD), a novel training approach that efficiently samples large-scale foundational image diffusion models in just 1–4 steps while maintaining high image quality. We use score distillation to leverage large-scale off-the-shelf image diffusion models as a teacher signal in combination with an adversarial loss to ensure high image fidelity even in the low-step regime of one or two sampling steps. Our analyses show that our model clearly outperforms existing few-step methods (GANs,Latent Consistency Models) in a single step and reaches the performance of state-of-the-art diffusion models (SDXL) in only four steps. ADD is the first method to unlock single-step, real-time image synthesis with foundation models.*
+
+## Tips
+
+- SDXL Turbo uses the exact same architecture as [SDXL](./stable_diffusion_xl), which means it also has the same API. Please refer to the [SDXL](./stable_diffusion_xl) API reference for more details.
+- SDXL Turbo should disable guidance scale by setting `guidance_scale=0.0`
+- SDXL Turbo should use `timestep_spacing='trailing'` for the scheduler and use between 1 and 4 steps.
+- SDXL Turbo has been trained to generate images of size 512x512.
+- SDXL Turbo is open-access, but not open-source meaning that one might have to buy a model license in order to use it for commercial applications. Make sure to read the [official model card](https://huggingface.co/stabilityai/sdxl-turbo) to learn more.
+
+<Tip>
+
+To learn how to use SDXL Turbo for various tasks, how to optimize performance, and other usage examples, take a look at the [SDXL Turbo](../../../using-diffusers/sdxl_turbo) guide.
+
+Check out the [Stability AI](https://huggingface.co/stabilityai) Hub organization for the official base and refiner model checkpoints!
+
+</Tip>
--- a/docs/source/en/api/pipelines/stochastic_karras_ve.md
+++ b/docs/source/en/api/pipelines/stochastic_karras_ve.md
@@ -1,33 +0,0 @@
-<!--Copyright 2023 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# Stochastic Karras VE
-
-[Elucidating the Design Space of Diffusion-Based Generative Models](https://huggingface.co/papers/2206.00364) is by Tero Karras, Miika Aittala, Timo Aila and Samuli Laine. This pipeline implements the stochastic sampling tailored to variance expanding (VE) models.
-
-The abstract from the paper:
-
-*We argue that the theory and practice of diffusion-based generative models are currently unnecessarily convoluted and seek to remedy the situation by presenting a design space that clearly separates the concrete design choices. This lets us identify several changes to both the sampling and training processes, as well as preconditioning of the score networks. Together, our improvements yield new state-of-the-art FID of 1.79 for CIFAR-10 in a class-conditional setting and 1.97 in an unconditional setting, with much faster sampling (35 network evaluations per image) than prior designs. To further demonstrate their modular nature, we show that our design changes dramatically improve both the efficiency and quality obtainable with pre-trained score networks from previous work, including improving the FID of a previously trained ImageNet-64 model from 2.07 to near-SOTA 1.55, and after re-training with our proposed improvements to a new SOTA of 1.36.*
-
-<Tip>
-
-Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-components-across-pipelines) section to learn how to efficiently load the same components into multiple pipelines.
-
-</Tip>
-
-## KarrasVePipeline
-[[autodoc]] KarrasVePipeline
-	- all
-	- __call__
-
-## ImagePipelineOutput
-[[autodoc]] pipelines.ImagePipelineOutput
--- a/docs/source/en/api/pipelines/text_to_video_zero.md
+++ b/docs/source/en/api/pipelines/text_to_video_zero.md
@@ -92,6 +92,19 @@ imageio.mimsave("video.mp4", result, fps=4)
 ```


+- #### SDXL Support
+In order to use the SDXL model when generating a video from prompt, use the `TextToVideoZeroSDXLPipeline` pipeline:
+
+```python
+import torch
+from diffusers import TextToVideoZeroSDXLPipeline
+
+model_id = "stabilityai/stable-diffusion-xl-base-1.0"
+pipe = TextToVideoZeroSDXLPipeline.from_pretrained(
+    model_id, torch_dtype=torch.float16, variant="fp16", use_safetensors=True
+).to("cuda")
+```
+
 ### Text-To-Video with Pose Control
 To generate a video from prompt with additional pose control

@@ -141,7 +154,33 @@ To generate a video from prompt with additional pose control
    result = pipe(prompt=[prompt] * len(pose_images), image=pose_images, latents=latents).images
    imageio.mimsave("video.mp4", result, fps=4)
    ```
-
+- #### SDXL Support
+	
+	Since our attention processor also works with SDXL, it can be utilized to generate a video from prompt using ControlNet models powered by SDXL:
+	```python
+	import torch
+	from diffusers import StableDiffusionXLControlNetPipeline, ControlNetModel
+	from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_zero import CrossFrameAttnProcessor
+	
+	controlnet_model_id = 'thibaud/controlnet-openpose-sdxl-1.0'
+	model_id = 'stabilityai/stable-diffusion-xl-base-1.0'
+	
+	controlnet = ControlNetModel.from_pretrained(controlnet_model_id, torch_dtype=torch.float16)
+	pipe = StableDiffusionControlNetPipeline.from_pretrained(
+		model_id, controlnet=controlnet, torch_dtype=torch.float16
+	).to('cuda')
+	
+	# Set the attention processor
+	pipe.unet.set_attn_processor(CrossFrameAttnProcessor(batch_size=2))
+	pipe.controlnet.set_attn_processor(CrossFrameAttnProcessor(batch_size=2))
+	
+	# fix latents for all frames
+	latents = torch.randn((1, 4, 128, 128), device="cuda", dtype=torch.float16).repeat(len(pose_images), 1, 1, 1)
+	
+	prompt = "Darth Vader dancing in a desert"
+	result = pipe(prompt=[prompt] * len(pose_images), image=pose_images, latents=latents).images
+	imageio.mimsave("video.mp4", result, fps=4)
+	```

 ### Text-To-Video with Edge Control

@@ -253,5 +292,10 @@ Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers)
 	- all
 	- __call__

+## TextToVideoZeroSDXLPipeline
+[[autodoc]] TextToVideoZeroSDXLPipeline
+	- all
+	- __call__
+
 ## TextToVideoPipelineOutput
 [[autodoc]] pipelines.text_to_video_synthesis.pipeline_text_to_video_zero.TextToVideoPipelineOutput
--- a/docs/source/en/api/pipelines/value_guided_sampling.md
+++ b/docs/source/en/api/pipelines/value_guided_sampling.md
@@ -24,7 +24,7 @@ The abstract from the paper is:

 *Model-based reinforcement learning methods often use learning only for the purpose of estimating an approximate dynamics model, offloading the rest of the decision-making work to classical trajectory optimizers. While conceptually simple, this combination has a number of empirical shortcomings, suggesting that learned models may not be well-suited to standard trajectory optimization. In this paper, we consider what it would look like to fold as much of the trajectory optimization pipeline as possible into the modeling problem, such that sampling from the model and planning with it become nearly identical. The core of our technical approach lies in a diffusion probabilistic model that plans by iteratively denoising trajectories. We show how classifier-guided sampling and image inpainting can be reinterpreted as coherent planning strategies, explore the unusual and useful properties of diffusion-based planning methods, and demonstrate the effectiveness of our framework in control settings that emphasize long-horizon decision-making and test-time flexibility.*

-You can find additional information about the model on the [project page](https://diffusion-planning.github.io/), the [original codebase](https://github.com/jannerm/diffuser), or try it out in a demo [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/reinforcement_learning_with_diffusers.ipynb).
+You can find additional information about the model on the [project page](https://diffusion-planning.github.io/), the [original codebase](https://github.com/jannerm/diffuser), or try it out in a demo [notebook](https://colab.research.google.com/drive/1rXm8CX4ZdN5qivjJ2lhwhkOmt_m0CvU0#scrollTo=6HXJvhyqcITc&uniqifier=1).

 The script to run the model is available [here](https://github.com/huggingface/diffusers/tree/main/examples/reinforcement_learning).

--- a/docs/source/en/api/pipelines/versatile_diffusion.md
+++ b/docs/source/en/api/pipelines/versatile_diffusion.md
@@ -1,54 +0,0 @@
-<!--Copyright 2023 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# Versatile Diffusion
-
-Versatile Diffusion was proposed in [Versatile Diffusion: Text, Images and Variations All in One Diffusion Model](https://huggingface.co/papers/2211.08332) by Xingqian Xu, Zhangyang Wang, Eric Zhang, Kai Wang, Humphrey Shi.
-
-The abstract from the paper is:
-
-*Recent advances in diffusion models have set an impressive milestone in many generation tasks, and trending works such as DALL-E2, Imagen, and Stable Diffusion have attracted great interest. Despite the rapid landscape changes, recent new approaches focus on extensions and performance rather than capacity, thus requiring separate models for separate tasks. In this work, we expand the existing single-flow diffusion pipeline into a multi-task multimodal network, dubbed Versatile Diffusion (VD), that handles multiple flows of text-to-image, image-to-text, and variations in one unified model. The pipeline design of VD instantiates a unified multi-flow diffusion framework, consisting of sharable and swappable layer modules that enable the crossmodal generality beyond images and text. Through extensive experiments, we demonstrate that VD successfully achieves the following: a) VD outperforms the baseline approaches and handles all its base tasks with competitive quality; b) VD enables novel extensions such as disentanglement of style and semantics, dual- and multi-context blending, etc.; c) The success of our multi-flow multimodal framework over images and text may inspire further diffusion-based universal AI research.*
-
-## Tips
-
-You can load the more memory intensive "all-in-one" [`VersatileDiffusionPipeline`] that supports all the tasks or use the individual pipelines which are more memory efficient.
-
-| **Pipeline**                                         | **Supported tasks**               |
-|------------------------------------------------------|-----------------------------------|
-| [`VersatileDiffusionPipeline`]                       | all of the below                  |
-| [`VersatileDiffusionTextToImagePipeline`]            | text-to-image                     |
-| [`VersatileDiffusionImageVariationPipeline`]         | image variation                   |
-| [`VersatileDiffusionDualGuidedPipeline`]             | image-text dual guided generation |
-
-<Tip>
-
-Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-components-across-pipelines) section to learn how to efficiently load the same components into multiple pipelines.
-
-</Tip>
-
-## VersatileDiffusionPipeline
-[[autodoc]] VersatileDiffusionPipeline
-
-## VersatileDiffusionTextToImagePipeline
-[[autodoc]] VersatileDiffusionTextToImagePipeline
-	- all
-	- __call__
-
-## VersatileDiffusionImageVariationPipeline
-[[autodoc]] VersatileDiffusionImageVariationPipeline
-	- all
-	- __call__
-
-## VersatileDiffusionDualGuidedPipeline
-[[autodoc]] VersatileDiffusionDualGuidedPipeline
-	- all
-	- __call__
--- a/docs/source/en/api/pipelines/vq_diffusion.md
+++ b/docs/source/en/api/pipelines/vq_diffusion.md
@@ -1,35 +0,0 @@
-<!--Copyright 2023 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# VQ Diffusion
-
-[Vector Quantized Diffusion Model for Text-to-Image Synthesis](https://huggingface.co/papers/2111.14822) is by Shuyang Gu, Dong Chen, Jianmin Bao, Fang Wen, Bo Zhang, Dongdong Chen, Lu Yuan, Baining Guo.
-
-The abstract from the paper is:
-
-*We present the vector quantized diffusion (VQ-Diffusion) model for text-to-image generation. This method is based on a vector quantized variational autoencoder (VQ-VAE) whose latent space is modeled by a conditional variant of the recently developed Denoising Diffusion Probabilistic Model (DDPM). We find that this latent-space method is well-suited for text-to-image generation tasks because it not only eliminates the unidirectional bias with existing methods but also allows us to incorporate a mask-and-replace diffusion strategy to avoid the accumulation of errors, which is a serious problem with existing methods. Our experiments show that the VQ-Diffusion produces significantly better text-to-image generation results when compared with conventional autoregressive (AR) models with similar numbers of parameters. Compared with previous GAN-based text-to-image methods, our VQ-Diffusion can handle more complex scenes and improve the synthesized image quality by a large margin. Finally, we show that the image generation computation in our method can be made highly efficient by reparameterization. With traditional AR methods, the text-to-image generation time increases linearly with the output image resolution and hence is quite time consuming even for normal size images. The VQ-Diffusion allows us to achieve a better trade-off between quality and speed. Our experiments indicate that the VQ-Diffusion model with the reparameterization is fifteen times faster than traditional AR methods while achieving a better image quality.*
-
-The original codebase can be found at [microsoft/VQ-Diffusion](https://github.com/microsoft/VQ-Diffusion).
-
-<Tip>
-
-Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-components-across-pipelines) section to learn how to efficiently load the same components into multiple pipelines.
-
-</Tip>
-
-## VQDiffusionPipeline
-[[autodoc]] VQDiffusionPipeline
-	- all
-	- __call__
-
-## ImagePipelineOutput
-[[autodoc]] pipelines.ImagePipelineOutput
--- a/docs/source/en/api/schedulers/score_sde_vp.md
+++ b/docs/source/en/api/schedulers/score_sde_vp.md
@@ -25,4 +25,4 @@ The abstract from the paper is:
 </Tip>

 ## ScoreSdeVpScheduler
-[[autodoc]] schedulers.scheduling_sde_vp.ScoreSdeVpScheduler
+[[autodoc]] schedulers.deprecated.scheduling_sde_vp.ScoreSdeVpScheduler
--- a/docs/source/en/api/schedulers/stochastic_karras_ve.md
+++ b/docs/source/en/api/schedulers/stochastic_karras_ve.md
@@ -18,4 +18,4 @@ specific language governing permissions and limitations under the License.
 [[autodoc]] KarrasVeScheduler

 ## KarrasVeOutput
-[[autodoc]] schedulers.scheduling_karras_ve.KarrasVeOutput
+[[autodoc]] schedulers.deprecated.scheduling_karras_ve.KarrasVeOutput
--- a/docs/source/en/conceptual/contribution.md
+++ b/docs/source/en/conceptual/contribution.md
@@ -297,17 +297,37 @@ if you don't know yet what specific component you would like to add:
 - [Model or pipeline](https://github.com/huggingface/diffusers/issues?q=is%3Aopen+is%3Aissue+label%3A%22New+pipeline%2Fmodel%22)
 - [Scheduler](https://github.com/huggingface/diffusers/issues?q=is%3Aopen+is%3Aissue+label%3A%22New+scheduler%22)

-Before adding any of the three components, it is strongly recommended that you give the [Philosophy guide](philosophy) a read to better understand the design of any of the three components. Please be aware that
-we cannot merge model, scheduler, or pipeline additions that strongly diverge from our design philosophy
-as it will lead to API inconsistencies. If you fundamentally disagree with a design choice, please
-open a [Feedback issue](https://github.com/huggingface/diffusers/issues/new?assignees=&labels=&template=feedback.md&title=) instead so that it can be discussed whether a certain design
-pattern/design choice shall be changed everywhere in the library and whether we shall update our design philosophy. Consistency across the library is very important for us.
+Before adding any of the three components, it is strongly recommended that you give the [Philosophy guide](philosophy) a read to better understand the design of any of the three components. Please be aware that we cannot merge model, scheduler, or pipeline additions that strongly diverge from our design philosophy
+as it will lead to API inconsistencies. If you fundamentally disagree with a design choice, please open a [Feedback issue](https://github.com/huggingface/diffusers/issues/new?assignees=&labels=&template=feedback.md&title=) instead so that it can be discussed whether a certain design pattern/design choice shall be changed everywhere in the library and whether we shall update our design philosophy. Consistency across the library is very important for us.

-Please make sure to add links to the original codebase/paper to the PR and ideally also ping the
-original author directly on the PR so that they can follow the progress and potentially help with questions.
+Please make sure to add links to the original codebase/paper to the PR and ideally also ping the original author directly on the PR so that they can follow the progress and potentially help with questions.

 If you are unsure or stuck in the PR, don't hesitate to leave a message to ask for a first review or help.

+#### Copied from mechanism
+
+A unique and important feature to understand when adding any pipeline, model or scheduler code is the `# Copied from` mechanism. You'll see this all over the Diffusers codebase, and the reason we use it is to keep the codebase easy to understand and maintain. Marking code with the `# Copied from` mechanism forces the marked code to be identical to the code it was copied from. This makes it easy to update and propagate changes across many files whenever you run `make fix-copies`.
+
+For example, in the code example below, [`~diffusers.pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is the original code and `AltDiffusionPipelineOutput` uses the `# Copied from` mechanism to copy it. The only difference is changing the class prefix from `Stable` to `Alt`.
+
+```py
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_output.StableDiffusionPipelineOutput with Stable->Alt
+class AltDiffusionPipelineOutput(BaseOutput):
+    """
+    Output class for Alt Diffusion pipelines.
+
+    Args:
+        images (`List[PIL.Image.Image]` or `np.ndarray`)
+            List of denoised PIL images of length `batch_size` or NumPy array of shape `(batch_size, height, width,
+            num_channels)`.
+        nsfw_content_detected (`List[bool]`)
+            List indicating whether the corresponding generated image contains "not-safe-for-work" (nsfw) content or
+            `None` if safety checking could not be performed.
+    """
+```
+
+To learn more, read this section of the [~Don't~ Repeat Yourself*](https://huggingface.co/blog/transformers-design-philosophy#4-machine-learning-models-are-static) blog post.
+
 ## How to write a good issue

 **The better your issue is written, the higher the chances that it will be quickly resolved.**
--- a/docs/source/en/training/lcm_distill.md
+++ b/docs/source/en/training/lcm_distill.md
@@ -0,0 +1,255 @@
+<!--Copyright 2023 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Latent Consistency Distillation
+
+[Latent Consistency Models (LCMs)](https://hf.co/papers/2310.04378) are able to generate high-quality images in just a few steps, representing a big leap forward because many pipelines require at least 25+ steps. LCMs are produced by applying the latent consistency distillation method to any Stable Diffusion model. This method works by applying *one-stage guided distillation* to the latent space, and incorporating a *skipping-step* method to consistently skip timesteps to accelerate the distillation process (refer to section 4.1, 4.2, and 4.3 of the paper for more details).
+
+If you're training on a GPU with limited vRAM, try enabling `gradient_checkpointing`, `gradient_accumulation_steps`, and `mixed_precision` to reduce memory-usage and speedup training. You can reduce your memory-usage even more by enabling memory-efficient attention with [xFormers](../optimization/xformers) and [bitsandbytes'](https://github.com/TimDettmers/bitsandbytes) 8-bit optimizer.
+
+This guide will explore the [train_lcm_distill_sd_wds.py](https://github.com/huggingface/diffusers/blob/main/examples/consistency_distillation/train_lcm_distill_sd_wds.py) script to help you become more familiar with it, and how you can adapt it for your own use-case.
+
+Before running the script, make sure you install the library from source:
+
+```bash
+git clone https://github.com/huggingface/diffusers
+cd diffusers
+pip install .
+```
+
+Then navigate to the example folder containing the training script and install the required dependencies for the script you're using:
+
+```bash
+cd examples/consistency_distillation
+pip install -r requirements.txt
+```
+
+<Tip>
+
+🤗 Accelerate is a library for helping you train on multiple GPUs/TPUs or with mixed-precision. It'll automatically configure your training setup based on your hardware and environment. Take a look at the 🤗 Accelerate [Quick tour](https://huggingface.co/docs/accelerate/quicktour) to learn more.
+
+</Tip>
+
+Initialize an 🤗 Accelerate environment (try enabling `torch.compile` to significantly speedup training):
+
+```bash
+accelerate config
+```
+
+To setup a default 🤗 Accelerate environment without choosing any configurations:
+
+```bash
+accelerate config default
+```
+
+Or if your environment doesn't support an interactive shell, like a notebook, you can use:
+
+```bash
+from accelerate.utils import write_basic_config
+
+write_basic_config()
+```
+
+Lastly, if you want to train a model on your own dataset, take a look at the [Create a dataset for training](create_dataset) guide to learn how to create a dataset that works with the training script.
+
+## Script parameters
+
+<Tip>
+
+The following sections highlight parts of the training script that are important for understanding how to modify it, but it doesn't cover every aspect of the script in detail. If you're interested in learning more, feel free to read through the [script](https://github.com/huggingface/diffusers/blob/main/examples/consistency_distillation/train_lcm_distill_sd_wds.py) and let us know if you have any questions or concerns.
+
+</Tip>
+
+The training script provides many parameters to help you customize your training run. All of the parameters and their descriptions are found in the [`parse_args()`](https://github.com/huggingface/diffusers/blob/3b37488fa3280aed6a95de044d7a42ffdcb565ef/examples/consistency_distillation/train_lcm_distill_sd_wds.py#L419) function. This function provides default values for each parameter, such as the training batch size and learning rate, but you can also set your own values in the training command if you'd like.
+
+For example, to speedup training with mixed precision using the fp16 format, add the `--mixed_precision` parameter to the training command:
+
+```bash
+accelerate launch train_lcm_distill_sd_wds.py \
+  --mixed_precision="fp16"
+```
+
+Most of the parameters are identical to the parameters in the [Text-to-image](text2image#script-parameters) training guide, so you'll focus on the parameters that are relevant to latent consistency distillation in this guide.
+
+- `--pretrained_teacher_model`: the path to a pretrained latent diffusion model to use as the teacher model
+- `--pretrained_vae_model_name_or_path`: path to a pretrained VAE; the SDXL VAE is known to suffer from numerical instability, so this parameter allows you to specify an alternative VAE (like this [VAE]((https://huggingface.co/madebyollin/sdxl-vae-fp16-fix)) by madebyollin which works in fp16)
+- `--w_min` and `--w_max`: the minimum and maximum guidance scale values for guidance scale sampling
+- `--num_ddim_timesteps`: the number of timesteps for DDIM sampling
+- `--loss_type`: the type of loss (L2 or Huber) to calculate for latent consistency distillation; Huber loss is generally preferred because it's more robust to outliers
+- `--huber_c`: the Huber loss parameter
+
+## Training script
+
+The training script starts by creating a dataset class - [`Text2ImageDataset`](https://github.com/huggingface/diffusers/blob/3b37488fa3280aed6a95de044d7a42ffdcb565ef/examples/consistency_distillation/train_lcm_distill_sd_wds.py#L141) - for preprocessing the images and creating a training dataset.
+
+```py
+def transform(example):
+    image = example["image"]
+    image = TF.resize(image, resolution, interpolation=transforms.InterpolationMode.BILINEAR)
+
+    c_top, c_left, _, _ = transforms.RandomCrop.get_params(image, output_size=(resolution, resolution))
+    image = TF.crop(image, c_top, c_left, resolution, resolution)
+    image = TF.to_tensor(image)
+    image = TF.normalize(image, [0.5], [0.5])
+
+    example["image"] = image
+    return example
+```
+
+For improved performance on reading and writing large datasets stored in the cloud, this script uses the [WebDataset](https://github.com/webdataset/webdataset) format to create a preprocessing pipeline to apply transforms and create a dataset and dataloader for training. Images are processed and fed to the training loop without having to download the full dataset first.
+
+```py
+processing_pipeline = [
+    wds.decode("pil", handler=wds.ignore_and_continue),
+    wds.rename(image="jpg;png;jpeg;webp", text="text;txt;caption", handler=wds.warn_and_continue),
+    wds.map(filter_keys({"image", "text"})),
+    wds.map(transform),
+    wds.to_tuple("image", "text"),
+]
+```
+
+In the [`main()`](https://github.com/huggingface/diffusers/blob/3b37488fa3280aed6a95de044d7a42ffdcb565ef/examples/consistency_distillation/train_lcm_distill_sd_wds.py#L768) function, all the necessary components like the noise scheduler, tokenizers, text encoders, and VAE are loaded. The teacher UNet is also loaded here and then you can create a student UNet from the teacher UNet. The student UNet is updated by the optimizer during training.
+
+```py
+teacher_unet = UNet2DConditionModel.from_pretrained(
+    args.pretrained_teacher_model, subfolder="unet", revision=args.teacher_revision
+)
+
+unet = UNet2DConditionModel(**teacher_unet.config)
+unet.load_state_dict(teacher_unet.state_dict(), strict=False)
+unet.train()
+```
+
+Now you can create the [optimizer](https://github.com/huggingface/diffusers/blob/3b37488fa3280aed6a95de044d7a42ffdcb565ef/examples/consistency_distillation/train_lcm_distill_sd_wds.py#L979) to update the UNet parameters:
+
+```py
+optimizer = optimizer_class(
+    unet.parameters(),
+    lr=args.learning_rate,
+    betas=(args.adam_beta1, args.adam_beta2),
+    weight_decay=args.adam_weight_decay,
+    eps=args.adam_epsilon,
+)
+```
+
+Create the [dataset](https://github.com/huggingface/diffusers/blob/3b37488fa3280aed6a95de044d7a42ffdcb565ef/examples/consistency_distillation/train_lcm_distill_sd_wds.py#L994):
+
+```py
+dataset = Text2ImageDataset(
+    train_shards_path_or_url=args.train_shards_path_or_url,
+    num_train_examples=args.max_train_samples,
+    per_gpu_batch_size=args.train_batch_size,
+    global_batch_size=args.train_batch_size * accelerator.num_processes,
+    num_workers=args.dataloader_num_workers,
+    resolution=args.resolution,
+    shuffle_buffer_size=1000,
+    pin_memory=True,
+    persistent_workers=True,
+)
+train_dataloader = dataset.train_dataloader
+```
+
+Next, you're ready to setup the [training loop](https://github.com/huggingface/diffusers/blob/3b37488fa3280aed6a95de044d7a42ffdcb565ef/examples/consistency_distillation/train_lcm_distill_sd_wds.py#L1049) and implement the latent consistency distillation method (see Algorithm 1 in the paper for more details). This section of the script takes care of adding noise to the latents, sampling and creating a guidance scale embedding, and predicting the original image from the noise.
+
+```py
+pred_x_0 = predicted_origin(
+    noise_pred,
+    start_timesteps,
+    noisy_model_input,
+    noise_scheduler.config.prediction_type,
+    alpha_schedule,
+    sigma_schedule,
+)
+
+model_pred = c_skip_start * noisy_model_input + c_out_start * pred_x_0
+```
+
+It gets the [teacher model predictions](https://github.com/huggingface/diffusers/blob/3b37488fa3280aed6a95de044d7a42ffdcb565ef/examples/consistency_distillation/train_lcm_distill_sd_wds.py#L1172) and the [LCM predictions](https://github.com/huggingface/diffusers/blob/3b37488fa3280aed6a95de044d7a42ffdcb565ef/examples/consistency_distillation/train_lcm_distill_sd_wds.py#L1209) next, calculates the loss, and then backpropagates it to the LCM.
+
+```py
+if args.loss_type == "l2":
+    loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
+elif args.loss_type == "huber":
+    loss = torch.mean(
+        torch.sqrt((model_pred.float() - target.float()) ** 2 + args.huber_c**2) - args.huber_c
+    )
+```
+
+If you want to learn more about how the training loop works, check out the [Understanding pipelines, models and schedulers tutorial](../using-diffusers/write_own_pipeline) which breaks down the basic pattern of the denoising process.
+
+## Launch the script
+
+Now you're ready to launch the training script and start distilling!
+
+For this guide, you'll use the `--train_shards_path_or_url` to specify the path to the [Conceptual Captions 12M](https://github.com/google-research-datasets/conceptual-12m) dataset stored on the Hub [here](https://huggingface.co/datasets/laion/conceptual-captions-12m-webdataset). Set the `MODEL_DIR` environment variable to the name of the teacher model and `OUTPUT_DIR` to where you want to save the model.
+
+```bash
+export MODEL_DIR="runwayml/stable-diffusion-v1-5"
+export OUTPUT_DIR="path/to/saved/model"
+
+accelerate launch train_lcm_distill_sd_wds.py \
+    --pretrained_teacher_model=$MODEL_DIR \
+    --output_dir=$OUTPUT_DIR \
+    --mixed_precision=fp16 \
+    --resolution=512 \
+    --learning_rate=1e-6 --loss_type="huber" --ema_decay=0.95 --adam_weight_decay=0.0 \
+    --max_train_steps=1000 \
+    --max_train_samples=4000000 \
+    --dataloader_num_workers=8 \
+    --train_shards_path_or_url="pipe:curl -L -s https://huggingface.co/datasets/laion/conceptual-captions-12m-webdataset/resolve/main/data/{00000..01099}.tar?download=true" \
+    --validation_steps=200 \
+    --checkpointing_steps=200 --checkpoints_total_limit=10 \
+    --train_batch_size=12 \
+    --gradient_checkpointing --enable_xformers_memory_efficient_attention \
+    --gradient_accumulation_steps=1 \
+    --use_8bit_adam \
+    --resume_from_checkpoint=latest \
+    --report_to=wandb \
+    --seed=453645634 \
+    --push_to_hub
+```
+
+Once training is complete, you can use your new LCM for inference.
+
+```py
+from diffusers import UNet2DConditionModel, DiffusionPipeline, LCMScheduler
+import torch
+
+unet = UNet2DConditionModel.from_pretrained("your-username/your-model", torch_dtype=torch.float16, variant="fp16")
+pipeline = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", unet=unet, torch_dtype=torch.float16, variant="fp16")
+
+pipeline.scheduler = LCMScheduler.from_config(pipe.scheduler.config)
+pipeline.to("cuda")
+
+prompt = "sushi rolls in the form of panda heads, sushi platter"
+
+image = pipeline(prompt, num_inference_steps=4, guidance_scale=1.0).images[0]
+```
+
+## LoRA
+
+LoRA is a training technique for significantly reducing the number of trainable parameters. As a result, training is faster and it is easier to store the resulting weights because they are a lot smaller (~100MBs). Use the [train_lcm_distill_lora_sd_wds.py](https://github.com/huggingface/diffusers/blob/main/examples/consistency_distillation/train_lcm_distill_lora_sd_wds.py) or [train_lcm_distill_lora_sdxl.wds.py](https://github.com/huggingface/diffusers/blob/main/examples/consistency_distillation/train_lcm_distill_lora_sdxl_wds.py) script to train with LoRA.
+
+The LoRA training script is discussed in more detail in the [LoRA training](lora) guide.
+
+## Stable Diffusion XL
+
+Stable Diffusion XL (SDXL) is a powerful text-to-image model that generates high-resolution images, and it adds a second text-encoder to its architecture. Use the [train_lcm_distill_sdxl_wds.py](https://github.com/huggingface/diffusers/blob/main/examples/consistency_distillation/train_lcm_distill_sdxl_wds.py) script to train a SDXL model with LoRA.
+
+The SDXL training script is discussed in more detail in the [SDXL training](sdxl) guide.
+
+## Next steps
+
+Congratulations on distilling a LCM model! To learn more about LCM, the following may be helpful:
+
+- Learn how to use [LCMs for inference](../using-diffusers/lcm) for text-to-image, image-to-image, and with LoRA checkpoints.
+- Read the [SDXL in 4 steps with Latent Consistency LoRAs](https://huggingface.co/blog/lcm_lora) blog post to learn more about SDXL LCM-LoRA's for super fast inference, quality comparisons, benchmarks, and more.
--- a/docs/source/en/training/lora.md
+++ b/docs/source/en/training/lora.md
@@ -179,7 +179,7 @@ accelerate launch --mixed_precision="fp16"  train_text_to_image_lora.py \
  --pretrained_model_name_or_path=$MODEL_NAME \
  --dataset_name=$DATASET_NAME \
  --dataloader_num_workers=8 \
-  --resolution=512 
+  --resolution=512 \
  --center_crop \
  --random_flip \
  --train_batch_size=1 \
@@ -214,4 +214,4 @@ image = pipeline("A pokemon with blue eyes").images[0]
 Congratulations on training a new model with LoRA! To learn more about how to use your new model, the following guides may be helpful:

 - Learn how to [load different LoRA formats](../using-diffusers/loading_adapters#LoRA) trained using community trainers like Kohya and TheLastBen.
- Learn how to use and [combine multiple LoRA's](../tutorials/using_peft_for_inference) with PEFT for inference.
+- Learn how to use and [combine multiple LoRA's](../tutorials/using_peft_for_inference) with PEFT for inference.
--- a/docs/source/en/training/t2i_adapters.md
+++ b/docs/source/en/training/t2i_adapters.md
@@ -12,7 +12,7 @@ specific language governing permissions and limitations under the License.

 # T2I-Adapter

-[T2I-Adapter]((https://hf.co/papers/2302.08453)) is a lightweight adapter model that provides an additional conditioning input image (line art, canny, sketch, depth, pose) to better control image generation. It is similar to a ControlNet, but it is a lot smaller (~77M parameters and ~300MB file size) because its only inserts weights into the UNet instead of copying and training it.
+[T2I-Adapter](https://hf.co/papers/2302.08453) is a lightweight adapter model that provides an additional conditioning input image (line art, canny, sketch, depth, pose) to better control image generation. It is similar to a ControlNet, but it is a lot smaller (~77M parameters and ~300MB file size) because its only inserts weights into the UNet instead of copying and training it.

 The T2I-Adapter is only available for training with the Stable Diffusion XL (SDXL) model.

@@ -224,4 +224,4 @@ image.save("./output.png")

 Congratulations on training a T2I-Adapter model! 🎉 To learn more:

- Read the [Efficient Controllable Generation for SDXL with T2I-Adapters](https://www.cs.cmu.edu/~custom-diffusion/) blog post to learn more details about the experimental results from the T2I-Adapter team.
+- Read the [Efficient Controllable Generation for SDXL with T2I-Adapters](https://huggingface.co/blog/t2i-sdxl-adapters) blog post to learn more details about the experimental results from the T2I-Adapter team.
--- a/docs/source/en/training/unconditional_training.md
+++ b/docs/source/en/training/unconditional_training.md
@@ -186,7 +186,7 @@ accelerate launch train_unconditional.py \
 If you're training with more than one GPU, add the `--multi_gpu` parameter to the training command:

 ```bash
-accelerate launch --mixed_precision="fp16" --multi_gpu train_unconditional.py \
+accelerate launch --multi_gpu train_unconditional.py \
  --dataset_name="huggan/flowers-102-categories" \
  --output_dir="ddpm-ema-flowers-64" \
  --mixed_precision="fp16" \
--- a/docs/source/en/tutorials/fast_diffusion.md
+++ b/docs/source/en/tutorials/fast_diffusion.md
@@ -0,0 +1,342 @@
+<!--Copyright 2023 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Accelerate inference of text-to-image diffusion models
+
+Diffusion models are known to be slower than their counter parts, GANs, because of the iterative and sequential reverse diffusion process. Recent works try to address limitation with:
+
+* progressive timestep distillation (such as [LCM LoRA](../using-diffusers/inference_with_lcm_lora.md))
+* model compression (such as [SSD-1B](https://huggingface.co/segmind/SSD-1B))
+* reusing adjacent features of the denoiser (such as [DeepCache](https://github.com/horseee/DeepCache))
+
+In this tutorial, we focus on leveraging the power of PyTorch 2 to accelerate the inference latency of text-to-image diffusion pipeline, instead. We will use [Stable Diffusion XL (SDXL)](../using-diffusers/sdxl.md) as a case study, but the techniques we will discuss should extend to other text-to-image diffusion pipelines.
+
+## Setup
+
+Make sure you're on the latest version of `diffusers`: 
+
+```bash
+pip install -U diffusers
+```
+
+Then upgrade the other required libraries too:
+
+```bash
+pip install -U transformers accelerate peft
+```
+
+To benefit from the fastest kernels, use PyTorch nightly. You can find the installation instructions [here](https://pytorch.org/). 
+
+To report the numbers shown below, we used an 80GB 400W A100 with its clock rate set to the maximum.
+
+_This tutorial doesn't present the benchmarking code and focuses on how to perform the optimizations, instead. For the full benchmarking code, refer to: [https://github.com/huggingface/diffusion-fast](https://github.com/huggingface/diffusion-fast)._
+
+## Baseline
+
+Let's start with a baseline. Disable the use of a reduced precision and [`scaled_dot_product_attention`](../optimization/torch2.0.md):
+
+```python
+from diffusers import StableDiffusionXLPipeline
+
+# Load the pipeline in full-precision and place its model components on CUDA.
+pipe = StableDiffusionXLPipeline.from_pretrained(
+    "stabilityai/stable-diffusion-xl-base-1.0"
+).to("cuda")
+
+# Run the attention ops without efficiency.
+pipe.unet.set_default_attn_processor()
+pipe.vae.set_default_attn_processor()
+
+prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
+image = pipe(prompt, num_inference_steps=30).images[0]
+```
+
+This takes 7.36 seconds:
+
+<div align="center">
+
+<img src="https://huggingface.co/datasets/sayakpaul/sample-datasets/resolve/main/progressive-acceleration-sdxl/SDXL%2C_Batch_Size%3A_1%2C_Steps%3A_30_0.png" width=500>
+
+</div>
+
+## Running inference in bfloat16
+
+Enable the first optimization: use a reduced precision to run the inference. 
+
+```python
+from diffusers import StableDiffusionXLPipeline
+import torch
+
+pipe = StableDiffusionXLPipeline.from_pretrained(
+	"stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.bfloat16
+).to("cuda")
+
+# Run the attention ops without efficiency.
+pipe.unet.set_default_attn_processor()
+pipe.vae.set_default_attn_processor()
+
+prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
+image = pipe(prompt, num_inference_steps=30).images[0]
+```
+
+bfloat16 reduces the latency from 7.36 seconds to 4.63 seconds:
+
+<div align="center">
+
+<img src="https://huggingface.co/datasets/sayakpaul/sample-datasets/resolve/main/progressive-acceleration-sdxl/SDXL%2C_Batch_Size%3A_1%2C_Steps%3A_30_1.png" width=500>
+
+</div>
+
+_(We later ran the experiments in float16 and found out that the recent versions of torchao do not incur numerical problems from float16.)_
+
+**Why bfloat16?** 
+
+* Using a reduced numerical precision (such as float16, bfloat16) to run inference doesn’t affect the generation quality but significantly improves latency. 
+* The benefits of using the bfloat16 numerical precision as compared to float16 are hardware-dependent. Modern generations of GPUs tend to favor bfloat16. 
+* Furthermore, in our experiments, we bfloat16 to be much more resilient when used with quantization in comparison to float16.  
+
+We have a [dedicated guide](../optimization/fp16.md) for running inference in a reduced precision. 
+
+## Running attention efficiently
+
+Attention blocks are intensive to run. But with PyTorch's [`scaled_dot_product_attention`](../optimization/torch2.0.md), we can run them efficiently. 
+
+```python
+from diffusers import StableDiffusionXLPipeline
+import torch
+
+pipe = StableDiffusionXLPipeline.from_pretrained(
+	"stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.bfloat16
+).to("cuda")
+
+prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
+image = pipe(prompt, num_inference_steps=30).images[0]
+```
+
+`scaled_dot_product_attention` improves the latency from 4.63 seconds to 3.31 seconds.
+
+<div align="center">
+
+<img src="https://huggingface.co/datasets/sayakpaul/sample-datasets/resolve/main/progressive-acceleration-sdxl/SDXL%2C_Batch_Size%3A_1%2C_Steps%3A_30_2.png" width=500>
+
+</div>
+
+## Use faster kernels with torch.compile
+
+Compile the UNet and the VAE to benefit from the faster kernels. First, configure a few compiler flags:
+
+```python
+from diffusers import StableDiffusionXLPipeline
+import torch
+
+torch._inductor.config.conv_1x1_as_mm = True
+torch._inductor.config.coordinate_descent_tuning = True
+torch._inductor.config.epilogue_fusion = False
+torch._inductor.config.coordinate_descent_check_all_directions = True
+```
+
+For the full list of compiler flags, refer to [this file](https://github.com/pytorch/pytorch/blob/main/torch/_inductor/config.py).
+
+It is also important to change the memory layout of the UNet and the VAE to “channels_last” when compiling them. This ensures maximum speed:
+
+```python
+pipe.unet.to(memory_format=torch.channels_last)
+pipe.vae.to(memory_format=torch.channels_last)
+```
+
+Then, compile and perform inference:
+
+```python
+# Compile the UNet and VAE.
+pipe.unet = torch.compile(pipe.unet, mode="max-autotune", fullgraph=True)
+pipe.vae.decode = torch.compile(pipe.vae.decode, mode="max-autotune", fullgraph=True)
+
+prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
+
+# First call to `pipe` will be slow, subsequent ones will be faster.
+image = pipe(prompt, num_inference_steps=30).images[0]
+```
+
+`torch.compile` offers different backends and modes. As we’re aiming for maximum inference speed, we opt for the inductor backend using the “max-autotune”. “max-autotune” uses CUDA graphs and optimizes the compilation graph specifically for latency. Specifying fullgraph to be True ensures that there are no graph breaks in the underlying model, ensuring the fullest potential of `torch.compile`. 
+
+Using SDPA attention and compiling both the UNet and VAE reduces the latency from 3.31 seconds to 2.54 seconds. 
+
+<div align="center">
+
+<img src="https://huggingface.co/datasets/sayakpaul/sample-datasets/resolve/main/progressive-acceleration-sdxl/SDXL%2C_Batch_Size%3A_1%2C_Steps%3A_30_3.png" width=500>
+
+</div>
+
+## Combine the projection matrices of attention
+
+Both the UNet and the VAE used in SDXL make use of Transformer-like blocks. A Transformer block consists of attention blocks and feed-forward blocks. 
+
+In an attention block, the input is projected into three sub-spaces using three different projection matrices – Q, K, and V. In the naive implementation, these projections are performed separately on the input. But we can horizontally combine the projection matrices into a single matrix and perform the projection in one shot. This increases the size of the matmuls of the input projections and improves the impact of quantization (to be discussed next). 
+
+Enabling this kind of computation in Diffusers just takes a single line of code:
+
+```python
+pipe.fuse_qkv_projections()
+```
+
+It provides a minor boost from 2.54 seconds to 2.52 seconds. 
+
+<div align="center">
+
+<img src="https://huggingface.co/datasets/sayakpaul/sample-datasets/resolve/main/progressive-acceleration-sdxl/SDXL%2C_Batch_Size%3A_1%2C_Steps%3A_30_4.png" width=500>
+
+</div>
+
+<Tip warning={true}>
+
+Support for `fuse_qkv_projections()` is limited and experimental. As such, it's not available for many non-SD pipelines such as [Kandinsky](../using-diffusers/kandinsky.md). You can refer to [this PR](https://github.com/huggingface/diffusers/pull/6179) to get an idea about how to support this kind of computation.
+
+</Tip>
+
+## Dynamic quantization
+
+Aapply [dynamic int8 quantization](https://pytorch.org/tutorials/recipes/recipes/dynamic_quantization.html) to both the UNet and the VAE. This is because quantization adds additional conversion overhead to the model that is hopefully made up for by faster matmuls (dynamic quantization). If the matmuls are too small, these techniques may degrade performance.
+
+<Tip>
+
+Through experimentation, we found that certain linear layers in the UNet and the VAE don’t benefit from dynamic int8 quantization. You can check out the full code for filtering those layers [here](https://github.com/huggingface/diffusion-fast/blob/0f169640b1db106fe6a479f78c1ed3bfaeba3386/utils/pipeline_utils.py#L16) (referred to as `dynamic_quant_filter_fn` below). 
+
+</Tip>
+
+You will leverage the ultra-lightweight pure PyTorch library [torchao](https://github.com/pytorch-labs/ao) to use its user-friendly APIs for quantization. 
+
+First, configure all the compiler tags:
+
+```python
+from diffusers import StableDiffusionXLPipeline
+import torch 
+
+# Notice the two new flags at the end.
+torch._inductor.config.conv_1x1_as_mm = True
+torch._inductor.config.coordinate_descent_tuning = True
+torch._inductor.config.epilogue_fusion = False
+torch._inductor.config.coordinate_descent_check_all_directions = True
+torch._inductor.config.force_fuse_int_mm_with_mul = True
+torch._inductor.config.use_mixed_mm = True
+```
+
+Define the filtering functions:
+
+```python
+def dynamic_quant_filter_fn(mod, *args):
+    return (
+        isinstance(mod, torch.nn.Linear)
+        and mod.in_features > 16
+        and (mod.in_features, mod.out_features)
+        not in [
+            (1280, 640),
+            (1920, 1280),
+            (1920, 640),
+            (2048, 1280),
+            (2048, 2560),
+            (2560, 1280),
+            (256, 128),
+            (2816, 1280),
+            (320, 640),
+            (512, 1536),
+            (512, 256),
+            (512, 512),
+            (640, 1280),
+            (640, 1920),
+            (640, 320),
+            (640, 5120),
+            (640, 640),
+            (960, 320),
+            (960, 640),
+        ]
+    )
+
+
+def conv_filter_fn(mod, *args):
+    return (
+        isinstance(mod, torch.nn.Conv2d) and mod.kernel_size == (1, 1) and 128 in [mod.in_channels, mod.out_channels]
+    )
+```
+
+Then apply all the optimizations discussed so far:
+
+```python
+# SDPA + bfloat16.
+pipe = StableDiffusionXLPipeline.from_pretrained(
+	"stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.bfloat16
+).to("cuda")
+
+# Combine attention projection matrices.
+pipe.fuse_qkv_projections()
+
+# Change the memory layout.
+pipe.unet.to(memory_format=torch.channels_last)
+pipe.vae.to(memory_format=torch.channels_last)
+```
+
+Since this quantization support is limited to linear layers only, we also turn suitable pointwise convolution layers into linear layers to maximize the benefit.
+
+```python
+from torchao import swap_conv2d_1x1_to_linear
+
+swap_conv2d_1x1_to_linear(pipe.unet, conv_filter_fn)
+swap_conv2d_1x1_to_linear(pipe.vae, conv_filter_fn)
+```
+
+Apply dynamic quantization:
+
+```python
+from torchao import apply_dynamic_quant
+
+apply_dynamic_quant(pipe.unet, dynamic_quant_filter_fn)
+apply_dynamic_quant(pipe.vae, dynamic_quant_filter_fn)
+```
+
+Finally, compile and perform inference:
+
+```python
+pipe.unet = torch.compile(pipe.unet, mode="max-autotune", fullgraph=True)
+pipe.vae.decode = torch.compile(pipe.vae.decode, mode="max-autotune", fullgraph=True)
+
+prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
+image = pipe(prompt, num_inference_steps=30).images[0]
+```
+
+Applying dynamic quantization improves the latency from 2.52 seconds to 2.43 seconds.
+
+<div align="center">
+
+<img src="https://huggingface.co/datasets/sayakpaul/sample-datasets/resolve/main/progressive-acceleration-sdxl/SDXL%2C_Batch_Size%3A_1%2C_Steps%3A_30_5.png" width=500>
+
+</div>
+
+## Misc
+
+### No graph breaks during torch.compile
+
+Ensuring that the underlying model/method can be fully compiled is crucial for performance (torch.compile with fullgraph=True). This means having no graph breaks. We did this for the UNet and VAE by changing how we access the returning variables. Consider the following example: 
+
+```diff
+- latents = unet(
+-	latents, timestep=timestep, encoder_hidden_states=prompt_embeds
+-).sample
+
+ latents = unet(
+	latents, timestep=timestep, encoder_hidden_states=prompt_embeds, return_dict=False
+)[0]
+```
+
+### Getting rid of GPU syncs after compilation
+
+During the iterative reverse diffusion process, we [call](https://github.com/huggingface/diffusers/blob/1d686bac8146037e97f3fd8c56e4063230f71751/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py#L1228) `step()` on the scheduler each time after the denoiser predicts the less noisy latent embeddings. Inside `step()`, the `sigmas` variable is [indexed](https://github.com/huggingface/diffusers/blob/1d686bac8146037e97f3fd8c56e4063230f71751/src/diffusers/schedulers/scheduling_euler_discrete.py#L476). If the `sigmas` array is placed on the GPU, indexing causes a communication sync between the CPU and GPU. This causes a latency, and it becomes more evident when the denoiser has already been compiled. 
+
+But if the `sigmas` array always stays on the CPU (refer to [this line](https://github.com/huggingface/diffusers/blob/35a969d297cba69110d175ee79c59312b9f49e1e/src/diffusers/schedulers/scheduling_euler_discrete.py#L240)), this sync doesn’t take place, hence improved latency. In general, any CPU <-> GPU communication sync should be none or be kept to a bare minimum as it can impact inference latency. 
--- a/docs/source/en/tutorials/using_peft_for_inference.md
+++ b/docs/source/en/tutorials/using_peft_for_inference.md
@@ -183,3 +183,26 @@ image = pipe(prompt, num_inference_steps=30, generator=torch.manual_seed(0)).ima
 # Gets the Unet back to the original state
 pipe.unfuse_lora()
 ```
+
+You can also fuse some adapters using `adapter_names` for faster generation:
+
+```py
+pipe.load_lora_weights("nerijs/pixel-art-xl", weight_name="pixel-art-xl.safetensors", adapter_name="pixel")
+pipe.load_lora_weights("CiroN2022/toy-face", weight_name="toy_face_sdxl.safetensors", adapter_name="toy")
+
+pipe.set_adapters(["pixel"], adapter_weights=[0.5, 1.0])
+# Fuses the LoRAs into the Unet
+pipe.fuse_lora(adapter_names=["pixel"])
+
+prompt = "a hacker with a hoodie, pixel art"
+image = pipe(prompt, num_inference_steps=30, generator=torch.manual_seed(0)).images[0]
+
+# Gets the Unet back to the original state
+pipe.unfuse_lora()
+
+# Fuse all adapters
+pipe.fuse_lora(adapter_names=["pixel", "toy"])
+
+prompt = "toy_face of a hacker with a hoodie, pixel art"
+image = pipe(prompt, num_inference_steps=30, generator=torch.manual_seed(0)).images[0]
+```
--- a/docs/source/en/using-diffusers/callback.md
+++ b/docs/source/en/using-diffusers/callback.md
@@ -63,3 +63,42 @@ With callbacks, you can implement features such as dynamic CFG without having to
 🤗 Diffusers currently only supports `callback_on_step_end`, but feel free to open a [feature request](https://github.com/huggingface/diffusers/issues/new/choose) if you have a cool use-case and require a callback function with a different execution point!

 </Tip>
+
+
+## Using Callbacks to interrupt the Diffusion Process
+
+The following Pipelines support interrupting the diffusion process via callback
+
+- [StableDiffusionPipeline](../api/pipelines/stable_diffusion/overview.md)
+- [StableDiffusionImg2ImgPipeline](..api/pipelines/stable_diffusion/img2img.md)
+- [StableDiffusionInpaintPipeline](..api/pipelines/stable_diffusion/inpaint.md)
+- [StableDiffusionXLPipeline](../api/pipelines/stable_diffusion/stable_diffusion_xl.md)
+- [StableDiffusionXLImg2ImgPipeline](../api/pipelines/stable_diffusion/stable_diffusion_xl.md)
+- [StableDiffusionXLInpaintPipeline](../api/pipelines/stable_diffusion/stable_diffusion_xl.md)
+
+Interrupting the diffusion process is particularly useful when building UIs that work with Diffusers because it allows users to stop the generation process if they're unhappy with the intermediate results. You can incorporate this into your pipeline with a callback.
+
+This callback function should take the following arguments: `pipe`, `i`, `t`, and `callback_kwargs` (this must be returned). Set the pipeline's `_interrupt` attribute to `True` to stop the diffusion process after a certain number of steps. You are also free to implement your own custom stopping logic inside the callback.
+
+In this example, the diffusion process is stopped after 10 steps even though `num_inference_steps` is set to 50.
+
+```python
+from diffusers import StableDiffusionPipeline
+
+pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
+pipe.enable_model_cpu_offload()
+num_inference_steps = 50
+
+def interrupt_callback(pipe, i, t, callback_kwargs):
+    stop_idx = 10
+    if i == stop_idx:
+        pipe._interrupt = True
+
+    return callback_kwargs
+
+pipe(
+    "A photo of a cat",
+    num_inference_steps=num_inference_steps,
+    callback_on_step_end=interrupt_callback,
+)
+```
--- a/docs/source/en/using-diffusers/controlnet.md
+++ b/docs/source/en/using-diffusers/controlnet.md
@@ -203,7 +203,7 @@ def make_inpaint_condition(image, image_mask):
    image_mask = np.array(image_mask.convert("L")).astype(np.float32) / 255.0

    assert image.shape[0:1] == image_mask.shape[0:1]
-    image[image_mask > 0.5] = 1.0  # set as masked pixel
+    image[image_mask > 0.5] = -1.0  # set as masked pixel
    image = np.expand_dims(image, 0).transpose(0, 3, 1, 2)
    image = torch.from_numpy(image)
    return image
--- a/docs/source/en/using-diffusers/kandinsky.md
+++ b/docs/source/en/using-diffusers/kandinsky.md
@@ -20,6 +20,8 @@ The Kandinsky models are a series of multilingual text-to-image generation model

 [Kandinsky 2.2](../api/pipelines/kandinsky_v22) improves on the previous model by replacing the image encoder of the image prior model with a larger CLIP-ViT-G model to improve quality. The image prior model was also retrained on images with different resolutions and aspect ratios to generate higher-resolution images and different image sizes.

+[Kandinsky 3](../api/pipelines/kandinsky3) simplifies the architecture and shifts away from the two-stage generation process involving the prior model and diffusion model. Instead, Kandinsky 3 uses [Flan-UL2](https://huggingface.co/google/flan-ul2) to encode text, a UNet with [BigGan-deep](https://hf.co/papers/1809.11096) blocks, and [Sber-MoVQGAN](https://github.com/ai-forever/MoVQGAN) to decode the latents into images. Text understanding and generated image quality are primarily achieved by using a larger text encoder and UNet.
+
 This guide will show you how to use the Kandinsky models for text-to-image, image-to-image, inpainting, interpolation, and more.

 Before you begin, make sure you have the following libraries installed:
@@ -33,6 +35,10 @@ Before you begin, make sure you have the following libraries installed:

 Kandinsky 2.1 and 2.2 usage is very similar! The only difference is Kandinsky 2.2 doesn't accept `prompt` as an input when decoding the latents. Instead, Kandinsky 2.2 only accepts `image_embeds` during decoding.

+<br>
+
+Kandinsky 3 has a more concise architecture and it doesn't require a prior model. This means it's usage is identical to other diffusion models like [Stable Diffusion XL](sdxl).
+
 </Tip>

 ## Text-to-image
@@ -91,6 +97,23 @@ image
    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/kandinsky-text-to-image.png"/>
 </div>

+</hfoption>
+<hfoption id="Kandinsky 3">
+
+Kandinsky 3 doesn't require a prior model so you can directly load the [`Kandinsky3Pipeline`] and pass a prompt to generate an image:
+
+```py
+from diffusers import Kandinsky3Pipeline
+import torch
+
+pipeline = Kandinsky3Pipeline.from_pretrained("kandinsky-community/kandinsky-3", variant="fp16", torch_dtype=torch.float16)
+pipeline.enable_model_cpu_offload()
+
+prompt = "A alien cheeseburger creature eating itself, claymation, cinematic, moody lighting"
+image = pipeline(prompt).images[0]
+image
+```
+
 </hfoption>
 </hfoptions>

@@ -161,6 +184,20 @@ prior_pipeline = KandinskyPriorPipeline.from_pretrained("kandinsky-community/kan
 pipeline = KandinskyV22Img2ImgPipeline.from_pretrained("kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16, use_safetensors=True).to("cuda")
 ```

+</hfoption>
+<hfoption id="Kandinsky 3">
+
+Kandinsky 3 doesn't require a prior model so you can directly load the image-to-image pipeline:
+
+```py
+from diffusers import Kandinsky3Img2ImgPipeline
+from diffusers.utils import load_image
+import torch
+
+pipeline = Kandinsky3Img2ImgPipeline.from_pretrained("kandinsky-community/kandinsky-3", variant="fp16", torch_dtype=torch.float16)
+pipeline.enable_model_cpu_offload()
+```
+
 </hfoption>
 </hfoptions>

@@ -218,6 +255,14 @@ make_image_grid([original_image.resize((512, 512)), image.resize((512, 512))], r
    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/kandinsky-image-to-image.png"/>
 </div>

+</hfoption>
+<hfoption id="Kandinsky 3">
+
+```py
+image = pipeline(prompt, negative_prompt=negative_prompt, image=image, strength=0.75, num_inference_steps=25).images[0]
+image
+```
+
 </hfoption>
 </hfoptions>

--- a/docs/source/en/using-diffusers/loading_adapters.md
+++ b/docs/source/en/using-diffusers/loading_adapters.md
@@ -307,3 +307,394 @@ prompt = "a house by william eggleston, sunrays, beautiful, sunlight, sunrays, b
 image = pipeline(prompt=prompt).images[0]
 image
 ```
+
+## IP-Adapter 
+
+[IP-Adapter](https://ip-adapter.github.io/) is an effective and lightweight adapter that adds image prompting capabilities to a diffusion model. This adapter works by decoupling the cross-attention layers of the image and text features. All the other model components are frozen and only the embedded image features in the UNet are trained. As a result, IP-Adapter files are typically only ~100MBs.
+
+IP-Adapter works with most of our pipelines, including Stable Diffusion, Stable Diffusion XL (SDXL), ControlNet, T2I-Adapter, AnimateDiff.  And you can use any custom models finetuned from the same base models. It also works with LCM-Lora out of box.
+
+
+<Tip>
+
+You can find official IP-Adapter checkpoints in [h94/IP-Adapter](https://huggingface.co/h94/IP-Adapter).
+
+IP-Adapter was contributed by [okotaku](https://github.com/okotaku).
+
+</Tip>
+
+Let's first create a Stable Diffusion Pipeline.
+
+```py
+from diffusers import AutoPipelineForText2Image
+import torch
+from diffusers.utils import load_image
+
+
+pipeline = AutoPipelineForText2Image.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16).to("cuda")
+```
+
+Now load the [h94/IP-Adapter](https://huggingface.co/h94/IP-Adapter) weights with the [`~loaders.IPAdapterMixin.load_ip_adapter`] method. 
+
+```py
+pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin")
+```
+
+<Tip>
+IP-Adapter relies on an image encoder to generate the image features, if your IP-Adapter weights folder contains a "image_encoder" subfolder, the image encoder will be automatically loaded and registered to the pipeline. Otherwise you can so load a [`~transformers.CLIPVisionModelWithProjection`] model and  pass it to a Stable Diffusion pipeline when you create it.
+
+```py
+from diffusers import AutoPipelineForText2Image, CLIPVisionModelWithProjection
+import torch
+
+image_encoder = CLIPVisionModelWithProjection.from_pretrained(
+    "h94/IP-Adapter", 
+    subfolder="models/image_encoder",
+    torch_dtype=torch.float16,
+).to("cuda")
+
+pipeline = AutoPipelineForText2Image.from_pretrained("runwayml/stable-diffusion-v1-5", image_encoder=image_encoder, torch_dtype=torch.float16).to("cuda")
+```
+</Tip>
+
+IP-Adapter allows you to use both image and text to condition the image generation process. For example, let's use the bear image from the [Textual Inversion](#textual-inversion) section as the image prompt (`ip_adapter_image`) along with a text prompt to add "sunglasses". 😎
+
+```py
+pipeline.set_ip_adapter_scale(0.6)
+image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/load_neg_embed.png")
+generator = torch.Generator(device="cpu").manual_seed(33)
+images = pipeline(
+    prompt='best quality, high quality, wearing sunglasses', 
+    ip_adapter_image=image,
+    negative_prompt="monochrome, lowres, bad anatomy, worst quality, low quality", 
+    num_inference_steps=50,
+    generator=generator,
+).images
+images[0]
+```
+
+<div class="flex justify-center">
+    <img src="https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/ip-bear.png" />
+</div>
+
+<Tip>
+
+You can use the [`~loaders.IPAdapterMixin.set_ip_adapter_scale`] method to adjust the text prompt and image prompt condition ratio.  If you're only using the image prompt, you should set the scale to `1.0`. You can lower the scale to get more generation diversity, but it'll be less aligned with the prompt.
+`scale=0.5` can achieve good results in most cases when you use both text and image prompts.
+</Tip>
+
+IP-Adapter also works great with Image-to-Image and Inpainting pipelines. See below examples of how you can use it with Image-to-Image and Inpaint.
+
+<hfoptions id="tasks">
+<hfoption id="image-to-image">
+
+```py
+from diffusers import AutoPipelineForImage2Image
+import torch
+from diffusers.utils import load_image
+
+pipeline = AutoPipelineForImage2Image.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16).to("cuda")
+
+image = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/vermeer.jpg")
+ip_image = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/river.png")
+
+pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin")
+generator = torch.Generator(device="cpu").manual_seed(33)
+images = pipeline(
+    prompt='best quality, high quality', 
+    image = image,
+    ip_adapter_image=ip_image,
+    num_inference_steps=50,
+    generator=generator,
+    strength=0.6,
+).images
+images[0]
+```
+
+</hfoption>
+<hfoption id="inpaint">
+
+```py
+from diffusers import AutoPipelineForInpaint
+import torch
+from diffusers.utils import load_image
+
+pipeline = AutoPipelineForInpaint.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float).to("cuda")
+
+image = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/inpaint_image.png")
+mask = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/mask.png")
+ip_image = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/girl.png")
+
+image = image.resize((512, 768))
+mask = mask.resize((512, 768))
+
+pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin")
+
+generator = torch.Generator(device="cpu").manual_seed(33)
+images = pipeline(
+    prompt='best quality, high quality', 
+    image = image,
+    mask_image = mask,
+    ip_adapter_image=ip_image,
+    negative_prompt="monochrome, lowres, bad anatomy, worst quality, low quality", 
+    num_inference_steps=50,
+    generator=generator,
+    strength=0.5,
+).images
+images[0]
+```
+</hfoption>
+</hfoptions>
+
+
+IP-Adapters can also be used with [SDXL](../api/pipelines/stable_diffusion/stable_diffusion_xl.md)
+
+```python
+from diffusers import AutoPipelineForText2Image
+from diffusers.utils import load_image
+import torch
+
+pipeline = AutoPipelineForText2Image.from_pretrained(
+    "stabilityai/stable-diffusion-xl-base-1.0",
+    torch_dtype=torch.float16
+).to("cuda")
+
+image = load_image("https://huggingface.co/datasets/sayakpaul/sample-datasets/resolve/main/watercolor_painting.jpeg")
+
+pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="sdxl_models", weight_name="ip-adapter_sdxl.bin")
+
+generator = torch.Generator(device="cpu").manual_seed(33)
+image = pipeline(
+    prompt="best quality, high quality", 
+    ip_adapter_image=image,
+    negative_prompt="monochrome, lowres, bad anatomy, worst quality, low quality", 
+    num_inference_steps=25,
+    generator=generator,
+).images[0]
+image.save("sdxl_t2i.png")
+```
+
+<div class="flex flex-row gap-4">
+  <div class="flex-1">
+    <img class="rounded-xl" src="https://huggingface.co/datasets/sayakpaul/sample-datasets/resolve/main/watercolor_painting.jpeg"/>
+    <figcaption class="mt-2 text-center text-sm text-gray-500">input image</figcaption>
+  </div>
+  <div class="flex-1">
+    <img class="rounded-xl" src="https://huggingface.co/datasets/sayakpaul/sample-datasets/resolve/main/sdxl_t2i.png"/>
+    <figcaption class="mt-2 text-center text-sm text-gray-500">adapted image</figcaption>
+  </div>
+</div>
+
+You can use the IP-Adapter face model to apply specific faces to your images.  It is an effective way to maintain consistent characters in your image generations.
+Weights are loaded with the same method used for the other IP-Adapters.  
+
+```python
+# Load ip-adapter-full-face_sd15.bin
+pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter-full-face_sd15.bin")
+```
+
+<Tip>
+
+It is recommended to use `DDIMScheduler` and `EulerDiscreteScheduler` for face model. 
+
+
+</Tip>
+
+```python
+import torch
+from diffusers import StableDiffusionPipeline, DDIMScheduler
+from diffusers.utils import load_image
+
+noise_scheduler = DDIMScheduler(
+    num_train_timesteps=1000,
+    beta_start=0.00085,
+    beta_end=0.012,
+    beta_schedule="scaled_linear",
+    clip_sample=False,
+    set_alpha_to_one=False,
+    steps_offset=1
+)
+
+pipeline = StableDiffusionPipeline.from_pretrained(
+    "runwayml/stable-diffusion-v1-5",
+    torch_dtype=torch.float16,
+    scheduler=noise_scheduler,
+).to("cuda")
+
+pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter-full-face_sd15.bin")
+
+pipeline.set_ip_adapter_scale(0.7)
+
+image = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/ai_face2.png")
+
+generator = torch.Generator(device="cpu").manual_seed(33)
+
+image = pipeline(
+    prompt="A photo of a girl wearing a black dress, holding red roses in hand, upper body, behind is the Eiffel Tower",
+    ip_adapter_image=image,
+    negative_prompt="monochrome, lowres, bad anatomy, worst quality, low quality", 
+    num_inference_steps=50, num_images_per_prompt=1, width=512, height=704,
+    generator=generator,
+).images[0]
+```
+
+<div class="flex flex-row gap-4">
+  <div class="flex-1">
+    <img class="rounded-xl" src="https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/ai_face2.png"/>
+    <figcaption class="mt-2 text-center text-sm text-gray-500">input image</figcaption>
+  </div>
+  <div class="flex-1">
+    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ipadapter_full_face_output.png"/>
+    <figcaption class="mt-2 text-center text-sm text-gray-500">output image</figcaption>
+  </div>
+</div>
+
+### LCM-Lora
+
+You can use IP-Adapter with LCM-Lora to achieve "instant fine-tune" with custom images. Note that you need to load IP-Adapter weights before loading the LCM-Lora weights.
+
+```py
+from diffusers import DiffusionPipeline, LCMScheduler
+import torch
+from diffusers.utils import load_image
+
+model_id =  "sd-dreambooth-library/herge-style"
+lcm_lora_id = "latent-consistency/lcm-lora-sdv1-5"
+
+pipe = DiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16)
+
+pipe.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin")
+pipe.load_lora_weights(lcm_lora_id)
+pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)
+pipe.enable_model_cpu_offload()
+
+prompt = "best quality, high quality"
+image = load_image("https://user-images.githubusercontent.com/24734142/266492875-2d50d223-8475-44f0-a7c6-08b51cb53572.png")
+images = pipe(
+    prompt=prompt,
+    ip_adapter_image=image,
+    num_inference_steps=4,
+    guidance_scale=1,
+).images[0]
+```
+
+### Other pipelines
+
+IP-Adapter is compatible with any pipeline that (1) uses a text prompt and (2) uses Stable Diffusion or Stable Diffusion XL checkpoint. To use IP-Adapter with a different pipeline, all you need to do is to run `load_ip_adapter()` method after you create the pipeline, and then pass your image to the pipeline as `ip_adapter_image`
+
+<Tip>
+
+🤗 Diffusers currently only supports using IP-Adapter with some of the most popular pipelines, feel free to open a [feature request](https://github.com/huggingface/diffusers/issues/new/choose) if you have a cool use-case and require integrating IP-adapters with a pipeline that does not support it yet!
+
+</Tip>
+
+You can find below examples on how to use IP-Adapter with ControlNet and AnimateDiff. 
+
+<hfoptions id="model">
+<hfoption id="ControlNet">
+
+```
+from diffusers import StableDiffusionControlNetPipeline, ControlNetModel
+import torch
+from diffusers.utils import load_image
+
+controlnet_model_path = "lllyasviel/control_v11f1p_sd15_depth"
+controlnet = ControlNetModel.from_pretrained(controlnet_model_path, torch_dtype=torch.float16)
+
+pipeline = StableDiffusionControlNetPipeline.from_pretrained(
+    "runwayml/stable-diffusion-v1-5", controlnet=controlnet, torch_dtype=torch.float16)
+pipeline.to("cuda")
+
+image = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/statue.png")
+depth_map = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/depth.png")
+
+pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin")
+
+generator = torch.Generator(device="cpu").manual_seed(33)
+images = pipeline(
+    prompt='best quality, high quality', 
+    image=depth_map,
+    ip_adapter_image=image,
+    negative_prompt="monochrome, lowres, bad anatomy, worst quality, low quality", 
+    num_inference_steps=50,
+    generator=generator,
+).images
+images[0]
+```
+<div class="flex flex-row gap-4">
+  <div class="flex-1">
+    <img class="rounded-xl" src="https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/statue.png"/>
+    <figcaption class="mt-2 text-center text-sm text-gray-500">input image</figcaption>
+  </div>
+  <div class="flex-1">
+    <img class="rounded-xl" src="https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/ipa-controlnet-out.png"/>
+    <figcaption class="mt-2 text-center text-sm text-gray-500">adapted image</figcaption>
+  </div>
+</div>
+
+</hfoption>
+<hfoption id="AnimateDiff">
+
+```py
+# animate diff + ip adapter
+import torch
+from diffusers import MotionAdapter, AnimateDiffPipeline, DDIMScheduler
+from diffusers.utils import export_to_gif, load_image
+
+# Load the motion adapter
+adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2", torch_dtype=torch.float16)
+# load SD 1.5 based finetuned model
+model_id = "Lykon/DreamShaper"
+pipe = AnimateDiffPipeline.from_pretrained(model_id, motion_adapter=adapter, torch_dtype=torch.float16)
+
+# scheduler
+scheduler = DDIMScheduler(
+    clip_sample=False,
+    beta_start=0.00085,
+    beta_end=0.012,
+    beta_schedule="linear",
+    timestep_spacing="trailing",
+    steps_offset=1
+)
+pipe.scheduler = scheduler
+
+# enable memory savings
+pipe.enable_vae_slicing()
+pipe.enable_model_cpu_offload()
+
+# load ip_adapter
+pipe.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin")
+
+# load motion adapters
+pipe.load_lora_weights("guoyww/animatediff-motion-lora-zoom-out", adapter_name="zoom-out")
+pipe.load_lora_weights("guoyww/animatediff-motion-lora-tilt-up", adapter_name="tilt-up")
+pipe.load_lora_weights("guoyww/animatediff-motion-lora-pan-left", adapter_name="pan-left")
+
+seed = 42
+image = load_image("https://user-images.githubusercontent.com/24734142/266492875-2d50d223-8475-44f0-a7c6-08b51cb53572.png")
+images = [image] * 3
+prompts = ["best quality, high quality"] * 3
+negative_prompt = "bad quality, worst quality"
+adapter_weights = [[0.75, 0.0, 0.0], [0.0, 0.0, 0.75], [0.0, 0.75, 0.75]]
+
+# generate
+output_frames = []
+for prompt, image, adapter_weight in zip(prompts, images, adapter_weights):
+    pipe.set_adapters(["zoom-out", "tilt-up", "pan-left"], adapter_weights=adapter_weight)
+    output = pipe(
+      prompt= prompt,
+      num_frames=16,
+      guidance_scale=7.5,
+      num_inference_steps=30,
+      ip_adapter_image = image,
+      generator=torch.Generator("cpu").manual_seed(seed),
+    )
+    frames = output.frames[0]
+    output_frames.extend(frames)
+
+export_to_gif(output_frames, "test_out_animation.gif") 
+```
+
+</hfoption>
+</hfoptions>
+
--- a/docs/source/en/using-diffusers/push_to_hub.md
+++ b/docs/source/en/using-diffusers/push_to_hub.md
@@ -174,10 +174,4 @@ Set `private=True` in the [`~diffusers.utils.PushToHubMixin.push_to_hub`] functi
 controlnet.push_to_hub("my-controlnet-model-private", private=True)
 ```

-Private repositories are only visible to you, and other users won't be able to clone the repository and your repository won't appear in search results. Even if a user has the URL to your private repository, they'll receive a `404 - Sorry, we can't find the page you are looking for.`
-
-To load a model, scheduler, or pipeline from private or gated repositories, set `use_auth_token=True`:
-
-```py
-model = ControlNetModel.from_pretrained("your-namespace/my-controlnet-model-private", use_auth_token=True)
-```
+Private repositories are only visible to you, and other users won't be able to clone the repository and your repository won't appear in search results. Even if a user has the URL to your private repository, they'll receive a `404 - Sorry, we can't find the page you are looking for`. You must be [logged in](https://huggingface.co/docs/huggingface_hub/quick-start#login) to load a model from a private repository.
--- a/docs/source/en/using-diffusers/reusing_seeds.md
+++ b/docs/source/en/using-diffusers/reusing_seeds.md
@@ -41,6 +41,20 @@ Now, define four different `Generator`s and assign each `Generator` a seed (`0`
 generator = [torch.Generator(device="cuda").manual_seed(i) for i in range(4)]
 ```

+<Tip warning={true}>
+
+To create a batched seed, you should use a list comprehension that iterates over the length specified in `range()`. This creates a unique `Generator` object for each image in the batch. If you only multiply the `Generator` by the batch size, this only creates one `Generator` object that is used sequentially for each image in the batch.
+
+For example, if you want to use the same seed to create 4 identical images:
+
+```py
+❌ [torch.Generator().manual_seed(seed)] * 4
+
+✅ [torch.Generator().manual_seed(seed) for _ in range(4)]
+```
+
+</Tip>
+
 Generate the images and have a look:

 ```python
--- a/docs/source/en/using-diffusers/sdxl_turbo.md
+++ b/docs/source/en/using-diffusers/sdxl_turbo.md
@@ -0,0 +1,116 @@
+<!--Copyright 2023 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Stable Diffusion XL Turbo
+
+[[open-in-colab]]
+
+SDXL Turbo is an adversarial time-distilled [Stable Diffusion XL](https://huggingface.co/papers/2307.01952) (SDXL) model capable
+of running inference in as little as 1 step.
+
+This guide will show you how to use SDXL-Turbo for text-to-image and image-to-image.
+
+Before you begin, make sure you have the following libraries installed:
+
+```py
+# uncomment to install the necessary libraries in Colab
+#!pip install -q diffusers transformers accelerate omegaconf
+```
+
+## Load model checkpoints
+
+Model weights may be stored in separate subfolders on the Hub or locally, in which case, you should use the [`~StableDiffusionXLPipeline.from_pretrained`] method:
+
+```py
+from diffusers import AutoPipelineForText2Image, AutoPipelineForImage2Image
+import torch
+
+pipeline = AutoPipelineForText2Image.from_pretrained("stabilityai/sdxl-turbo", torch_dtype=torch.float16, variant="fp16")
+pipeline = pipeline.to("cuda")
+```
+
+You can also use the [`~StableDiffusionXLPipeline.from_single_file`] method to load a model checkpoint stored in a single file format (`.ckpt` or `.safetensors`) from the Hub or locally:
+
+```py
+from diffusers import StableDiffusionXLPipeline
+import torch
+
+pipeline = StableDiffusionXLPipeline.from_single_file(
+    "https://huggingface.co/stabilityai/sdxl-turbo/blob/main/sd_xl_turbo_1.0_fp16.safetensors", torch_dtype=torch.float16)
+pipeline = pipeline.to("cuda")
+```
+
+## Text-to-image
+
+For text-to-image, pass a text prompt. By default, SDXL Turbo generates a 512x512 image, and that resolution gives the best results. You can try setting the `height` and `width` parameters to 768x768 or 1024x1024, but you should expect quality degradations when doing so.
+
+Make sure to set `guidance_scale` to 0.0 to disable, as the model was trained without it. A single inference step is enough to generate high quality images. 
+Increasing the number of steps to 2, 3 or 4 should improve image quality.
+
+```py
+from diffusers import AutoPipelineForText2Image
+import torch
+
+pipeline_text2image = AutoPipelineForText2Image.from_pretrained("stabilityai/sdxl-turbo", torch_dtype=torch.float16, variant="fp16")
+pipeline_text2image = pipeline_text2image.to("cuda")
+
+prompt = "A cinematic shot of a baby racoon wearing an intricate italian priest robe."
+
+image = pipeline_text2image(prompt=prompt, guidance_scale=0.0, num_inference_steps=1).images[0]
+image
+```
+
+<div class="flex justify-center">
+    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/sdxl-turbo-text2img.png" alt="generated image of a racoon in a robe"/>
+</div>
+
+## Image-to-image
+
+For image-to-image generation, make sure that `num_inference_steps * strength` is larger or equal to 1. 
+The image-to-image pipeline will run for `int(num_inference_steps * strength)` steps, e.g. `0.5 * 2.0 = 1` step in
+our example below.
+
+```py
+from diffusers import AutoPipelineForImage2Image
+from diffusers.utils import load_image, make_image_grid
+
+# use from_pipe to avoid consuming additional memory when loading a checkpoint
+pipeline = AutoPipelineForImage2Image.from_pipe(pipeline_text2image).to("cuda")
+
+init_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cat.png")
+init_image = init_image.resize((512, 512))
+
+prompt = "cat wizard, gandalf, lord of the rings, detailed, fantasy, cute, adorable, Pixar, Disney, 8k"
+
+image = pipeline(prompt, image=init_image, strength=0.5, guidance_scale=0.0, num_inference_steps=2).images[0]
+make_image_grid([init_image, image], rows=1, cols=2)
+```
+
+<div class="flex justify-center">
+    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/sdxl-turbo-img2img.png" alt="Image-to-image generation sample using SDXL Turbo"/>
+</div>
+
+## Speed-up SDXL Turbo even more
+
+- Compile the UNet if you are using PyTorch version 2 or better. The first inference run will be very slow, but subsequent ones will be much faster.
+
+```py
+pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
+```
+
+- When using the default VAE, keep it in `float32` to avoid costly `dtype` conversions before and after each generation. You only need to do this one before your first generation:
+
+```py
+pipe.upcast_vae()
+```
+
+As an alternative, you can also use a [16-bit VAE](https://huggingface.co/madebyollin/sdxl-vae-fp16-fix) created by community member [`@madebyollin`](https://huggingface.co/madebyollin) that does not need to be upcasted to `float32`.
--- a/docs/source/en/using-diffusers/svd.md
+++ b/docs/source/en/using-diffusers/svd.md
@@ -0,0 +1,137 @@
+<!--Copyright 2023 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Stable Video Diffusion
+
+[[open-in-colab]]
+
+[Stable Video Diffusion](https://static1.squarespace.com/static/6213c340453c3f502425776e/t/655ce779b9d47d342a93c890/1700587395994/stable_video_diffusion.pdf) is a powerful image-to-video generation model that can generate high resolution (576x1024) 2-4 second videos conditioned on the input image.
+
+This guide will show you how to use SVD to short generate videos from images.
+
+Before you begin, make sure you have the following libraries installed:
+
+```py
+!pip install -q -U diffusers transformers accelerate 
+```
+
+## Image to Video Generation
+
+The are two variants of SVD. [SVD](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid) 
+and [SVD-XT](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid-xt). The svd checkpoint is trained to generate 14 frames and the svd-xt checkpoint is further 
+finetuned to generate 25 frames.
+
+We will use the `svd-xt` checkpoint for this guide.
+
+```python
+import torch
+
+from diffusers import StableVideoDiffusionPipeline
+from diffusers.utils import load_image, export_to_video
+
+pipe = StableVideoDiffusionPipeline.from_pretrained(
+    "stabilityai/stable-video-diffusion-img2vid-xt", torch_dtype=torch.float16, variant="fp16"
+)
+pipe.enable_model_cpu_offload()
+
+# Load the conditioning image
+image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/svd/rocket.png")
+image = image.resize((1024, 576))
+
+generator = torch.manual_seed(42)
+frames = pipe(image, decode_chunk_size=8, generator=generator).frames[0]
+
+export_to_video(frames, "generated.mp4", fps=7)
+```
+
+<video controls width="1024" height="576">
+  <source src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/svd/rocket_generated.webm" type="video/webm" />
+  <source src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/svd/rocket_generated.mp4" type="video/mp4" />
+</video>
+
+| **Source Image** | **Video** |
+|:------------:|:-----:|
+|     ![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/svd/rocket.png)      |  ![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/svd/output_rocket.gif)  |
+
+
+<Tip>
+Since generating videos is more memory intensive we can use the `decode_chunk_size` argument to control how many frames are decoded at once. This will reduce the memory usage. It's recommended to tweak this value based on your GPU memory.
+Setting `decode_chunk_size=1` will decode one frame at a time and will use the least amount of memory but the video might have some flickering.
+
+Additionally, we also use [model cpu offloading](../../optimization/memory#model-offloading) to reduce the memory usage.
+</Tip>
+
+
+### Torch.compile
+
+You can achieve a 20-25% speed-up at the expense of slightly increased memory by compiling the UNet as follows:
+
+```diff
+- pipe.enable_model_cpu_offload()
+ pipe.to("cuda")
+ pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
+```
+
+### Low-memory
+
+Video generation is very memory intensive as we have to essentially generate `num_frames` all at once. The mechanism is very comparable to text-to-image generation with a high batch size. To reduce the memory requirement you have multiple options. The following options trade inference speed against lower memory requirement:
+- enable model offloading: Each component of the pipeline is offloaded to CPU once it's not needed anymore.
+- enable feed-forward chunking: The feed-forward layer runs in a loop instead of running with a single huge feed-forward batch size
+- reduce `decode_chunk_size`: This means that the VAE decodes frames in chunks instead of decoding them all together. **Note**: In addition to leading to a small slowdown, this method also slightly leads to video quality deterioration
+
+You can enable them as follows:
+
+```diff
+-pipe.enable_model_cpu_offload()
+-frames = pipe(image, decode_chunk_size=8, generator=generator).frames[0]
+pipe.enable_model_cpu_offload()
+pipe.unet.enable_forward_chunking()
+frames = pipe(image, decode_chunk_size=2, generator=generator, num_frames=25).frames[0]
+```
+
+
+Including all these tricks should lower the memory requirement to less than 8GB VRAM.
+
+### Micro-conditioning
+
+Along with conditioning image Stable Diffusion Video also allows providing micro-conditioning that allows more control over the generated video.
+It accepts the following arguments:
+
+- `fps`: The frames per second of the generated video.
+- `motion_bucket_id`: The motion bucket id to use for the generated video. This can be used to control the motion of the generated video. Increasing the motion bucket id will increase the motion of the generated video.
+- `noise_aug_strength`: The amount of noise added to the conditioning image. The higher the values the less the video will resemble the conditioning image. Increasing this value will also increase the motion of the generated video.
+
+Here is an example of using micro-conditioning to generate a video with more motion.
+
+
+```python
+import torch
+
+from diffusers import StableVideoDiffusionPipeline
+from diffusers.utils import load_image, export_to_video
+
+pipe = StableVideoDiffusionPipeline.from_pretrained(
+  "stabilityai/stable-video-diffusion-img2vid-xt", torch_dtype=torch.float16, variant="fp16"
+)
+pipe.enable_model_cpu_offload()
+
+# Load the conditioning image
+image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/svd/rocket.png")
+image = image.resize((1024, 576))
+
+generator = torch.manual_seed(42)
+frames = pipe(image, decode_chunk_size=8, generator=generator, motion_bucket_id=180, noise_aug_strength=0.1).frames[0]
+export_to_video(frames, "generated.mp4", fps=7)
+```
+
+![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/svd/output_rocket_with_conditions.gif)
+
--- a/docs/source/en/using-diffusers/unconditional_image_generation.md
+++ b/docs/source/en/using-diffusers/unconditional_image_generation.md
@@ -14,54 +14,41 @@ specific language governing permissions and limitations under the License.

 [[open-in-colab]]

-Unconditional image generation is a relatively straightforward task. The model only generates images - without any additional context like text or an image - resembling the training data it was trained on.
+Unconditional image generation generates images that look like a random sample from the training data the model was trained on because the denoising process is not guided by any additional context like text or image.

-The [`DiffusionPipeline`] is the easiest way to use a pre-trained diffusion system for inference.
+To get started, use the [`DiffusionPipeline`] to load the [anton-l/ddpm-butterflies-128](https://huggingface.co/anton-l/ddpm-butterflies-128) checkpoint to generate images of butterflies. The [`DiffusionPipeline`] downloads and caches all the model components required to generate an image.

-Start by creating an instance of [`DiffusionPipeline`] and specify which pipeline checkpoint you would like to download.
-You can use any of the 🧨 Diffusers [checkpoints](https://huggingface.co/models?library=diffusers&sort=downloads) from the Hub (the checkpoint you'll use generates images of butterflies).
-
-<Tip>
-
-💡 Want to train your own unconditional image generation model? Take a look at the training [guide](../training/unconditional_training) to learn how to generate your own images.
-
-</Tip>
-
-In this guide, you'll use [`DiffusionPipeline`] for unconditional image generation with [DDPM](https://arxiv.org/abs/2006.11239):
-
-```python
+```py
 from diffusers import DiffusionPipeline

-generator = DiffusionPipeline.from_pretrained("anton-l/ddpm-butterflies-128", use_safetensors=True)
-```
-
-The [`DiffusionPipeline`] downloads and caches all modeling, tokenization, and scheduling components.
-Because the model consists of roughly 1.4 billion parameters, we strongly recommend running it on a GPU.
-You can move the generator object to a GPU, just like you would in PyTorch:
-
-```python
-generator.to("cuda")
-```
-
-Now you can use the `generator` to generate an image:
-
-```python
+generator = DiffusionPipeline.from_pretrained("anton-l/ddpm-butterflies-128").to("cuda")
 image = generator().images[0]
 image
 ```

-The output is by default wrapped into a [`PIL.Image`](https://pillow.readthedocs.io/en/stable/reference/Image.html?highlight=image#the-image-class) object.
+<Tip>

-You can save the image by calling:
+Want to generate images of something else? Take a look at the training [guide](../training/unconditional_training) to learn how to train a model to generate your own images.

-```python
+</Tip>
+
+The output image is a [`PIL.Image`](https://pillow.readthedocs.io/en/stable/reference/Image.html?highlight=image#the-image-class) object that can be saved:
+
+```py
 image.save("generated_image.png")
 ```

-Try out the Spaces below, and feel free to play around with the inference steps parameter to see how it affects the image quality!
+You can also try experimenting with the `num_inference_steps` parameter, which controls the number of denoising steps. More denoising steps typically produce higher quality images, but it'll take longer to generate. Feel free to play around with this parameter to see how it affects the image quality.
+
+```py
+image = generator(num_inference_steps=100).images[0]
+image
+```
+
+Try out the Space below to generate an image of a butterfly!

 <iframe
-	src="https://stevhliu-ddpm-butterflies-128.hf.space"
+	src="https://stevhliu-unconditional-image-generation.hf.space"
 	frameborder="0"
 	width="850"
 	height="500"
--- a/docs/source/ja/index.md
+++ b/docs/source/ja/index.md
@@ -96,3 +96,4 @@ specific language governing permissions and limitations under the License.
 | [versatile_diffusion](./api/pipelines/versatile_diffusion) | [Versatile Diffusion: Text, Images and Variations All in One Diffusion Model](https://arxiv.org/abs/2211.08332) | Dual Image and Text Guided Generation |
 | [vq_diffusion](./api/pipelines/vq_diffusion) | [Vector Quantized Diffusion Model for Text-to-Image Synthesis](https://arxiv.org/abs/2111.14822) | Text-to-Image Generation |
 | [stable_diffusion_ldm3d](./api/pipelines/stable_diffusion/ldm3d_diffusion) | [LDM3D: Latent Diffusion Model for 3D](https://arxiv.org/abs/2305.10853) | Text to Image and Depth Generation |
+| [stable_diffusion_upscaler_ldm3d](./api/pipelines/stable_diffusion/ldm3d_diffusion) | [LDM3D-VR: Latent Diffusion Model for 3D VR](https://arxiv.org/pdf/2311.03226) | Image and Depth Upscaling |
--- a/examples/README.md
+++ b/examples/README.md
@@ -18,8 +18,7 @@ limitations under the License.
 Diffusers examples are a collection of scripts to demonstrate how to effectively use the `diffusers` library
 for a variety of use cases involving training or fine-tuning.

-**Note**: If you are looking for **official** examples on how to use `diffusers` for inference, 
-please have a look at [src/diffusers/pipelines](https://github.com/huggingface/diffusers/tree/main/src/diffusers/pipelines).
+**Note**: If you are looking for **official** examples on how to use `diffusers` for inference, please have a look at [src/diffusers/pipelines](https://github.com/huggingface/diffusers/tree/main/src/diffusers/pipelines).

 Our examples aspire to be **self-contained**, **easy-to-tweak**, **beginner-friendly** and for **one-purpose-only**.
 More specifically, this means:
@@ -27,11 +26,10 @@ More specifically, this means:
 - **Self-contained**: An example script shall only depend on "pip-install-able" Python packages that can be found in a `requirements.txt` file. Example scripts shall **not** depend on any local files. This means that one can simply download an example script, *e.g.* [train_unconditional.py](https://github.com/huggingface/diffusers/blob/main/examples/unconditional_image_generation/train_unconditional.py), install the required dependencies, *e.g.* [requirements.txt](https://github.com/huggingface/diffusers/blob/main/examples/unconditional_image_generation/requirements.txt) and execute the example script.
 - **Easy-to-tweak**: While we strive to present as many use cases as possible, the example scripts are just that - examples. It is expected that they won't work out-of-the box on your specific problem and that you will be required to change a few lines of code to adapt them to your needs. To help you with that, most of the examples fully expose the preprocessing of the data and the training loop to allow you to tweak and edit them as required.
 - **Beginner-friendly**: We do not aim for providing state-of-the-art training scripts for the newest models, but rather examples that can be used as a way to better understand diffusion models and how to use them with the `diffusers` library. We often purposefully leave out certain state-of-the-art methods if we consider them too complex for beginners.
- **One-purpose-only**: Examples should show one task and one task only. Even if a task is from a modeling 
-point of view very similar, *e.g.* image super-resolution and image modification tend to use the same model and training method, we want examples to showcase only one task to keep them as readable and easy-to-understand as possible.
+- **One-purpose-only**: Examples should show one task and one task only. Even if a task is from a modeling point of view very similar, *e.g.* image super-resolution and image modification tend to use the same model and training method, we want examples to showcase only one task to keep them as readable and easy-to-understand as possible.

 We provide **official** examples that cover the most popular tasks of diffusion models.
-*Official* examples are **actively** maintained by the `diffusers` maintainers and we try to rigorously follow our example philosophy as defined above. 
+*Official* examples are **actively** maintained by the `diffusers` maintainers and we try to rigorously follow our example philosophy as defined above.
 If you feel like another important example should exist, we are more than happy to welcome a [Feature Request](https://github.com/huggingface/diffusers/issues/new?assignees=&labels=&template=feature_request.md&title=) or directly a [Pull Request](https://github.com/huggingface/diffusers/compare) from you!

 Training examples show how to pretrain or fine-tune diffusion models for a variety of tasks. Currently we support:
@@ -39,7 +37,7 @@ Training examples show how to pretrain or fine-tune diffusion models for a varie
 | Task | 🤗 Accelerate | 🤗 Datasets | Colab
 |---|---|:---:|:---:|
 | [**Unconditional Image Generation**](./unconditional_image_generation) | ✅ | ✅ | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/training_example.ipynb)
-| [**Text-to-Image fine-tuning**](./text_to_image) | ✅ | ✅ | 
+| [**Text-to-Image fine-tuning**](./text_to_image) | ✅ | ✅ |
 | [**Textual Inversion**](./textual_inversion) | ✅ | - | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/sd_textual_inversion_training.ipynb)
 | [**Dreambooth**](./dreambooth) | ✅ | - | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/sd_dreambooth_training.ipynb)
 | [**ControlNet**](./controlnet) | ✅ | ✅ | -
--- a/examples/advanced_diffusion_training/train_dreambooth_lora_sdxl_advanced.py
+++ b/examples/advanced_diffusion_training/train_dreambooth_lora_sdxl_advanced.py
--- a/examples/amused/README.md
+++ b/examples/amused/README.md
@@ -0,0 +1,326 @@
+## Amused training
+
+Amused can be finetuned on simple datasets relatively cheaply and quickly. Using 8bit optimizers, lora, and gradient accumulation, amused can be finetuned with as little as 5.5 GB. Here are a set of examples for finetuning amused on some relatively simple datasets. These training recipies are aggressively oriented towards minimal resources and fast verification -- i.e. the batch sizes are quite low and the learning rates are quite high. For optimal quality, you will probably want to increase the batch sizes and decrease learning rates.
+
+All training examples use fp16 mixed precision and gradient checkpointing. We don't show 8 bit adam + lora as its about the same memory use as just using lora (bitsandbytes uses full precision optimizer states for weights below a minimum size).
+
+### Finetuning the 256 checkpoint
+
+These examples finetune on this [nouns](https://huggingface.co/datasets/m1guelpf/nouns) dataset.
+
+Example results:
+
+![noun1](https://huggingface.co/datasets/diffusers/docs-images/resolve/main/amused/noun1.png) ![noun2](https://huggingface.co/datasets/diffusers/docs-images/resolve/main/amused/noun2.png) ![noun3](https://huggingface.co/datasets/diffusers/docs-images/resolve/main/amused/noun3.png)
+
+
+#### Full finetuning
+
+Batch size: 8, Learning rate: 1e-4, Gives decent results in 750-1000 steps
+
+| Batch Size | Gradient Accumulation Steps | Effective Total Batch Size | Memory Used |
+|------------|-----------------------------|------------------|-------------|
+|    8        |          1                   |     8             |      19.7 GB       |
+|    4        |          2                   |     8             |      18.3 GB       |
+|    1        |          8                   |     8             |      17.9 GB       |
+
+```sh
+accelerate launch train_amused.py \
+    --output_dir <output path> \
+    --train_batch_size <batch size> \
+    --gradient_accumulation_steps <gradient accumulation steps> \
+    --learning_rate 1e-4 \
+    --pretrained_model_name_or_path amused/amused-256 \
+    --instance_data_dataset  'm1guelpf/nouns' \
+    --image_key image \
+    --prompt_key text \
+    --resolution 256 \
+    --mixed_precision fp16 \
+    --lr_scheduler constant \
+    --validation_prompts \
+        'a pixel art character with square red glasses, a baseball-shaped head and a orange-colored body on a dark background' \
+        'a pixel art character with square orange glasses, a lips-shaped head and a red-colored body on a light background' \
+        'a pixel art character with square blue glasses, a microwave-shaped head and a purple-colored body on a sunny background' \
+        'a pixel art character with square red glasses, a baseball-shaped head and a blue-colored body on an orange background' \
+        'a pixel art character with square red glasses' \
+        'a pixel art character' \
+        'square red glasses on a pixel art character' \
+        'square red glasses on a pixel art character with a baseball-shaped head' \
+    --max_train_steps 10000 \
+    --checkpointing_steps 500 \
+    --validation_steps 250 \
+    --gradient_checkpointing
+```
+
+#### Full finetuning + 8 bit adam
+
+Note that this training config keeps the batch size low and the learning rate high to get results fast with low resources. However, due to 8 bit adam, it will diverge eventually. If you want to train for longer, you will have to up the batch size and lower the learning rate.
+
+Batch size: 16, Learning rate: 2e-5, Gives decent results in ~750 steps
+
+| Batch Size | Gradient Accumulation Steps | Effective Total Batch Size | Memory Used |
+|------------|-----------------------------|------------------|-------------|
+|    16        |          1                   |     16             |      20.1 GB       |
+|    8        |          2                   |      16           |      15.6 GB       |
+|    1        |          16                   |     16            |      10.7 GB       |
+
+```sh
+accelerate launch train_amused.py \
+    --output_dir <output path> \
+    --train_batch_size <batch size> \
+    --gradient_accumulation_steps <gradient accumulation steps> \
+    --learning_rate 2e-5 \
+    --use_8bit_adam \
+    --pretrained_model_name_or_path amused/amused-256 \
+    --instance_data_dataset  'm1guelpf/nouns' \
+    --image_key image \
+    --prompt_key text \
+    --resolution 256 \
+    --mixed_precision fp16 \
+    --lr_scheduler constant \
+    --validation_prompts \
+        'a pixel art character with square red glasses, a baseball-shaped head and a orange-colored body on a dark background' \
+        'a pixel art character with square orange glasses, a lips-shaped head and a red-colored body on a light background' \
+        'a pixel art character with square blue glasses, a microwave-shaped head and a purple-colored body on a sunny background' \
+        'a pixel art character with square red glasses, a baseball-shaped head and a blue-colored body on an orange background' \
+        'a pixel art character with square red glasses' \
+        'a pixel art character' \
+        'square red glasses on a pixel art character' \
+        'square red glasses on a pixel art character with a baseball-shaped head' \
+    --max_train_steps 10000 \
+    --checkpointing_steps 500 \
+    --validation_steps 250 \
+    --gradient_checkpointing
+```
+
+#### Full finetuning + lora
+
+Batch size: 16, Learning rate: 8e-4, Gives decent results in 1000-1250 steps
+
+| Batch Size | Gradient Accumulation Steps | Effective Total Batch Size | Memory Used |
+|------------|-----------------------------|------------------|-------------|
+|    16        |          1                   |     16             |      14.1 GB       |
+|    8        |          2                   |      16           |      10.1 GB       |
+|    1        |          16                   |     16            |      6.5 GB       |
+
+```sh
+accelerate launch train_amused.py \
+    --output_dir <output path> \
+    --train_batch_size <batch size> \
+    --gradient_accumulation_steps <gradient accumulation steps> \
+    --learning_rate 8e-4 \
+    --use_lora \
+    --pretrained_model_name_or_path amused/amused-256 \
+    --instance_data_dataset  'm1guelpf/nouns' \
+    --image_key image \
+    --prompt_key text \
+    --resolution 256 \
+    --mixed_precision fp16 \
+    --lr_scheduler constant \
+    --validation_prompts \
+        'a pixel art character with square red glasses, a baseball-shaped head and a orange-colored body on a dark background' \
+        'a pixel art character with square orange glasses, a lips-shaped head and a red-colored body on a light background' \
+        'a pixel art character with square blue glasses, a microwave-shaped head and a purple-colored body on a sunny background' \
+        'a pixel art character with square red glasses, a baseball-shaped head and a blue-colored body on an orange background' \
+        'a pixel art character with square red glasses' \
+        'a pixel art character' \
+        'square red glasses on a pixel art character' \
+        'square red glasses on a pixel art character with a baseball-shaped head' \
+    --max_train_steps 10000 \
+    --checkpointing_steps 500 \
+    --validation_steps 250 \
+    --gradient_checkpointing
+```
+
+### Finetuning the 512 checkpoint
+
+These examples finetune on this [minecraft](https://huggingface.co/monadical-labs/minecraft-preview) dataset.
+
+Example results:
+
+![minecraft1](https://huggingface.co/datasets/diffusers/docs-images/resolve/main/amused/minecraft1.png) ![minecraft2](https://huggingface.co/datasets/diffusers/docs-images/resolve/main/amused/minecraft2.png) ![minecraft3](https://huggingface.co/datasets/diffusers/docs-images/resolve/main/amused/minecraft3.png)
+
+#### Full finetuning
+
+Batch size: 8, Learning rate: 8e-5, Gives decent results in 500-1000 steps
+
+| Batch Size | Gradient Accumulation Steps | Effective Total Batch Size | Memory Used |
+|------------|-----------------------------|------------------|-------------|
+|    8        |          1                   |     8             |      24.2 GB       |
+|    4        |          2                   |     8             |      19.7 GB       |
+|    1        |          8                   |     8             |      16.99 GB       |
+
+```sh
+accelerate launch train_amused.py \
+    --output_dir <output path> \
+    --train_batch_size <batch size> \
+    --gradient_accumulation_steps <gradient accumulation steps> \
+    --learning_rate 8e-5 \
+    --pretrained_model_name_or_path amused/amused-512 \
+    --instance_data_dataset  'monadical-labs/minecraft-preview' \
+    --prompt_prefix 'minecraft ' \
+    --image_key image \
+    --prompt_key text \
+    --resolution 512 \
+    --mixed_precision fp16 \
+    --lr_scheduler constant \
+    --validation_prompts \
+        'minecraft Avatar' \
+        'minecraft character' \
+        'minecraft' \
+        'minecraft president' \
+        'minecraft pig' \
+    --max_train_steps 10000 \
+    --checkpointing_steps 500 \
+    --validation_steps 250 \
+    --gradient_checkpointing
+```
+
+#### Full finetuning + 8 bit adam
+
+Batch size: 8, Learning rate: 5e-6, Gives decent results in 500-1000 steps
+
+| Batch Size | Gradient Accumulation Steps | Effective Total Batch Size | Memory Used |
+|------------|-----------------------------|------------------|-------------|
+|    8        |          1                   |     8             |      21.2 GB       |
+|    4        |          2                   |     8             |      13.3 GB       |
+|    1        |          8                   |     8             |      9.9 GB       |
+
+```sh
+accelerate launch train_amused.py \
+    --output_dir <output path> \
+    --train_batch_size <batch size> \
+    --gradient_accumulation_steps <gradient accumulation steps> \
+    --learning_rate 5e-6 \
+    --pretrained_model_name_or_path amused/amused-512 \
+    --instance_data_dataset  'monadical-labs/minecraft-preview' \
+    --prompt_prefix 'minecraft ' \
+    --image_key image \
+    --prompt_key text \
+    --resolution 512 \
+    --mixed_precision fp16 \
+    --lr_scheduler constant \
+    --validation_prompts \
+        'minecraft Avatar' \
+        'minecraft character' \
+        'minecraft' \
+        'minecraft president' \
+        'minecraft pig' \
+    --max_train_steps 10000 \
+    --checkpointing_steps 500 \
+    --validation_steps 250 \
+    --gradient_checkpointing
+```
+
+#### Full finetuning + lora 
+
+Batch size: 8, Learning rate: 1e-4, Gives decent results in 500-1000 steps
+
+| Batch Size | Gradient Accumulation Steps | Effective Total Batch Size | Memory Used |
+|------------|-----------------------------|------------------|-------------|
+|    8        |          1                   |     8             |      12.7 GB       |
+|    4        |          2                   |     8             |      9.0 GB       |
+|    1        |          8                   |     8             |      5.6 GB       |
+
+```sh
+accelerate launch train_amused.py \
+    --output_dir <output path> \
+    --train_batch_size <batch size> \
+    --gradient_accumulation_steps <gradient accumulation steps> \
+    --learning_rate 1e-4 \
+    --use_lora \
+    --pretrained_model_name_or_path amused/amused-512 \
+    --instance_data_dataset  'monadical-labs/minecraft-preview' \
+    --prompt_prefix 'minecraft ' \
+    --image_key image \
+    --prompt_key text \
+    --resolution 512 \
+    --mixed_precision fp16 \
+    --lr_scheduler constant \
+    --validation_prompts \
+        'minecraft Avatar' \
+        'minecraft character' \
+        'minecraft' \
+        'minecraft president' \
+        'minecraft pig' \
+    --max_train_steps 10000 \
+    --checkpointing_steps 500 \
+    --validation_steps 250 \
+    --gradient_checkpointing
+```
+
+### Styledrop
+
+[Styledrop](https://arxiv.org/abs/2306.00983) is an efficient finetuning method for learning a new style from just one or very few images. It has an optional first stage to generate human picked additional training samples. The additional training samples can be used to augment the initial images. Our examples exclude the optional additional image selection stage and instead we just finetune on a single image.
+
+This is our example style image:
+![example](https://huggingface.co/datasets/diffusers/docs-images/resolve/main/amused/A%20mushroom%20in%20%5BV%5D%20style.png)
+
+Download it to your local directory with
+```sh
+wget https://huggingface.co/datasets/diffusers/docs-images/resolve/main/amused/A%20mushroom%20in%20%5BV%5D%20style.png
+```
+
+#### 256
+
+Example results:
+
+![glowing_256_1](https://huggingface.co/datasets/diffusers/docs-images/resolve/main/amused/glowing_256_1.png) ![glowing_256_2](https://huggingface.co/datasets/diffusers/docs-images/resolve/main/amused/glowing_256_2.png) ![glowing_256_3](https://huggingface.co/datasets/diffusers/docs-images/resolve/main/amused/glowing_256_3.png)
+
+Learning rate: 4e-4, Gives decent results in 1500-2000 steps
+
+Memory used: 6.5 GB 
+
+```sh
+accelerate launch train_amused.py \
+    --output_dir <output path> \
+    --mixed_precision fp16 \
+    --report_to wandb \
+    --use_lora \
+    --pretrained_model_name_or_path amused/amused-256 \
+    --train_batch_size 1 \
+    --lr_scheduler constant \
+    --learning_rate 4e-4 \
+    --validation_prompts \
+        'A chihuahua walking on the street in [V] style' \
+        'A banana on the table in [V] style' \
+        'A church on the street in [V] style' \
+        'A tabby cat walking in the forest in [V] style' \
+    --instance_data_image 'A mushroom in [V] style.png' \
+    --max_train_steps 10000 \
+    --checkpointing_steps 500 \
+    --validation_steps 100 \
+    --resolution 256
+```
+
+#### 512
+
+Example results:
+
+![glowing_512_1](https://huggingface.co/datasets/diffusers/docs-images/resolve/main/amused/glowing_512_1.png) ![glowing_512_2](https://huggingface.co/datasets/diffusers/docs-images/resolve/main/amused/glowing_512_2.png) ![glowing_512_3](https://huggingface.co/datasets/diffusers/docs-images/resolve/main/amused/glowing_512_3.png)
+
+Learning rate: 1e-3, Lora alpha 1, Gives decent results in 1500-2000 steps
+
+Memory used: 5.6 GB 
+
+```
+accelerate launch train_amused.py \
+    --output_dir <output path> \
+    --mixed_precision fp16 \
+    --report_to wandb \
+    --use_lora \
+    --pretrained_model_name_or_path amused/amused-512 \
+    --train_batch_size 1 \
+    --lr_scheduler constant \
+    --learning_rate 1e-3 \
+    --validation_prompts \
+        'A chihuahua walking on the street in [V] style' \
+        'A banana on the table in [V] style' \
+        'A church on the street in [V] style' \
+        'A tabby cat walking in the forest in [V] style' \
+    --instance_data_image 'A mushroom in [V] style.png' \
+    --max_train_steps 100000 \
+    --checkpointing_steps 500 \
+    --validation_steps 100 \
+    --resolution 512 \
+    --lora_alpha 1
+```
--- a/examples/amused/train_amused.py
+++ b/examples/amused/train_amused.py
@@ -0,0 +1,972 @@
+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import copy
+import logging
+import math
+import os
+import shutil
+from contextlib import nullcontext
+from pathlib import Path
+
+import torch
+import torch.nn.functional as F
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.utils import ProjectConfiguration, set_seed
+from datasets import load_dataset
+from peft import LoraConfig
+from peft.utils import get_peft_model_state_dict
+from PIL import Image
+from PIL.ImageOps import exif_transpose
+from torch.utils.data import DataLoader, Dataset, default_collate
+from torchvision import transforms
+from transformers import (
+    CLIPTextModelWithProjection,
+    CLIPTokenizer,
+)
+
+import diffusers.optimization
+from diffusers import AmusedPipeline, AmusedScheduler, EMAModel, UVit2DModel, VQModel
+from diffusers.loaders import LoraLoaderMixin
+from diffusers.utils import is_wandb_available
+
+
+if is_wandb_available():
+    import wandb
+
+logger = get_logger(__name__, log_level="INFO")
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--pretrained_model_name_or_path",
+        type=str,
+        default=None,
+        required=True,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--revision",
+        type=str,
+        default=None,
+        required=False,
+        help="Revision of pretrained model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--variant",
+        type=str,
+        default=None,
+        help="Variant of the model files of the pretrained model identifier from huggingface.co/models, 'e.g.' fp16",
+    )
+    parser.add_argument(
+        "--instance_data_dataset",
+        type=str,
+        default=None,
+        required=False,
+        help="A Hugging Face dataset containing the training images",
+    )
+    parser.add_argument(
+        "--instance_data_dir",
+        type=str,
+        default=None,
+        required=False,
+        help="A folder containing the training data of instance images.",
+    )
+    parser.add_argument(
+        "--instance_data_image", type=str, default=None, required=False, help="A single training image"
+    )
+    parser.add_argument(
+        "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes."
+    )
+    parser.add_argument(
+        "--dataloader_num_workers",
+        type=int,
+        default=0,
+        help=(
+            "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process."
+        ),
+    )
+    parser.add_argument(
+        "--allow_tf32",
+        action="store_true",
+        help=(
+            "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see"
+            " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"
+        ),
+    )
+    parser.add_argument("--use_ema", action="store_true", help="Whether to use EMA model.")
+    parser.add_argument("--ema_decay", type=float, default=0.9999)
+    parser.add_argument("--ema_update_after_step", type=int, default=0)
+    parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
+    parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="muse_training",
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
+    parser.add_argument(
+        "--logging_dir",
+        type=str,
+        default="logs",
+        help=(
+            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
+            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
+        ),
+    )
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=None,
+        help="Total number of training steps to perform.  If provided, overrides num_train_epochs.",
+    )
+    parser.add_argument(
+        "--checkpointing_steps",
+        type=int,
+        default=500,
+        help=(
+            "Save a checkpoint of the training state every X updates. Checkpoints can be used for resuming training via `--resume_from_checkpoint`. "
+            "In the case that the checkpoint is better than the final trained model, the checkpoint can also be used for inference."
+            "Using a checkpoint for inference requires separate loading of the original pipeline and the individual checkpointed model components."
+            "See https://huggingface.co/docs/diffusers/main/en/training/dreambooth#performing-inference-using-a-saved-checkpoint for step by step"
+            "instructions."
+        ),
+    )
+    parser.add_argument(
+        "--logging_steps",
+        type=int,
+        default=50,
+    )
+    parser.add_argument(
+        "--checkpoints_total_limit",
+        type=int,
+        default=None,
+        help=(
+            "Max number of checkpoints to store. Passed as `total_limit` to the `Accelerator` `ProjectConfiguration`."
+            " See Accelerator::save_state https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.save_state"
+            " for more details"
+        ),
+    )
+    parser.add_argument(
+        "--resume_from_checkpoint",
+        type=str,
+        default=None,
+        help=(
+            "Whether training should be resumed from a previous checkpoint. Use a path saved by"
+            ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
+        ),
+    )
+    parser.add_argument(
+        "--train_batch_size", type=int, default=16, help="Batch size (per device) for the training dataloader."
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=0.0003,
+        help="Initial learning rate (after the potential warmup period) to use.",
+    )
+    parser.add_argument(
+        "--scale_lr",
+        action="store_true",
+        default=False,
+        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
+    )
+    parser.add_argument(
+        "--lr_scheduler",
+        type=str,
+        default="constant",
+        help=(
+            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
+            ' "constant", "constant_with_warmup"]'
+        ),
+    )
+    parser.add_argument(
+        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
+    )
+    parser.add_argument(
+        "--validation_steps",
+        type=int,
+        default=100,
+        help=(
+            "Run validation every X steps. Validation consists of running the prompt"
+            " `args.validation_prompt` multiple times: `args.num_validation_images`"
+            " and logging the images."
+        ),
+    )
+    parser.add_argument(
+        "--mixed_precision",
+        type=str,
+        default=None,
+        choices=["no", "fp16", "bf16"],
+        help=(
+            "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
+            " 1.10.and an Nvidia Ampere GPU.  Default to the value of accelerate config of the current system or the"
+            " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
+        ),
+    )
+    parser.add_argument(
+        "--report_to",
+        type=str,
+        default="wandb",
+        help=(
+            'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
+            ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
+        ),
+    )
+    parser.add_argument("--validation_prompts", type=str, nargs="*")
+    parser.add_argument(
+        "--resolution",
+        type=int,
+        default=512,
+        help=(
+            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
+            " resolution"
+        ),
+    )
+    parser.add_argument("--split_vae_encode", type=int, required=False, default=None)
+    parser.add_argument("--min_masking_rate", type=float, default=0.0)
+    parser.add_argument("--cond_dropout_prob", type=float, default=0.0)
+    parser.add_argument("--max_grad_norm", default=None, type=float, help="Max gradient norm.", required=False)
+    parser.add_argument("--use_lora", action="store_true", help="Fine tune the model using LoRa")
+    parser.add_argument("--text_encoder_use_lora", action="store_true", help="Fine tune the model using LoRa")
+    parser.add_argument("--lora_r", default=16, type=int)
+    parser.add_argument("--lora_alpha", default=32, type=int)
+    parser.add_argument("--lora_target_modules", default=["to_q", "to_k", "to_v"], type=str, nargs="+")
+    parser.add_argument("--text_encoder_lora_r", default=16, type=int)
+    parser.add_argument("--text_encoder_lora_alpha", default=32, type=int)
+    parser.add_argument("--text_encoder_lora_target_modules", default=["to_q", "to_k", "to_v"], type=str, nargs="+")
+    parser.add_argument("--train_text_encoder", action="store_true")
+    parser.add_argument("--image_key", type=str, required=False)
+    parser.add_argument("--prompt_key", type=str, required=False)
+    parser.add_argument(
+        "--gradient_checkpointing",
+        action="store_true",
+        help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
+    )
+    parser.add_argument("--prompt_prefix", type=str, required=False, default=None)
+
+    args = parser.parse_args()
+
+    if args.report_to == "wandb":
+        if not is_wandb_available():
+            raise ImportError("Make sure to install wandb if you want to use it for logging during training.")
+
+    num_datasources = sum(
+        [x is not None for x in [args.instance_data_dir, args.instance_data_image, args.instance_data_dataset]]
+    )
+
+    if num_datasources != 1:
+        raise ValueError(
+            "provide one and only one of `--instance_data_dir`, `--instance_data_image`, or `--instance_data_dataset`"
+        )
+
+    if args.instance_data_dir is not None:
+        if not os.path.exists(args.instance_data_dir):
+            raise ValueError(f"Does not exist: `--args.instance_data_dir` {args.instance_data_dir}")
+
+    if args.instance_data_image is not None:
+        if not os.path.exists(args.instance_data_image):
+            raise ValueError(f"Does not exist: `--args.instance_data_image` {args.instance_data_image}")
+
+    if args.instance_data_dataset is not None and (args.image_key is None or args.prompt_key is None):
+        raise ValueError("`--instance_data_dataset` requires setting `--image_key` and `--prompt_key`")
+
+    return args
+
+
+class InstanceDataRootDataset(Dataset):
+    def __init__(
+        self,
+        instance_data_root,
+        tokenizer,
+        size=512,
+    ):
+        self.size = size
+        self.tokenizer = tokenizer
+        self.instance_images_path = list(Path(instance_data_root).iterdir())
+
+    def __len__(self):
+        return len(self.instance_images_path)
+
+    def __getitem__(self, index):
+        image_path = self.instance_images_path[index % len(self.instance_images_path)]
+        instance_image = Image.open(image_path)
+        rv = process_image(instance_image, self.size)
+
+        prompt = os.path.splitext(os.path.basename(image_path))[0]
+        rv["prompt_input_ids"] = tokenize_prompt(self.tokenizer, prompt)[0]
+        return rv
+
+
+class InstanceDataImageDataset(Dataset):
+    def __init__(
+        self,
+        instance_data_image,
+        train_batch_size,
+        size=512,
+    ):
+        self.value = process_image(Image.open(instance_data_image), size)
+        self.train_batch_size = train_batch_size
+
+    def __len__(self):
+        # Needed so a full batch of the data can be returned. Otherwise will return
+        # batches of size 1
+        return self.train_batch_size
+
+    def __getitem__(self, index):
+        return self.value
+
+
+class HuggingFaceDataset(Dataset):
+    def __init__(
+        self,
+        hf_dataset,
+        tokenizer,
+        image_key,
+        prompt_key,
+        prompt_prefix=None,
+        size=512,
+    ):
+        self.size = size
+        self.image_key = image_key
+        self.prompt_key = prompt_key
+        self.tokenizer = tokenizer
+        self.hf_dataset = hf_dataset
+        self.prompt_prefix = prompt_prefix
+
+    def __len__(self):
+        return len(self.hf_dataset)
+
+    def __getitem__(self, index):
+        item = self.hf_dataset[index]
+
+        rv = process_image(item[self.image_key], self.size)
+
+        prompt = item[self.prompt_key]
+
+        if self.prompt_prefix is not None:
+            prompt = self.prompt_prefix + prompt
+
+        rv["prompt_input_ids"] = tokenize_prompt(self.tokenizer, prompt)[0]
+
+        return rv
+
+
+def process_image(image, size):
+    image = exif_transpose(image)
+
+    if not image.mode == "RGB":
+        image = image.convert("RGB")
+
+    orig_height = image.height
+    orig_width = image.width
+
+    image = transforms.Resize(size, interpolation=transforms.InterpolationMode.BILINEAR)(image)
+
+    c_top, c_left, _, _ = transforms.RandomCrop.get_params(image, output_size=(size, size))
+    image = transforms.functional.crop(image, c_top, c_left, size, size)
+
+    image = transforms.ToTensor()(image)
+
+    micro_conds = torch.tensor(
+        [orig_width, orig_height, c_top, c_left, 6.0],
+    )
+
+    return {"image": image, "micro_conds": micro_conds}
+
+
+def tokenize_prompt(tokenizer, prompt):
+    return tokenizer(
+        prompt,
+        truncation=True,
+        padding="max_length",
+        max_length=77,
+        return_tensors="pt",
+    ).input_ids
+
+
+def encode_prompt(text_encoder, input_ids):
+    outputs = text_encoder(input_ids, return_dict=True, output_hidden_states=True)
+    encoder_hidden_states = outputs.hidden_states[-2]
+    cond_embeds = outputs[0]
+    return encoder_hidden_states, cond_embeds
+
+
+def main(args):
+    if args.allow_tf32:
+        torch.backends.cuda.matmul.allow_tf32 = True
+
+    logging_dir = Path(args.output_dir, args.logging_dir)
+
+    accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
+
+    accelerator = Accelerator(
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
+        mixed_precision=args.mixed_precision,
+        log_with=args.report_to,
+        project_config=accelerator_project_config,
+    )
+
+    if accelerator.is_main_process:
+        os.makedirs(args.output_dir, exist_ok=True)
+
+    # Make one log on every process with the configuration for debugging.
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    logger.info(accelerator.state, main_process_only=False)
+
+    if accelerator.is_main_process:
+        accelerator.init_trackers("amused", config=vars(copy.deepcopy(args)))
+
+    if args.seed is not None:
+        set_seed(args.seed)
+
+    # TODO - will have to fix loading if training text encoder
+    text_encoder = CLIPTextModelWithProjection.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision, variant=args.variant
+    )
+    tokenizer = CLIPTokenizer.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="tokenizer", revision=args.revision, variant=args.variant
+    )
+    vq_model = VQModel.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="vqvae", revision=args.revision, variant=args.variant
+    )
+
+    if args.train_text_encoder:
+        if args.text_encoder_use_lora:
+            lora_config = LoraConfig(
+                r=args.text_encoder_lora_r,
+                lora_alpha=args.text_encoder_lora_alpha,
+                target_modules=args.text_encoder_lora_target_modules,
+            )
+            text_encoder.add_adapter(lora_config)
+        text_encoder.train()
+        text_encoder.requires_grad_(True)
+    else:
+        text_encoder.eval()
+        text_encoder.requires_grad_(False)
+
+    vq_model.requires_grad_(False)
+
+    model = UVit2DModel.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="transformer",
+        revision=args.revision,
+        variant=args.variant,
+    )
+
+    if args.use_lora:
+        lora_config = LoraConfig(
+            r=args.lora_r,
+            lora_alpha=args.lora_alpha,
+            target_modules=args.lora_target_modules,
+        )
+        model.add_adapter(lora_config)
+
+    model.train()
+
+    if args.gradient_checkpointing:
+        model.enable_gradient_checkpointing()
+        if args.train_text_encoder:
+            text_encoder.gradient_checkpointing_enable()
+
+    if args.use_ema:
+        ema = EMAModel(
+            model.parameters(),
+            decay=args.ema_decay,
+            update_after_step=args.ema_update_after_step,
+            model_cls=UVit2DModel,
+            model_config=model.config,
+        )
+
+    def save_model_hook(models, weights, output_dir):
+        if accelerator.is_main_process:
+            transformer_lora_layers_to_save = None
+            text_encoder_lora_layers_to_save = None
+
+            for model_ in models:
+                if isinstance(model_, type(accelerator.unwrap_model(model))):
+                    if args.use_lora:
+                        transformer_lora_layers_to_save = get_peft_model_state_dict(model_)
+                    else:
+                        model_.save_pretrained(os.path.join(output_dir, "transformer"))
+                elif isinstance(model_, type(accelerator.unwrap_model(text_encoder))):
+                    if args.text_encoder_use_lora:
+                        text_encoder_lora_layers_to_save = get_peft_model_state_dict(model_)
+                    else:
+                        model_.save_pretrained(os.path.join(output_dir, "text_encoder"))
+                else:
+                    raise ValueError(f"unexpected save model: {model_.__class__}")
+
+                # make sure to pop weight so that corresponding model is not saved again
+                weights.pop()
+
+            if transformer_lora_layers_to_save is not None or text_encoder_lora_layers_to_save is not None:
+                LoraLoaderMixin.save_lora_weights(
+                    output_dir,
+                    transformer_lora_layers=transformer_lora_layers_to_save,
+                    text_encoder_lora_layers=text_encoder_lora_layers_to_save,
+                )
+
+            if args.use_ema:
+                ema.save_pretrained(os.path.join(output_dir, "ema_model"))
+
+    def load_model_hook(models, input_dir):
+        transformer = None
+        text_encoder_ = None
+
+        while len(models) > 0:
+            model_ = models.pop()
+
+            if isinstance(model_, type(accelerator.unwrap_model(model))):
+                if args.use_lora:
+                    transformer = model_
+                else:
+                    load_model = UVit2DModel.from_pretrained(os.path.join(input_dir, "transformer"))
+                    model_.load_state_dict(load_model.state_dict())
+                    del load_model
+            elif isinstance(model, type(accelerator.unwrap_model(text_encoder))):
+                if args.text_encoder_use_lora:
+                    text_encoder_ = model_
+                else:
+                    load_model = CLIPTextModelWithProjection.from_pretrained(os.path.join(input_dir, "text_encoder"))
+                    model_.load_state_dict(load_model.state_dict())
+                    del load_model
+            else:
+                raise ValueError(f"unexpected save model: {model.__class__}")
+
+        if transformer is not None or text_encoder_ is not None:
+            lora_state_dict, network_alphas = LoraLoaderMixin.lora_state_dict(input_dir)
+            LoraLoaderMixin.load_lora_into_text_encoder(
+                lora_state_dict, network_alphas=network_alphas, text_encoder=text_encoder_
+            )
+            LoraLoaderMixin.load_lora_into_transformer(
+                lora_state_dict, network_alphas=network_alphas, transformer=transformer
+            )
+
+        if args.use_ema:
+            load_from = EMAModel.from_pretrained(os.path.join(input_dir, "ema_model"), model_cls=UVit2DModel)
+            ema.load_state_dict(load_from.state_dict())
+            del load_from
+
+    accelerator.register_load_state_pre_hook(load_model_hook)
+    accelerator.register_save_state_pre_hook(save_model_hook)
+
+    if args.scale_lr:
+        args.learning_rate = (
+            args.learning_rate * args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+        )
+
+    if args.use_8bit_adam:
+        try:
+            import bitsandbytes as bnb
+        except ImportError:
+            raise ImportError(
+                "Please install bitsandbytes to use 8-bit Adam. You can do so by running `pip install bitsandbytes`"
+            )
+
+        optimizer_cls = bnb.optim.AdamW8bit
+    else:
+        optimizer_cls = torch.optim.AdamW
+
+    # no decay on bias and layernorm and embedding
+    no_decay = ["bias", "layer_norm.weight", "mlm_ln.weight", "embeddings.weight"]
+    optimizer_grouped_parameters = [
+        {
+            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
+            "weight_decay": args.adam_weight_decay,
+        },
+        {
+            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
+            "weight_decay": 0.0,
+        },
+    ]
+
+    if args.train_text_encoder:
+        optimizer_grouped_parameters.append(
+            {"params": text_encoder.parameters(), "weight_decay": args.adam_weight_decay}
+        )
+
+    optimizer = optimizer_cls(
+        optimizer_grouped_parameters,
+        lr=args.learning_rate,
+        betas=(args.adam_beta1, args.adam_beta2),
+        weight_decay=args.adam_weight_decay,
+        eps=args.adam_epsilon,
+    )
+
+    logger.info("Creating dataloaders and lr_scheduler")
+
+    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+
+    if args.instance_data_dir is not None:
+        dataset = InstanceDataRootDataset(
+            instance_data_root=args.instance_data_dir,
+            tokenizer=tokenizer,
+            size=args.resolution,
+        )
+    elif args.instance_data_image is not None:
+        dataset = InstanceDataImageDataset(
+            instance_data_image=args.instance_data_image,
+            train_batch_size=args.train_batch_size,
+            size=args.resolution,
+        )
+    elif args.instance_data_dataset is not None:
+        dataset = HuggingFaceDataset(
+            hf_dataset=load_dataset(args.instance_data_dataset, split="train"),
+            tokenizer=tokenizer,
+            image_key=args.image_key,
+            prompt_key=args.prompt_key,
+            prompt_prefix=args.prompt_prefix,
+            size=args.resolution,
+        )
+    else:
+        assert False
+
+    train_dataloader = DataLoader(
+        dataset,
+        batch_size=args.train_batch_size,
+        shuffle=True,
+        num_workers=args.dataloader_num_workers,
+        collate_fn=default_collate,
+    )
+    train_dataloader.num_batches = len(train_dataloader)
+
+    lr_scheduler = diffusers.optimization.get_scheduler(
+        args.lr_scheduler,
+        optimizer=optimizer,
+        num_training_steps=args.max_train_steps * accelerator.num_processes,
+        num_warmup_steps=args.lr_warmup_steps * accelerator.num_processes,
+    )
+
+    logger.info("Preparing model, optimizer and dataloaders")
+
+    if args.train_text_encoder:
+        model, optimizer, lr_scheduler, train_dataloader, text_encoder = accelerator.prepare(
+            model, optimizer, lr_scheduler, train_dataloader, text_encoder
+        )
+    else:
+        model, optimizer, lr_scheduler, train_dataloader = accelerator.prepare(
+            model, optimizer, lr_scheduler, train_dataloader
+        )
+
+    train_dataloader.num_batches = len(train_dataloader)
+
+    weight_dtype = torch.float32
+    if accelerator.mixed_precision == "fp16":
+        weight_dtype = torch.float16
+    elif accelerator.mixed_precision == "bf16":
+        weight_dtype = torch.bfloat16
+
+    if not args.train_text_encoder:
+        text_encoder.to(device=accelerator.device, dtype=weight_dtype)
+
+    vq_model.to(device=accelerator.device)
+
+    if args.use_ema:
+        ema.to(accelerator.device)
+
+    with nullcontext() if args.train_text_encoder else torch.no_grad():
+        empty_embeds, empty_clip_embeds = encode_prompt(
+            text_encoder, tokenize_prompt(tokenizer, "").to(text_encoder.device, non_blocking=True)
+        )
+
+        # There is a single image, we can just pre-encode the single prompt
+        if args.instance_data_image is not None:
+            prompt = os.path.splitext(os.path.basename(args.instance_data_image))[0]
+            encoder_hidden_states, cond_embeds = encode_prompt(
+                text_encoder, tokenize_prompt(tokenizer, prompt).to(text_encoder.device, non_blocking=True)
+            )
+            encoder_hidden_states = encoder_hidden_states.repeat(args.train_batch_size, 1, 1)
+            cond_embeds = cond_embeds.repeat(args.train_batch_size, 1)
+
+    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
+    num_update_steps_per_epoch = math.ceil(train_dataloader.num_batches / args.gradient_accumulation_steps)
+    # Afterwards we recalculate our number of training epochs.
+    # Note: We are not doing epoch based training here, but just using this for book keeping and being able to
+    # reuse the same training loop with other datasets/loaders.
+    num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+    # Train!
+    logger.info("***** Running training *****")
+    logger.info(f"  Num training steps = {args.max_train_steps}")
+    logger.info(f"  Instantaneous batch size per device = { args.train_batch_size}")
+    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+
+    resume_from_checkpoint = args.resume_from_checkpoint
+    if resume_from_checkpoint:
+        if resume_from_checkpoint == "latest":
+            # Get the most recent checkpoint
+            dirs = os.listdir(args.output_dir)
+            dirs = [d for d in dirs if d.startswith("checkpoint")]
+            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
+            if len(dirs) > 0:
+                resume_from_checkpoint = os.path.join(args.output_dir, dirs[-1])
+            else:
+                resume_from_checkpoint = None
+
+        if resume_from_checkpoint is None:
+            accelerator.print(
+                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
+            )
+        else:
+            accelerator.print(f"Resuming from checkpoint {resume_from_checkpoint}")
+
+    if resume_from_checkpoint is None:
+        global_step = 0
+        first_epoch = 0
+    else:
+        accelerator.load_state(resume_from_checkpoint)
+        global_step = int(os.path.basename(resume_from_checkpoint).split("-")[1])
+        first_epoch = global_step // num_update_steps_per_epoch
+
+    # As stated above, we are not doing epoch based training here, but just using this for book keeping and being able to
+    # reuse the same training loop with other datasets/loaders.
+    for epoch in range(first_epoch, num_train_epochs):
+        for batch in train_dataloader:
+            with torch.no_grad():
+                micro_conds = batch["micro_conds"].to(accelerator.device, non_blocking=True)
+                pixel_values = batch["image"].to(accelerator.device, non_blocking=True)
+
+                batch_size = pixel_values.shape[0]
+
+                split_batch_size = args.split_vae_encode if args.split_vae_encode is not None else batch_size
+                num_splits = math.ceil(batch_size / split_batch_size)
+                image_tokens = []
+                for i in range(num_splits):
+                    start_idx = i * split_batch_size
+                    end_idx = min((i + 1) * split_batch_size, batch_size)
+                    bs = pixel_values.shape[0]
+                    image_tokens.append(
+                        vq_model.quantize(vq_model.encode(pixel_values[start_idx:end_idx]).latents)[2][2].reshape(
+                            bs, -1
+                        )
+                    )
+                image_tokens = torch.cat(image_tokens, dim=0)
+
+                batch_size, seq_len = image_tokens.shape
+
+                timesteps = torch.rand(batch_size, device=image_tokens.device)
+                mask_prob = torch.cos(timesteps * math.pi * 0.5)
+                mask_prob = mask_prob.clip(args.min_masking_rate)
+
+                num_token_masked = (seq_len * mask_prob).round().clamp(min=1)
+                batch_randperm = torch.rand(batch_size, seq_len, device=image_tokens.device).argsort(dim=-1)
+                mask = batch_randperm < num_token_masked.unsqueeze(-1)
+
+                mask_id = accelerator.unwrap_model(model).config.vocab_size - 1
+                input_ids = torch.where(mask, mask_id, image_tokens)
+                labels = torch.where(mask, image_tokens, -100)
+
+                if args.cond_dropout_prob > 0.0:
+                    assert encoder_hidden_states is not None
+
+                    batch_size = encoder_hidden_states.shape[0]
+
+                    mask = (
+                        torch.zeros((batch_size, 1, 1), device=encoder_hidden_states.device).float().uniform_(0, 1)
+                        < args.cond_dropout_prob
+                    )
+
+                    empty_embeds_ = empty_embeds.expand(batch_size, -1, -1)
+                    encoder_hidden_states = torch.where(
+                        (encoder_hidden_states * mask).bool(), encoder_hidden_states, empty_embeds_
+                    )
+
+                    empty_clip_embeds_ = empty_clip_embeds.expand(batch_size, -1)
+                    cond_embeds = torch.where((cond_embeds * mask.squeeze(-1)).bool(), cond_embeds, empty_clip_embeds_)
+
+                bs = input_ids.shape[0]
+                vae_scale_factor = 2 ** (len(vq_model.config.block_out_channels) - 1)
+                resolution = args.resolution // vae_scale_factor
+                input_ids = input_ids.reshape(bs, resolution, resolution)
+
+            if "prompt_input_ids" in batch:
+                with nullcontext() if args.train_text_encoder else torch.no_grad():
+                    encoder_hidden_states, cond_embeds = encode_prompt(
+                        text_encoder, batch["prompt_input_ids"].to(accelerator.device, non_blocking=True)
+                    )
+
+            # Train Step
+            with accelerator.accumulate(model):
+                codebook_size = accelerator.unwrap_model(model).config.codebook_size
+
+                logits = (
+                    model(
+                        input_ids=input_ids,
+                        encoder_hidden_states=encoder_hidden_states,
+                        micro_conds=micro_conds,
+                        pooled_text_emb=cond_embeds,
+                    )
+                    .reshape(bs, codebook_size, -1)
+                    .permute(0, 2, 1)
+                    .reshape(-1, codebook_size)
+                )
+
+                loss = F.cross_entropy(
+                    logits,
+                    labels.view(-1),
+                    ignore_index=-100,
+                    reduction="mean",
+                )
+
+                # Gather the losses across all processes for logging (if we use distributed training).
+                avg_loss = accelerator.gather(loss.repeat(args.train_batch_size)).mean()
+                avg_masking_rate = accelerator.gather(mask_prob.repeat(args.train_batch_size)).mean()
+
+                accelerator.backward(loss)
+
+                if args.max_grad_norm is not None and accelerator.sync_gradients:
+                    accelerator.clip_grad_norm_(model.parameters(), args.max_grad_norm)
+
+                optimizer.step()
+                lr_scheduler.step()
+
+                optimizer.zero_grad(set_to_none=True)
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
+                if args.use_ema:
+                    ema.step(model.parameters())
+
+                if (global_step + 1) % args.logging_steps == 0:
+                    logs = {
+                        "step_loss": avg_loss.item(),
+                        "lr": lr_scheduler.get_last_lr()[0],
+                        "avg_masking_rate": avg_masking_rate.item(),
+                    }
+                    accelerator.log(logs, step=global_step + 1)
+
+                    logger.info(
+                        f"Step: {global_step + 1} "
+                        f"Loss: {avg_loss.item():0.4f} "
+                        f"LR: {lr_scheduler.get_last_lr()[0]:0.6f}"
+                    )
+
+                if (global_step + 1) % args.checkpointing_steps == 0:
+                    save_checkpoint(args, accelerator, global_step + 1)
+
+                if (global_step + 1) % args.validation_steps == 0 and accelerator.is_main_process:
+                    if args.use_ema:
+                        ema.store(model.parameters())
+                        ema.copy_to(model.parameters())
+
+                    with torch.no_grad():
+                        logger.info("Generating images...")
+
+                        model.eval()
+
+                        if args.train_text_encoder:
+                            text_encoder.eval()
+
+                        scheduler = AmusedScheduler.from_pretrained(
+                            args.pretrained_model_name_or_path,
+                            subfolder="scheduler",
+                            revision=args.revision,
+                            variant=args.variant,
+                        )
+
+                        pipe = AmusedPipeline(
+                            transformer=accelerator.unwrap_model(model),
+                            tokenizer=tokenizer,
+                            text_encoder=text_encoder,
+                            vqvae=vq_model,
+                            scheduler=scheduler,
+                        )
+
+                        pil_images = pipe(prompt=args.validation_prompts).images
+                        wandb_images = [
+                            wandb.Image(image, caption=args.validation_prompts[i])
+                            for i, image in enumerate(pil_images)
+                        ]
+
+                        wandb.log({"generated_images": wandb_images}, step=global_step + 1)
+
+                        model.train()
+
+                        if args.train_text_encoder:
+                            text_encoder.train()
+
+                    if args.use_ema:
+                        ema.restore(model.parameters())
+
+                global_step += 1
+
+            # Stop training if max steps is reached
+            if global_step >= args.max_train_steps:
+                break
+        # End for
+
+    accelerator.wait_for_everyone()
+
+    # Evaluate and save checkpoint at the end of training
+    save_checkpoint(args, accelerator, global_step)
+
+    # Save the final trained checkpoint
+    if accelerator.is_main_process:
+        model = accelerator.unwrap_model(model)
+        if args.use_ema:
+            ema.copy_to(model.parameters())
+        model.save_pretrained(args.output_dir)
+
+    accelerator.end_training()
+
+
+def save_checkpoint(args, accelerator, global_step):
+    output_dir = args.output_dir
+
+    # _before_ saving state, check if this save would set us over the `checkpoints_total_limit`
+    if accelerator.is_main_process and args.checkpoints_total_limit is not None:
+        checkpoints = os.listdir(output_dir)
+        checkpoints = [d for d in checkpoints if d.startswith("checkpoint")]
+        checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1]))
+
+        # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints
+        if len(checkpoints) >= args.checkpoints_total_limit:
+            num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1
+            removing_checkpoints = checkpoints[0:num_to_remove]
+
+            logger.info(
+                f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints"
+            )
+            logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}")
+
+            for removing_checkpoint in removing_checkpoints:
+                removing_checkpoint = os.path.join(output_dir, removing_checkpoint)
+                shutil.rmtree(removing_checkpoint)
+
+    save_path = Path(output_dir) / f"checkpoint-{global_step}"
+    accelerator.save_state(save_path)
+    logger.info(f"Saved state to {save_path}")
+
+
+if __name__ == "__main__":
+    main(parse_args())
--- a/examples/community/README.md
+++ b/examples/community/README.md
--- a/examples/community/checkpoint_merger.py
+++ b/examples/community/checkpoint_merger.py
@@ -5,10 +5,11 @@ from typing import Dict, List, Union
 import safetensors.torch
 import torch
 from huggingface_hub import snapshot_download
+from huggingface_hub.utils import validate_hf_hub_args

 from diffusers import DiffusionPipeline, __version__
 from diffusers.schedulers.scheduling_utils import SCHEDULER_CONFIG_NAME
-from diffusers.utils import CONFIG_NAME, DIFFUSERS_CACHE, ONNX_WEIGHTS_NAME, WEIGHTS_NAME
+from diffusers.utils import CONFIG_NAME, ONNX_WEIGHTS_NAME, WEIGHTS_NAME


 class CheckpointMergerPipeline(DiffusionPipeline):
@@ -57,6 +58,7 @@ class CheckpointMergerPipeline(DiffusionPipeline):
        return (temp_dict, meta_keys)

    @torch.no_grad()
+    @validate_hf_hub_args
    def merge(self, pretrained_model_name_or_path_list: List[Union[str, os.PathLike]], **kwargs):
        """
        Returns a new pipeline object of the class 'DiffusionPipeline' with the merged checkpoints(weights) of the models passed
@@ -69,7 +71,7 @@ class CheckpointMergerPipeline(DiffusionPipeline):
            **kwargs:
                Supports all the default DiffusionPipeline.get_config_dict kwargs viz..

-                cache_dir, resume_download, force_download, proxies, local_files_only, use_auth_token, revision, torch_dtype, device_map.
+                cache_dir, resume_download, force_download, proxies, local_files_only, token, revision, torch_dtype, device_map.

                alpha - The interpolation parameter. Ranges from 0 to 1.  It affects the ratio in which the checkpoints are merged. A 0.8 alpha
                    would mean that the first model checkpoints would affect the final result far less than an alpha of 0.2
@@ -81,12 +83,12 @@ class CheckpointMergerPipeline(DiffusionPipeline):

        """
        # Default kwargs from DiffusionPipeline
-        cache_dir = kwargs.pop("cache_dir", DIFFUSERS_CACHE)
+        cache_dir = kwargs.pop("cache_dir", None)
        resume_download = kwargs.pop("resume_download", False)
        force_download = kwargs.pop("force_download", False)
        proxies = kwargs.pop("proxies", None)
        local_files_only = kwargs.pop("local_files_only", False)
-        use_auth_token = kwargs.pop("use_auth_token", None)
+        token = kwargs.pop("token", None)
        revision = kwargs.pop("revision", None)
        torch_dtype = kwargs.pop("torch_dtype", None)
        device_map = kwargs.pop("device_map", None)
@@ -123,7 +125,7 @@ class CheckpointMergerPipeline(DiffusionPipeline):
                force_download=force_download,
                proxies=proxies,
                local_files_only=local_files_only,
-                use_auth_token=use_auth_token,
+                token=token,
                revision=revision,
            )
            config_dicts.append(config_dict)
@@ -159,7 +161,7 @@ class CheckpointMergerPipeline(DiffusionPipeline):
                    resume_download=resume_download,
                    proxies=proxies,
                    local_files_only=local_files_only,
-                    use_auth_token=use_auth_token,
+                    token=token,
                    revision=revision,
                    allow_patterns=allow_patterns,
                    user_agent=user_agent,
--- a/examples/community/dps_pipeline.py
+++ b/examples/community/dps_pipeline.py
@@ -0,0 +1,466 @@
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from math import pi
+from typing import Callable, List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+from PIL import Image
+
+from diffusers import DDPMScheduler, DiffusionPipeline, ImagePipelineOutput, UNet2DModel
+from diffusers.utils.torch_utils import randn_tensor
+
+
+class DPSPipeline(DiffusionPipeline):
+    r"""
+    Pipeline for Diffusion Posterior Sampling.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    Parameters:
+        unet ([`UNet2DModel`]):
+            A `UNet2DModel` to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image. Can be one of
+            [`DDPMScheduler`], or [`DDIMScheduler`].
+    """
+
+    model_cpu_offload_seq = "unet"
+
+    def __init__(self, unet, scheduler):
+        super().__init__()
+        self.register_modules(unet=unet, scheduler=scheduler)
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        measurement: torch.Tensor,
+        operator: torch.nn.Module,
+        loss_fn: Callable[[torch.Tensor, torch.Tensor], torch.Tensor],
+        batch_size: int = 1,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        num_inference_steps: int = 1000,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        zeta: float = 0.3,
+    ) -> Union[ImagePipelineOutput, Tuple]:
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            measurement (`torch.Tensor`, *required*):
+                A 'torch.Tensor', the corrupted image
+            operator (`torch.nn.Module`, *required*):
+                A 'torch.nn.Module', the operator generating the corrupted image
+            loss_fn (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`, *required*):
+                A 'Callable[[torch.Tensor, torch.Tensor], torch.Tensor]', the loss function used
+                between the measurements, for most of the cases using RMSE is fine.
+            batch_size (`int`, *optional*, defaults to 1):
+                The number of images to generate.
+            generator (`torch.Generator`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            num_inference_steps (`int`, *optional*, defaults to 1000):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
+
+        Example:
+
+        ```py
+        >>> from diffusers import DDPMPipeline
+
+        >>> # load model and scheduler
+        >>> pipe = DDPMPipeline.from_pretrained("google/ddpm-cat-256")
+
+        >>> # run pipeline in inference (sample random noise and denoise)
+        >>> image = pipe().images[0]
+
+        >>> # save image
+        >>> image.save("ddpm_generated_image.png")
+        ```
+
+        Returns:
+            [`~pipelines.ImagePipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is
+                returned where the first element is a list with the generated images
+        """
+        # Sample gaussian noise to begin loop
+        if isinstance(self.unet.config.sample_size, int):
+            image_shape = (
+                batch_size,
+                self.unet.config.in_channels,
+                self.unet.config.sample_size,
+                self.unet.config.sample_size,
+            )
+        else:
+            image_shape = (batch_size, self.unet.config.in_channels, *self.unet.config.sample_size)
+
+        if self.device.type == "mps":
+            # randn does not work reproducibly on mps
+            image = randn_tensor(image_shape, generator=generator)
+            image = image.to(self.device)
+        else:
+            image = randn_tensor(image_shape, generator=generator, device=self.device)
+
+        # set step values
+        self.scheduler.set_timesteps(num_inference_steps)
+
+        for t in self.progress_bar(self.scheduler.timesteps):
+            with torch.enable_grad():
+                # 1. predict noise model_output
+                image = image.requires_grad_()
+                model_output = self.unet(image, t).sample
+
+                # 2. compute previous image x'_{t-1} and original prediction x0_{t}
+                scheduler_out = self.scheduler.step(model_output, t, image, generator=generator)
+                image_pred, origi_pred = scheduler_out.prev_sample, scheduler_out.pred_original_sample
+
+                # 3. compute y'_t = f(x0_{t})
+                measurement_pred = operator(origi_pred)
+
+                # 4. compute loss = d(y, y'_t-1)
+                loss = loss_fn(measurement, measurement_pred)
+                loss.backward()
+
+                print("distance: {0:.4f}".format(loss.item()))
+
+                with torch.no_grad():
+                    image_pred = image_pred - zeta * image.grad
+                    image = image_pred.detach()
+
+        image = (image / 2 + 0.5).clamp(0, 1)
+        image = image.cpu().permute(0, 2, 3, 1).numpy()
+        if output_type == "pil":
+            image = self.numpy_to_pil(image)
+
+        if not return_dict:
+            return (image,)
+
+        return ImagePipelineOutput(images=image)
+
+
+if __name__ == "__main__":
+    import scipy
+    from torch import nn
+    from torchvision.utils import save_image
+
+    # defining the operators f(.) of y = f(x)
+    # super-resolution operator
+    class SuperResolutionOperator(nn.Module):
+        def __init__(self, in_shape, scale_factor):
+            super().__init__()
+
+            # Resizer local class, do not use outiside the SR operator class
+            class Resizer(nn.Module):
+                def __init__(self, in_shape, scale_factor=None, output_shape=None, kernel=None, antialiasing=True):
+                    super(Resizer, self).__init__()
+
+                    # First standardize values and fill missing arguments (if needed) by deriving scale from output shape or vice versa
+                    scale_factor, output_shape = self.fix_scale_and_size(in_shape, output_shape, scale_factor)
+
+                    # Choose interpolation method, each method has the matching kernel size
+                    def cubic(x):
+                        absx = np.abs(x)
+                        absx2 = absx**2
+                        absx3 = absx**3
+                        return (1.5 * absx3 - 2.5 * absx2 + 1) * (absx <= 1) + (
+                            -0.5 * absx3 + 2.5 * absx2 - 4 * absx + 2
+                        ) * ((1 < absx) & (absx <= 2))
+
+                    def lanczos2(x):
+                        return (
+                            (np.sin(pi * x) * np.sin(pi * x / 2) + np.finfo(np.float32).eps)
+                            / ((pi**2 * x**2 / 2) + np.finfo(np.float32).eps)
+                        ) * (abs(x) < 2)
+
+                    def box(x):
+                        return ((-0.5 <= x) & (x < 0.5)) * 1.0
+
+                    def lanczos3(x):
+                        return (
+                            (np.sin(pi * x) * np.sin(pi * x / 3) + np.finfo(np.float32).eps)
+                            / ((pi**2 * x**2 / 3) + np.finfo(np.float32).eps)
+                        ) * (abs(x) < 3)
+
+                    def linear(x):
+                        return (x + 1) * ((-1 <= x) & (x < 0)) + (1 - x) * ((0 <= x) & (x <= 1))
+
+                    method, kernel_width = {
+                        "cubic": (cubic, 4.0),
+                        "lanczos2": (lanczos2, 4.0),
+                        "lanczos3": (lanczos3, 6.0),
+                        "box": (box, 1.0),
+                        "linear": (linear, 2.0),
+                        None: (cubic, 4.0),  # set default interpolation method as cubic
+                    }.get(kernel)
+
+                    # Antialiasing is only used when downscaling
+                    antialiasing *= np.any(np.array(scale_factor) < 1)
+
+                    # Sort indices of dimensions according to scale of each dimension. since we are going dim by dim this is efficient
+                    sorted_dims = np.argsort(np.array(scale_factor))
+                    self.sorted_dims = [int(dim) for dim in sorted_dims if scale_factor[dim] != 1]
+
+                    # Iterate over dimensions to calculate local weights for resizing and resize each time in one direction
+                    field_of_view_list = []
+                    weights_list = []
+                    for dim in self.sorted_dims:
+                        # for each coordinate (along 1 dim), calculate which coordinates in the input image affect its result and the
+                        # weights that multiply the values there to get its result.
+                        weights, field_of_view = self.contributions(
+                            in_shape[dim], output_shape[dim], scale_factor[dim], method, kernel_width, antialiasing
+                        )
+
+                        # convert to torch tensor
+                        weights = torch.tensor(weights.T, dtype=torch.float32)
+
+                        # We add singleton dimensions to the weight matrix so we can multiply it with the big tensor we get for
+                        # tmp_im[field_of_view.T], (bsxfun style)
+                        weights_list.append(
+                            nn.Parameter(
+                                torch.reshape(weights, list(weights.shape) + (len(scale_factor) - 1) * [1]),
+                                requires_grad=False,
+                            )
+                        )
+                        field_of_view_list.append(
+                            nn.Parameter(
+                                torch.tensor(field_of_view.T.astype(np.int32), dtype=torch.long), requires_grad=False
+                            )
+                        )
+
+                    self.field_of_view = nn.ParameterList(field_of_view_list)
+                    self.weights = nn.ParameterList(weights_list)
+
+                def forward(self, in_tensor):
+                    x = in_tensor
+
+                    # Use the affecting position values and the set of weights to calculate the result of resizing along this 1 dim
+                    for dim, fov, w in zip(self.sorted_dims, self.field_of_view, self.weights):
+                        # To be able to act on each dim, we swap so that dim 0 is the wanted dim to resize
+                        x = torch.transpose(x, dim, 0)
+
+                        # This is a bit of a complicated multiplication: x[field_of_view.T] is a tensor of order image_dims+1.
+                        # for each pixel in the output-image it matches the positions the influence it from the input image (along 1 dim
+                        # only, this is why it only adds 1 dim to 5the shape). We then multiply, for each pixel, its set of positions with
+                        # the matching set of weights. we do this by this big tensor element-wise multiplication (MATLAB bsxfun style:
+                        # matching dims are multiplied element-wise while singletons mean that the matching dim is all multiplied by the
+                        # same number
+                        x = torch.sum(x[fov] * w, dim=0)
+
+                        # Finally we swap back the axes to the original order
+                        x = torch.transpose(x, dim, 0)
+
+                    return x
+
+                def fix_scale_and_size(self, input_shape, output_shape, scale_factor):
+                    # First fixing the scale-factor (if given) to be standardized the function expects (a list of scale factors in the
+                    # same size as the number of input dimensions)
+                    if scale_factor is not None:
+                        # By default, if scale-factor is a scalar we assume 2d resizing and duplicate it.
+                        if np.isscalar(scale_factor) and len(input_shape) > 1:
+                            scale_factor = [scale_factor, scale_factor]
+
+                        # We extend the size of scale-factor list to the size of the input by assigning 1 to all the unspecified scales
+                        scale_factor = list(scale_factor)
+                        scale_factor = [1] * (len(input_shape) - len(scale_factor)) + scale_factor
+
+                    # Fixing output-shape (if given): extending it to the size of the input-shape, by assigning the original input-size
+                    # to all the unspecified dimensions
+                    if output_shape is not None:
+                        output_shape = list(input_shape[len(output_shape) :]) + list(np.uint(np.array(output_shape)))
+
+                    # Dealing with the case of non-give scale-factor, calculating according to output-shape. note that this is
+                    # sub-optimal, because there can be different scales to the same output-shape.
+                    if scale_factor is None:
+                        scale_factor = 1.0 * np.array(output_shape) / np.array(input_shape)
+
+                    # Dealing with missing output-shape. calculating according to scale-factor
+                    if output_shape is None:
+                        output_shape = np.uint(np.ceil(np.array(input_shape) * np.array(scale_factor)))
+
+                    return scale_factor, output_shape
+
+                def contributions(self, in_length, out_length, scale, kernel, kernel_width, antialiasing):
+                    # This function calculates a set of 'filters' and a set of field_of_view that will later on be applied
+                    # such that each position from the field_of_view will be multiplied with a matching filter from the
+                    # 'weights' based on the interpolation method and the distance of the sub-pixel location from the pixel centers
+                    # around it. This is only done for one dimension of the image.
+
+                    # When anti-aliasing is activated (default and only for downscaling) the receptive field is stretched to size of
+                    # 1/sf. this means filtering is more 'low-pass filter'.
+                    fixed_kernel = (lambda arg: scale * kernel(scale * arg)) if antialiasing else kernel
+                    kernel_width *= 1.0 / scale if antialiasing else 1.0
+
+                    # These are the coordinates of the output image
+                    out_coordinates = np.arange(1, out_length + 1)
+
+                    # since both scale-factor and output size can be provided simulatneously, perserving the center of the image requires shifting
+                    # the output coordinates. the deviation is because out_length doesn't necesary equal in_length*scale.
+                    # to keep the center we need to subtract half of this deivation so that we get equal margins for boths sides and center is preserved.
+                    shifted_out_coordinates = out_coordinates - (out_length - in_length * scale) / 2
+
+                    # These are the matching positions of the output-coordinates on the input image coordinates.
+                    # Best explained by example: say we have 4 horizontal pixels for HR and we downscale by SF=2 and get 2 pixels:
+                    # [1,2,3,4] -> [1,2]. Remember each pixel number is the middle of the pixel.
+                    # The scaling is done between the distances and not pixel numbers (the right boundary of pixel 4 is transformed to
+                    # the right boundary of pixel 2. pixel 1 in the small image matches the boundary between pixels 1 and 2 in the big
+                    # one and not to pixel 2. This means the position is not just multiplication of the old pos by scale-factor).
+                    # So if we measure distance from the left border, middle of pixel 1 is at distance d=0.5, border between 1 and 2 is
+                    # at d=1, and so on (d = p - 0.5).  we calculate (d_new = d_old / sf) which means:
+                    # (p_new-0.5 = (p_old-0.5) / sf)     ->          p_new = p_old/sf + 0.5 * (1-1/sf)
+                    match_coordinates = shifted_out_coordinates / scale + 0.5 * (1 - 1 / scale)
+
+                    # This is the left boundary to start multiplying the filter from, it depends on the size of the filter
+                    left_boundary = np.floor(match_coordinates - kernel_width / 2)
+
+                    # Kernel width needs to be enlarged because when covering has sub-pixel borders, it must 'see' the pixel centers
+                    # of the pixels it only covered a part from. So we add one pixel at each side to consider (weights can zeroize them)
+                    expanded_kernel_width = np.ceil(kernel_width) + 2
+
+                    # Determine a set of field_of_view for each each output position, these are the pixels in the input image
+                    # that the pixel in the output image 'sees'. We get a matrix whos horizontal dim is the output pixels (big) and the
+                    # vertical dim is the pixels it 'sees' (kernel_size + 2)
+                    field_of_view = np.squeeze(
+                        np.int16(np.expand_dims(left_boundary, axis=1) + np.arange(expanded_kernel_width) - 1)
+                    )
+
+                    # Assign weight to each pixel in the field of view. A matrix whos horizontal dim is the output pixels and the
+                    # vertical dim is a list of weights matching to the pixel in the field of view (that are specified in
+                    # 'field_of_view')
+                    weights = fixed_kernel(1.0 * np.expand_dims(match_coordinates, axis=1) - field_of_view - 1)
+
+                    # Normalize weights to sum up to 1. be careful from dividing by 0
+                    sum_weights = np.sum(weights, axis=1)
+                    sum_weights[sum_weights == 0] = 1.0
+                    weights = 1.0 * weights / np.expand_dims(sum_weights, axis=1)
+
+                    # We use this mirror structure as a trick for reflection padding at the boundaries
+                    mirror = np.uint(np.concatenate((np.arange(in_length), np.arange(in_length - 1, -1, step=-1))))
+                    field_of_view = mirror[np.mod(field_of_view, mirror.shape[0])]
+
+                    # Get rid of  weights and pixel positions that are of zero weight
+                    non_zero_out_pixels = np.nonzero(np.any(weights, axis=0))
+                    weights = np.squeeze(weights[:, non_zero_out_pixels])
+                    field_of_view = np.squeeze(field_of_view[:, non_zero_out_pixels])
+
+                    # Final products are the relative positions and the matching weights, both are output_size X fixed_kernel_size
+                    return weights, field_of_view
+
+            self.down_sample = Resizer(in_shape, 1 / scale_factor)
+            for param in self.parameters():
+                param.requires_grad = False
+
+        def forward(self, data, **kwargs):
+            return self.down_sample(data)
+
+    # Gaussian blurring operator
+    class GaussialBlurOperator(nn.Module):
+        def __init__(self, kernel_size, intensity):
+            super().__init__()
+
+            class Blurkernel(nn.Module):
+                def __init__(self, blur_type="gaussian", kernel_size=31, std=3.0):
+                    super().__init__()
+                    self.blur_type = blur_type
+                    self.kernel_size = kernel_size
+                    self.std = std
+                    self.seq = nn.Sequential(
+                        nn.ReflectionPad2d(self.kernel_size // 2),
+                        nn.Conv2d(3, 3, self.kernel_size, stride=1, padding=0, bias=False, groups=3),
+                    )
+                    self.weights_init()
+
+                def forward(self, x):
+                    return self.seq(x)
+
+                def weights_init(self):
+                    if self.blur_type == "gaussian":
+                        n = np.zeros((self.kernel_size, self.kernel_size))
+                        n[self.kernel_size // 2, self.kernel_size // 2] = 1
+                        k = scipy.ndimage.gaussian_filter(n, sigma=self.std)
+                        k = torch.from_numpy(k)
+                        self.k = k
+                        for name, f in self.named_parameters():
+                            f.data.copy_(k)
+
+                def update_weights(self, k):
+                    if not torch.is_tensor(k):
+                        k = torch.from_numpy(k)
+                    for name, f in self.named_parameters():
+                        f.data.copy_(k)
+
+                def get_kernel(self):
+                    return self.k
+
+            self.kernel_size = kernel_size
+            self.conv = Blurkernel(blur_type="gaussian", kernel_size=kernel_size, std=intensity)
+            self.kernel = self.conv.get_kernel()
+            self.conv.update_weights(self.kernel.type(torch.float32))
+
+            for param in self.parameters():
+                param.requires_grad = False
+
+        def forward(self, data, **kwargs):
+            return self.conv(data)
+
+        def transpose(self, data, **kwargs):
+            return data
+
+        def get_kernel(self):
+            return self.kernel.view(1, 1, self.kernel_size, self.kernel_size)
+
+    # assuming the forward process y = f(x) is polluted by Gaussian noise, use l2 norm
+    def RMSELoss(yhat, y):
+        return torch.sqrt(torch.sum((yhat - y) ** 2))
+
+    # set up source image
+    src = Image.open("sample.png")
+    # read image into [1,3,H,W]
+    src = torch.from_numpy(np.array(src, dtype=np.float32)).permute(2, 0, 1)[None]
+    # normalize image to [-1,1]
+    src = (src / 127.5) - 1.0
+    src = src.to("cuda")
+
+    # set up operator and measurement
+    # operator = SuperResolutionOperator(in_shape=src.shape, scale_factor=4).to("cuda")
+    operator = GaussialBlurOperator(kernel_size=61, intensity=3.0).to("cuda")
+    measurement = operator(src)
+
+    # set up scheduler
+    scheduler = DDPMScheduler.from_pretrained("google/ddpm-celebahq-256")
+    scheduler.set_timesteps(1000)
+
+    # set up model
+    model = UNet2DModel.from_pretrained("google/ddpm-celebahq-256").to("cuda")
+
+    save_image((src + 1.0) / 2.0, "dps_src.png")
+    save_image((measurement + 1.0) / 2.0, "dps_mea.png")
+
+    # finally, the pipeline
+    dpspipe = DPSPipeline(model, scheduler)
+    image = dpspipe(
+        measurement=measurement,
+        operator=operator,
+        loss_fn=RMSELoss,
+        zeta=1.0,
+    ).images[0]
+
+    image.save("dps_generated_image.png")
--- a/examples/community/llm_grounded_diffusion.py
+++ b/examples/community/llm_grounded_diffusion.py
@@ -16,6 +16,7 @@

 import ast
 import gc
+import inspect
 import math
 import warnings
 from collections.abc import Iterable
@@ -23,16 +24,29 @@ from typing import Any, Callable, Dict, List, Optional, Union

 import torch
 import torch.nn.functional as F
-from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
+from packaging import version
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection

+from diffusers.configuration_utils import FrozenDict
+from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
+from diffusers.loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
 from diffusers.models import AutoencoderKL, UNet2DConditionModel
 from diffusers.models.attention import Attention, GatedSelfAttentionDense
 from diffusers.models.attention_processor import AttnProcessor2_0
-from diffusers.pipelines.stable_diffusion import StableDiffusionPipeline
+from diffusers.models.lora import adjust_lora_scale_text_encoder
+from diffusers.pipelines import DiffusionPipeline
 from diffusers.pipelines.stable_diffusion.pipeline_output import StableDiffusionPipelineOutput
 from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
 from diffusers.schedulers import KarrasDiffusionSchedulers
-from diffusers.utils import logging, replace_example_docstring
+from diffusers.utils import (
+    USE_PEFT_BACKEND,
+    deprecate,
+    logging,
+    replace_example_docstring,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from diffusers.utils.torch_utils import randn_tensor


 EXAMPLE_DOC_STRING = """
@@ -44,6 +58,7 @@ EXAMPLE_DOC_STRING = """
        >>> pipe = DiffusionPipeline.from_pretrained(
        ...     "longlian/lmd_plus",
        ...     custom_pipeline="llm_grounded_diffusion",
+        ...     custom_revision="main",
        ...     variant="fp16", torch_dtype=torch.float16
        ... )
        >>> pipe.enable_model_cpu_offload()
@@ -96,7 +111,12 @@ logger = logging.get_logger(__name__)  # pylint: disable=invalid-name

 # All keys in Stable Diffusion models: [('down', 0, 0, 0), ('down', 0, 1, 0), ('down', 1, 0, 0), ('down', 1, 1, 0), ('down', 2, 0, 0), ('down', 2, 1, 0), ('mid', 0, 0, 0), ('up', 1, 0, 0), ('up', 1, 1, 0), ('up', 1, 2, 0), ('up', 2, 0, 0), ('up', 2, 1, 0), ('up', 2, 2, 0), ('up', 3, 0, 0), ('up', 3, 1, 0), ('up', 3, 2, 0)]
 # Note that the first up block is `UpBlock2D` rather than `CrossAttnUpBlock2D` and does not have attention. The last index is always 0 in our case since we have one `BasicTransformerBlock` in each `Transformer2DModel`.
-DEFAULT_GUIDANCE_ATTN_KEYS = [("mid", 0, 0, 0), ("up", 1, 0, 0), ("up", 1, 1, 0), ("up", 1, 2, 0)]
+DEFAULT_GUIDANCE_ATTN_KEYS = [
+    ("mid", 0, 0, 0),
+    ("up", 1, 0, 0),
+    ("up", 1, 1, 0),
+    ("up", 1, 2, 0),
+]


 def convert_attn_keys(key):
@@ -126,7 +146,15 @@ def scale_proportion(obj_box, H, W):

 # Adapted from the parent class `AttnProcessor2_0`
 class AttnProcessorWithHook(AttnProcessor2_0):
-    def __init__(self, attn_processor_key, hidden_size, cross_attention_dim, hook=None, fast_attn=True, enabled=True):
+    def __init__(
+        self,
+        attn_processor_key,
+        hidden_size,
+        cross_attention_dim,
+        hook=None,
+        fast_attn=True,
+        enabled=True,
+    ):
        super().__init__()
        self.attn_processor_key = attn_processor_key
        self.hidden_size = hidden_size
@@ -165,15 +193,16 @@ class AttnProcessorWithHook(AttnProcessor2_0):
        if attn.group_norm is not None:
            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)

-        query = attn.to_q(hidden_states, scale=scale)
+        args = () if USE_PEFT_BACKEND else (scale,)
+        query = attn.to_q(hidden_states, *args)

        if encoder_hidden_states is None:
            encoder_hidden_states = hidden_states
        elif attn.norm_cross:
            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)

-        key = attn.to_k(encoder_hidden_states, scale=scale)
-        value = attn.to_v(encoder_hidden_states, scale=scale)
+        key = attn.to_k(encoder_hidden_states, *args)
+        value = attn.to_v(encoder_hidden_states, *args)

        inner_dim = key.shape[-1]
        head_dim = inner_dim // attn.heads
@@ -186,7 +215,13 @@ class AttnProcessorWithHook(AttnProcessor2_0):

        if self.hook is not None and self.enabled:
            # Call the hook with query, key, value, and attention maps
-            self.hook(self.attn_processor_key, query_batch_dim, key_batch_dim, value_batch_dim, attention_probs)
+            self.hook(
+                self.attn_processor_key,
+                query_batch_dim,
+                key_batch_dim,
+                value_batch_dim,
+                attention_probs,
+            )

        if self.fast_attn:
            query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
@@ -202,7 +237,12 @@ class AttnProcessorWithHook(AttnProcessor2_0):
            # the output of sdp = (batch, num_heads, seq_len, head_dim)
            # TODO: add support for attn.scale when we move to Torch 2.1
            hidden_states = F.scaled_dot_product_attention(
-                query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+                query,
+                key,
+                value,
+                attn_mask=attention_mask,
+                dropout_p=0.0,
+                is_causal=False,
            )
            hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
            hidden_states = hidden_states.to(query.dtype)
@@ -211,7 +251,7 @@ class AttnProcessorWithHook(AttnProcessor2_0):
            hidden_states = attn.batch_to_head_dim(hidden_states)

        # linear proj
-        hidden_states = attn.to_out[0](hidden_states, scale=scale)
+        hidden_states = attn.to_out[0](hidden_states, *args)
        # dropout
        hidden_states = attn.to_out[1](hidden_states)

@@ -226,7 +266,9 @@ class AttnProcessorWithHook(AttnProcessor2_0):
        return hidden_states


-class LLMGroundedDiffusionPipeline(StableDiffusionPipeline):
+class LLMGroundedDiffusionPipeline(
+    DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, IPAdapterMixin, FromSingleFileMixin
+):
    r"""
    Pipeline for layout-grounded text-to-image generation using LLM-grounded Diffusion (LMD+): https://arxiv.org/pdf/2305.13655.pdf.

@@ -257,6 +299,11 @@ class LLMGroundedDiffusionPipeline(StableDiffusionPipeline):
            Whether a safety checker is needed for this pipeline.
    """

+    model_cpu_offload_seq = "text_encoder->unet->vae"
+    _optional_components = ["safety_checker", "feature_extractor", "image_encoder"]
+    _exclude_from_cpu_offload = ["safety_checker"]
+    _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
+
    objects_text = "Objects: "
    bg_prompt_text = "Background prompt: "
    bg_prompt_text_no_trailing_space = bg_prompt_text.rstrip()
@@ -272,12 +319,91 @@ class LLMGroundedDiffusionPipeline(StableDiffusionPipeline):
        scheduler: KarrasDiffusionSchedulers,
        safety_checker: StableDiffusionSafetyChecker,
        feature_extractor: CLIPImageProcessor,
+        image_encoder: CLIPVisionModelWithProjection = None,
        requires_safety_checker: bool = True,
    ):
-        super().__init__(
-            vae, text_encoder, tokenizer, unet, scheduler, safety_checker, feature_extractor, requires_safety_checker
-        )
+        # This is copied from StableDiffusionPipeline, with hook initizations for LMD+.
+        super().__init__()

+        if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
+                f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
+                "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
+                " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
+                " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
+                " file"
+            )
+            deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["steps_offset"] = 1
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
+                " `clip_sample` should be set to False in the configuration file. Please make sure to update the"
+                " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in"
+                " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
+                " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
+            )
+            deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["clip_sample"] = False
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if safety_checker is None and requires_safety_checker:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        if safety_checker is not None and feature_extractor is None:
+            raise ValueError(
+                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+            )
+
+        is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse(
+            version.parse(unet.config._diffusers_version).base_version
+        ) < version.parse("0.9.0.dev0")
+        is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
+        if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
+            deprecation_message = (
+                "The configuration file of the unet has set the default `sample_size` to smaller than"
+                " 64 which seems highly unlikely. If your checkpoint is a fine-tuned version of any of the"
+                " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
+                " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5"
+                " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
+                " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
+                " in the config might lead to incorrect results in future versions. If you have downloaded this"
+                " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
+                " the `unet/config.json` file"
+            )
+            deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(unet.config)
+            new_config["sample_size"] = 64
+            unet._internal_dict = FrozenDict(new_config)
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+            image_encoder=image_encoder,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
+
+        # Initialize the attention hooks for LLM-grounded Diffusion
        self.register_attn_hooks(unet)
        self._saved_attn = None

@@ -464,7 +590,14 @@ class LLMGroundedDiffusionPipeline(StableDiffusionPipeline):

        return token_map

-    def get_phrase_indices(self, prompt, phrases, token_map=None, add_suffix_if_not_found=False, verbose=False):
+    def get_phrase_indices(
+        self,
+        prompt,
+        phrases,
+        token_map=None,
+        add_suffix_if_not_found=False,
+        verbose=False,
+    ):
        for obj in phrases:
            # Suffix the prompt with object name for attention guidance if object is not in the prompt, using "|" to separate the prompt and the suffix
            if obj not in prompt:
@@ -485,7 +618,14 @@ class LLMGroundedDiffusionPipeline(StableDiffusionPipeline):
            phrase_token_map_str = " ".join(phrase_token_map)

            if verbose:
-                logger.info("Full str:", token_map_str, "Substr:", phrase_token_map_str, "Phrase:", phrases)
+                logger.info(
+                    "Full str:",
+                    token_map_str,
+                    "Substr:",
+                    phrase_token_map_str,
+                    "Phrase:",
+                    phrases,
+                )

            # Count the number of token before substr
            # The substring comes with a trailing space that needs to be removed by minus one in the index.
@@ -552,7 +692,15 @@ class LLMGroundedDiffusionPipeline(StableDiffusionPipeline):

        return loss

-    def compute_ca_loss(self, saved_attn, bboxes, phrase_indices, guidance_attn_keys, verbose=False, **kwargs):
+    def compute_ca_loss(
+        self,
+        saved_attn,
+        bboxes,
+        phrase_indices,
+        guidance_attn_keys,
+        verbose=False,
+        **kwargs,
+    ):
        """
        The `saved_attn` is supposed to be passed to `save_attn_to_dict` in `cross_attention_kwargs` prior to computing ths loss.
        `AttnProcessor` will put attention maps into the `save_attn_to_dict`.
@@ -605,6 +753,7 @@ class LLMGroundedDiffusionPipeline(StableDiffusionPipeline):
        latents: Optional[torch.FloatTensor] = None,
        prompt_embeds: Optional[torch.FloatTensor] = None,
        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
@@ -662,6 +811,7 @@ class LLMGroundedDiffusionPipeline(StableDiffusionPipeline):
            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
+            ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
            return_dict (`bool`, *optional*, defaults to `True`):
@@ -724,9 +874,10 @@ class LLMGroundedDiffusionPipeline(StableDiffusionPipeline):
                phrase_indices = []
                prompt_parsed = []
                for prompt_item in prompt:
-                    phrase_indices_parsed_item, prompt_parsed_item = self.get_phrase_indices(
-                        prompt_item, add_suffix_if_not_found=True
-                    )
+                    (
+                        phrase_indices_parsed_item,
+                        prompt_parsed_item,
+                    ) = self.get_phrase_indices(prompt_item, add_suffix_if_not_found=True)
                    phrase_indices.append(phrase_indices_parsed_item)
                    prompt_parsed.append(prompt_parsed_item)
                prompt = prompt_parsed
@@ -759,6 +910,11 @@ class LLMGroundedDiffusionPipeline(StableDiffusionPipeline):
        if do_classifier_free_guidance:
            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])

+        if ip_adapter_image is not None:
+            image_embeds, negative_image_embeds = self.encode_image(ip_adapter_image, device, num_images_per_prompt)
+            if self.do_classifier_free_guidance:
+                image_embeds = torch.cat([negative_image_embeds, image_embeds])
+
        # 4. Prepare timesteps
        self.scheduler.set_timesteps(num_inference_steps, device=device)
        timesteps = self.scheduler.timesteps
@@ -801,7 +957,10 @@ class LLMGroundedDiffusionPipeline(StableDiffusionPipeline):
        if n_objs:
            cond_boxes[:n_objs] = torch.tensor(boxes)
        text_embeddings = torch.zeros(
-            max_objs, self.unet.config.cross_attention_dim, device=device, dtype=self.text_encoder.dtype
+            max_objs,
+            self.unet.config.cross_attention_dim,
+            device=device,
+            dtype=self.text_encoder.dtype,
        )
        if n_objs:
            text_embeddings[:n_objs] = _text_embeddings
@@ -833,6 +992,9 @@ class LLMGroundedDiffusionPipeline(StableDiffusionPipeline):
        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

+        # 6.1 Add image embeds for IP-Adapter
+        added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None
+
        loss_attn = torch.tensor(10000.0)

        # 7. Denoising loop
@@ -869,6 +1031,7 @@ class LLMGroundedDiffusionPipeline(StableDiffusionPipeline):
                    t,
                    encoder_hidden_states=prompt_embeds,
                    cross_attention_kwargs=cross_attention_kwargs,
+                    added_cond_kwargs=added_cond_kwargs,
                ).sample

                # perform guidance
@@ -1013,3 +1176,438 @@ class LLMGroundedDiffusionPipeline(StableDiffusionPipeline):
                self.enable_attn_hook(enabled=False)

        return latents, loss
+
+    # Below are methods copied from StableDiffusionPipeline
+    # The design choice of not inheriting from StableDiffusionPipeline is discussed here: https://github.com/huggingface/diffusers/pull/5993#issuecomment-1834258517
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
+    def enable_vae_slicing(self):
+        r"""
+        Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
+        compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
+        """
+        self.vae.enable_slicing()
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing
+    def disable_vae_slicing(self):
+        r"""
+        Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
+        computing decoding in one step.
+        """
+        self.vae.disable_slicing()
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling
+    def enable_vae_tiling(self):
+        r"""
+        Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
+        compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
+        processing larger images.
+        """
+        self.vae.enable_tiling()
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling
+    def disable_vae_tiling(self):
+        r"""
+        Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
+        computing decoding in one step.
+        """
+        self.vae.disable_tiling()
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
+    def _encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        lora_scale: Optional[float] = None,
+        **kwargs,
+    ):
+        deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple."
+        deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False)
+
+        prompt_embeds_tuple = self.encode_prompt(
+            prompt=prompt,
+            device=device,
+            num_images_per_prompt=num_images_per_prompt,
+            do_classifier_free_guidance=do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            lora_scale=lora_scale,
+            **kwargs,
+        )
+
+        # concatenate for backwards comp
+        prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]])
+
+        return prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            lora_scale (`float`, *optional*):
+                A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        """
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, LoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if not USE_PEFT_BACKEND:
+                adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+            else:
+                scale_lora_layers(self.text_encoder, lora_scale)
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            # textual inversion: procecss multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
+
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(
+                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+                )
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = text_inputs.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            if clip_skip is None:
+                prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
+                prompt_embeds = prompt_embeds[0]
+            else:
+                prompt_embeds = self.text_encoder(
+                    text_input_ids.to(device),
+                    attention_mask=attention_mask,
+                    output_hidden_states=True,
+                )
+                # Access the `hidden_states` first, that contains a tuple of
+                # all the hidden states from the encoder layers. Then index into
+                # the tuple to access the hidden states from the desired layer.
+                prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
+                # We also need to apply the final LayerNorm here to not mess with the
+                # representations. The `last_hidden_states` that we typically use for
+                # obtaining the final prompt representations passes through the LayerNorm
+                # layer.
+                prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
+
+        if self.text_encoder is not None:
+            prompt_embeds_dtype = self.text_encoder.dtype
+        elif self.unet is not None:
+            prompt_embeds_dtype = self.unet.dtype
+        else:
+            prompt_embeds_dtype = prompt_embeds.dtype
+
+        prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            # textual inversion: procecss multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
+
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = uncond_input.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            negative_prompt_embeds = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            negative_prompt_embeds = negative_prompt_embeds[0]
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+            # Retrieve the original scale by scaling back the LoRA layers
+            unscale_lora_layers(self.text_encoder, lora_scale)
+
+        return prompt_embeds, negative_prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
+    def encode_image(self, image, device, num_images_per_prompt):
+        dtype = next(self.image_encoder.parameters()).dtype
+
+        if not isinstance(image, torch.Tensor):
+            image = self.feature_extractor(image, return_tensors="pt").pixel_values
+
+        image = image.to(device=device, dtype=dtype)
+        image_embeds = self.image_encoder(image).image_embeds
+        image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+
+        uncond_image_embeds = torch.zeros_like(image_embeds)
+        return image_embeds, uncond_image_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
+    def run_safety_checker(self, image, device, dtype):
+        if self.safety_checker is None:
+            has_nsfw_concept = None
+        else:
+            if torch.is_tensor(image):
+                feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+            else:
+                feature_extractor_input = self.image_processor.numpy_to_pil(image)
+            safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+            )
+        return image, has_nsfw_concept
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
+    def decode_latents(self, latents):
+        deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead"
+        deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False)
+
+        latents = 1 / self.vae.config.scaling_factor * latents
+        image = self.vae.decode(latents, return_dict=False)[0]
+        image = (image / 2 + 0.5).clamp(0, 1)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+        return image
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
+    def prepare_latents(
+        self,
+        batch_size,
+        num_channels_latents,
+        height,
+        width,
+        dtype,
+        device,
+        generator,
+        latents=None,
+    ):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            height // self.vae_scale_factor,
+            width // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu
+    def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
+        r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497.
+
+        The suffixes after the scaling factors represent the stages where they are being applied.
+
+        Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values
+        that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
+
+        Args:
+            s1 (`float`):
+                Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
+                mitigate "oversmoothing effect" in the enhanced denoising process.
+            s2 (`float`):
+                Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
+                mitigate "oversmoothing effect" in the enhanced denoising process.
+            b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
+            b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
+        """
+        if not hasattr(self, "unet"):
+            raise ValueError("The pipeline must have `unet` for using FreeU.")
+        self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2)
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling
+    def disable_freeu(self):
+        """Disables the FreeU mechanism if enabled."""
+        self.unet.disable_freeu()
+
+    # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
+    def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32):
+        """
+        See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
+
+        Args:
+            timesteps (`torch.Tensor`):
+                generate embedding vectors at these timesteps
+            embedding_dim (`int`, *optional*, defaults to 512):
+                dimension of the embeddings to generate
+            dtype:
+                data type of the generated embeddings
+
+        Returns:
+            `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)`
+        """
+        assert len(w.shape) == 1
+        w = w * 1000.0
+
+        half_dim = embedding_dim // 2
+        emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
+        emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
+        emb = w.to(dtype)[:, None] * emb[None, :]
+        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
+        if embedding_dim % 2 == 1:  # zero pad
+            emb = torch.nn.functional.pad(emb, (0, 1))
+        assert emb.shape == (w.shape[0], embedding_dim)
+        return emb
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.guidance_scale
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.guidance_rescale
+    @property
+    def guidance_rescale(self):
+        return self._guidance_rescale
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.clip_skip
+    @property
+    def clip_skip(self):
+        return self._clip_skip
+
+    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+    # corresponds to doing no classifier free guidance.
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.do_classifier_free_guidance
+    @property
+    def do_classifier_free_guidance(self):
+        return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.cross_attention_kwargs
+    @property
+    def cross_attention_kwargs(self):
+        return self._cross_attention_kwargs
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.num_timesteps
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
--- a/examples/community/lpw_stable_diffusion_xl.py
+++ b/examples/community/lpw_stable_diffusion_xl.py
@@ -11,10 +11,11 @@ import os
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union

 import torch
+from PIL import Image
 from transformers import CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer

 from diffusers import DiffusionPipeline, StableDiffusionXLPipeline
-from diffusers.image_processor import VaeImageProcessor
+from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
 from diffusers.loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin
 from diffusers.models import AutoencoderKL, UNet2DConditionModel
 from diffusers.models.attention_processor import (
@@ -23,7 +24,7 @@ from diffusers.models.attention_processor import (
    LoRAXFormersAttnProcessor,
    XFormersAttnProcessor,
 )
-from diffusers.pipelines.stable_diffusion_xl import StableDiffusionXLPipelineOutput
+from diffusers.pipelines.stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput
 from diffusers.schedulers import KarrasDiffusionSchedulers
 from diffusers.utils import (
    is_accelerate_available,
@@ -250,6 +251,7 @@ def get_weighted_text_embeddings_sdxl(
    neg_prompt: str = "",
    neg_prompt_2: str = None,
    num_images_per_prompt: int = 1,
+    device: Optional[torch.device] = None,
 ):
    """
    This function can process long prompt with weights, no length limitation
@@ -262,10 +264,13 @@ def get_weighted_text_embeddings_sdxl(
        neg_prompt (str)
        neg_prompt_2 (str)
        num_images_per_prompt (int)
+        device (torch.device)
    Returns:
        prompt_embeds (torch.Tensor)
        neg_prompt_embeds (torch.Tensor)
    """
+    device = device or pipe._execution_device
+
    if prompt_2:
        prompt = f"{prompt} {prompt_2}"

@@ -330,17 +335,17 @@ def get_weighted_text_embeddings_sdxl(
    # get prompt embeddings one by one is not working.
    for i in range(len(prompt_token_groups)):
        # get positive prompt embeddings with weights
-        token_tensor = torch.tensor([prompt_token_groups[i]], dtype=torch.long, device=pipe.device)
-        weight_tensor = torch.tensor(prompt_weight_groups[i], dtype=torch.float16, device=pipe.device)
+        token_tensor = torch.tensor([prompt_token_groups[i]], dtype=torch.long, device=device)
+        weight_tensor = torch.tensor(prompt_weight_groups[i], dtype=torch.float16, device=device)

-        token_tensor_2 = torch.tensor([prompt_token_groups_2[i]], dtype=torch.long, device=pipe.device)
+        token_tensor_2 = torch.tensor([prompt_token_groups_2[i]], dtype=torch.long, device=device)

        # use first text encoder
-        prompt_embeds_1 = pipe.text_encoder(token_tensor.to(pipe.device), output_hidden_states=True)
+        prompt_embeds_1 = pipe.text_encoder(token_tensor.to(device), output_hidden_states=True)
        prompt_embeds_1_hidden_states = prompt_embeds_1.hidden_states[-2]

        # use second text encoder
-        prompt_embeds_2 = pipe.text_encoder_2(token_tensor_2.to(pipe.device), output_hidden_states=True)
+        prompt_embeds_2 = pipe.text_encoder_2(token_tensor_2.to(device), output_hidden_states=True)
        prompt_embeds_2_hidden_states = prompt_embeds_2.hidden_states[-2]
        pooled_prompt_embeds = prompt_embeds_2[0]

@@ -357,16 +362,16 @@ def get_weighted_text_embeddings_sdxl(
        embeds.append(token_embedding)

        # get negative prompt embeddings with weights
-        neg_token_tensor = torch.tensor([neg_prompt_token_groups[i]], dtype=torch.long, device=pipe.device)
-        neg_token_tensor_2 = torch.tensor([neg_prompt_token_groups_2[i]], dtype=torch.long, device=pipe.device)
-        neg_weight_tensor = torch.tensor(neg_prompt_weight_groups[i], dtype=torch.float16, device=pipe.device)
+        neg_token_tensor = torch.tensor([neg_prompt_token_groups[i]], dtype=torch.long, device=device)
+        neg_token_tensor_2 = torch.tensor([neg_prompt_token_groups_2[i]], dtype=torch.long, device=device)
+        neg_weight_tensor = torch.tensor(neg_prompt_weight_groups[i], dtype=torch.float16, device=device)

        # use first text encoder
-        neg_prompt_embeds_1 = pipe.text_encoder(neg_token_tensor.to(pipe.device), output_hidden_states=True)
+        neg_prompt_embeds_1 = pipe.text_encoder(neg_token_tensor.to(device), output_hidden_states=True)
        neg_prompt_embeds_1_hidden_states = neg_prompt_embeds_1.hidden_states[-2]

        # use second text encoder
-        neg_prompt_embeds_2 = pipe.text_encoder_2(neg_token_tensor_2.to(pipe.device), output_hidden_states=True)
+        neg_prompt_embeds_2 = pipe.text_encoder_2(neg_token_tensor_2.to(device), output_hidden_states=True)
        neg_prompt_embeds_2_hidden_states = neg_prompt_embeds_2.hidden_states[-2]
        negative_pooled_prompt_embeds = neg_prompt_embeds_2[0]

@@ -457,6 +462,65 @@ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
    return noise_cfg


+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
+def retrieve_latents(
+    encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
+):
+    if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
+        return encoder_output.latent_dist.sample(generator)
+    elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
+        return encoder_output.latent_dist.mode()
+    elif hasattr(encoder_output, "latents"):
+        return encoder_output.latents
+    else:
+        raise AttributeError("Could not access latents of provided encoder_output")
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    timesteps: Optional[List[int]] = None,
+    **kwargs,
+):
+    """
+    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+
+    Args:
+        scheduler (`SchedulerMixin`):
+            The scheduler to get timesteps from.
+        num_inference_steps (`int`):
+            The number of diffusion steps used when generating samples with a pre-trained model. If used,
+            `timesteps` must be `None`.
+        device (`str` or `torch.device`, *optional*):
+            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        timesteps (`List[int]`, *optional*):
+                Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default
+                timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps`
+                must be `None`.
+
+    Returns:
+        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+        second element is the number of inference steps.
+    """
+    if timesteps is not None:
+        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accepts_timesteps:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" timestep schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    else:
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+    return timesteps, num_inference_steps
+
+
 class SDXLLongPromptWeightingPipeline(DiffusionPipeline, FromSingleFileMixin, LoraLoaderMixin):
    r"""
    Pipeline for text-to-image generation using Stable Diffusion XL.
@@ -522,6 +586,9 @@ class SDXLLongPromptWeightingPipeline(DiffusionPipeline, FromSingleFileMixin, Lo
        self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt)
        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        self.mask_processor = VaeImageProcessor(
+            vae_scale_factor=self.vae_scale_factor, do_normalize=False, do_binarize=True, do_convert_grayscale=True
+        )
        self.default_sample_size = self.unet.config.sample_size

        add_watermarker = add_watermarker if add_watermarker is not None else is_invisible_watermark_available()
@@ -809,6 +876,7 @@ class SDXLLongPromptWeightingPipeline(DiffusionPipeline, FromSingleFileMixin, Lo
        prompt_2,
        height,
        width,
+        strength,
        callback_steps,
        negative_prompt=None,
        negative_prompt_2=None,
@@ -820,6 +888,9 @@ class SDXLLongPromptWeightingPipeline(DiffusionPipeline, FromSingleFileMixin, Lo
        if height % 8 != 0 or width % 8 != 0:
            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")

+        if strength < 0 or strength > 1:
+            raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
+
        if (callback_steps is None) or (
            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
        ):
@@ -876,23 +947,263 @@ class SDXLLongPromptWeightingPipeline(DiffusionPipeline, FromSingleFileMixin, Lo
                "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`."
            )

-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
-    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
-        shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
-        if isinstance(generator, list) and len(generator) != batch_size:
-            raise ValueError(
-                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
-                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+    def get_timesteps(self, num_inference_steps, strength, device, denoising_start=None):
+        # get the original timestep using init_timestep
+        if denoising_start is None:
+            init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
+            t_start = max(num_inference_steps - init_timestep, 0)
+        else:
+            t_start = 0
+
+        timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
+
+        # Strength is irrelevant if we directly request a timestep to start at;
+        # that is, strength is determined by the denoising_start instead.
+        if denoising_start is not None:
+            discrete_timestep_cutoff = int(
+                round(
+                    self.scheduler.config.num_train_timesteps
+                    - (denoising_start * self.scheduler.config.num_train_timesteps)
+                )
            )

-        if latents is None:
-            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
-        else:
-            latents = latents.to(device)
+            num_inference_steps = (timesteps < discrete_timestep_cutoff).sum().item()
+            if self.scheduler.order == 2 and num_inference_steps % 2 == 0:
+                # if the scheduler is a 2nd order scheduler we might have to do +1
+                # because `num_inference_steps` might be even given that every timestep
+                # (except the highest one) is duplicated. If `num_inference_steps` is even it would
+                # mean that we cut the timesteps in the middle of the denoising step
+                # (between 1st and 2nd devirative) which leads to incorrect results. By adding 1
+                # we ensure that the denoising process always ends after the 2nd derivate step of the scheduler
+                num_inference_steps = num_inference_steps + 1

-        # scale the initial noise by the standard deviation required by the scheduler
-        latents = latents * self.scheduler.init_noise_sigma
-        return latents
+            # because t_n+1 >= t_n, we slice the timesteps starting from the end
+            timesteps = timesteps[-num_inference_steps:]
+            return timesteps, num_inference_steps
+
+        return timesteps, num_inference_steps - t_start
+
+    def prepare_latents(
+        self,
+        image,
+        mask,
+        width,
+        height,
+        num_channels_latents,
+        timestep,
+        batch_size,
+        num_images_per_prompt,
+        dtype,
+        device,
+        generator=None,
+        add_noise=True,
+        latents=None,
+        is_strength_max=True,
+        return_noise=False,
+        return_image_latents=False,
+    ):
+        batch_size *= num_images_per_prompt
+
+        if image is None:
+            shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
+            if isinstance(generator, list) and len(generator) != batch_size:
+                raise ValueError(
+                    f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                    f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+                )
+
+            if latents is None:
+                latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+            else:
+                latents = latents.to(device)
+
+            # scale the initial noise by the standard deviation required by the scheduler
+            latents = latents * self.scheduler.init_noise_sigma
+            return latents
+
+        elif mask is None:
+            if not isinstance(image, (torch.Tensor, Image.Image, list)):
+                raise ValueError(
+                    f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
+                )
+
+            # Offload text encoder if `enable_model_cpu_offload` was enabled
+            if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
+                self.text_encoder_2.to("cpu")
+                torch.cuda.empty_cache()
+
+            image = image.to(device=device, dtype=dtype)
+
+            if image.shape[1] == 4:
+                init_latents = image
+
+            else:
+                # make sure the VAE is in float32 mode, as it overflows in float16
+                if self.vae.config.force_upcast:
+                    image = image.float()
+                    self.vae.to(dtype=torch.float32)
+
+                if isinstance(generator, list) and len(generator) != batch_size:
+                    raise ValueError(
+                        f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                        f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+                    )
+
+                elif isinstance(generator, list):
+                    init_latents = [
+                        retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i])
+                        for i in range(batch_size)
+                    ]
+                    init_latents = torch.cat(init_latents, dim=0)
+                else:
+                    init_latents = retrieve_latents(self.vae.encode(image), generator=generator)
+
+                if self.vae.config.force_upcast:
+                    self.vae.to(dtype)
+
+                init_latents = init_latents.to(dtype)
+                init_latents = self.vae.config.scaling_factor * init_latents
+
+            if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0:
+                # expand init_latents for batch_size
+                additional_image_per_prompt = batch_size // init_latents.shape[0]
+                init_latents = torch.cat([init_latents] * additional_image_per_prompt, dim=0)
+            elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0:
+                raise ValueError(
+                    f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
+                )
+            else:
+                init_latents = torch.cat([init_latents], dim=0)
+
+            if add_noise:
+                shape = init_latents.shape
+                noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+                # get latents
+                init_latents = self.scheduler.add_noise(init_latents, noise, timestep)
+
+            latents = init_latents
+            return latents
+
+        else:
+            shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
+            if isinstance(generator, list) and len(generator) != batch_size:
+                raise ValueError(
+                    f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                    f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+                )
+
+            if (image is None or timestep is None) and not is_strength_max:
+                raise ValueError(
+                    "Since strength < 1. initial latents are to be initialised as a combination of Image + Noise."
+                    "However, either the image or the noise timestep has not been provided."
+                )
+
+            if image.shape[1] == 4:
+                image_latents = image.to(device=device, dtype=dtype)
+                image_latents = image_latents.repeat(batch_size // image_latents.shape[0], 1, 1, 1)
+            elif return_image_latents or (latents is None and not is_strength_max):
+                image = image.to(device=device, dtype=dtype)
+                image_latents = self._encode_vae_image(image=image, generator=generator)
+                image_latents = image_latents.repeat(batch_size // image_latents.shape[0], 1, 1, 1)
+
+            if latents is None and add_noise:
+                noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+                # if strength is 1. then initialise the latents to noise, else initial to image + noise
+                latents = noise if is_strength_max else self.scheduler.add_noise(image_latents, noise, timestep)
+                # if pure noise then scale the initial latents by the  Scheduler's init sigma
+                latents = latents * self.scheduler.init_noise_sigma if is_strength_max else latents
+            elif add_noise:
+                noise = latents.to(device)
+                latents = noise * self.scheduler.init_noise_sigma
+            else:
+                noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+                latents = image_latents.to(device)
+
+            outputs = (latents,)
+
+            if return_noise:
+                outputs += (noise,)
+
+            if return_image_latents:
+                outputs += (image_latents,)
+
+            return outputs
+
+    def _encode_vae_image(self, image: torch.Tensor, generator: torch.Generator):
+        dtype = image.dtype
+        if self.vae.config.force_upcast:
+            image = image.float()
+            self.vae.to(dtype=torch.float32)
+
+        if isinstance(generator, list):
+            image_latents = [
+                retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i])
+                for i in range(image.shape[0])
+            ]
+            image_latents = torch.cat(image_latents, dim=0)
+        else:
+            image_latents = retrieve_latents(self.vae.encode(image), generator=generator)
+
+        if self.vae.config.force_upcast:
+            self.vae.to(dtype)
+
+        image_latents = image_latents.to(dtype)
+        image_latents = self.vae.config.scaling_factor * image_latents
+
+        return image_latents
+
+    def prepare_mask_latents(
+        self, mask, masked_image, batch_size, height, width, dtype, device, generator, do_classifier_free_guidance
+    ):
+        # resize the mask to latents shape as we concatenate the mask to the latents
+        # we do that before converting to dtype to avoid breaking in case we're using cpu_offload
+        # and half precision
+        mask = torch.nn.functional.interpolate(
+            mask, size=(height // self.vae_scale_factor, width // self.vae_scale_factor)
+        )
+        mask = mask.to(device=device, dtype=dtype)
+
+        # duplicate mask and masked_image_latents for each generation per prompt, using mps friendly method
+        if mask.shape[0] < batch_size:
+            if not batch_size % mask.shape[0] == 0:
+                raise ValueError(
+                    "The passed mask and the required batch size don't match. Masks are supposed to be duplicated to"
+                    f" a total batch size of {batch_size}, but {mask.shape[0]} masks were passed. Make sure the number"
+                    " of masks that you pass is divisible by the total requested batch size."
+                )
+            mask = mask.repeat(batch_size // mask.shape[0], 1, 1, 1)
+
+        mask = torch.cat([mask] * 2) if do_classifier_free_guidance else mask
+
+        if masked_image is not None and masked_image.shape[1] == 4:
+            masked_image_latents = masked_image
+        else:
+            masked_image_latents = None
+
+        if masked_image is not None:
+            if masked_image_latents is None:
+                masked_image = masked_image.to(device=device, dtype=dtype)
+                masked_image_latents = self._encode_vae_image(masked_image, generator=generator)
+
+            if masked_image_latents.shape[0] < batch_size:
+                if not batch_size % masked_image_latents.shape[0] == 0:
+                    raise ValueError(
+                        "The passed images and the required batch size don't match. Images are supposed to be duplicated"
+                        f" to a total batch size of {batch_size}, but {masked_image_latents.shape[0]} images were passed."
+                        " Make sure the number of images that you pass is divisible by the total requested batch size."
+                    )
+                masked_image_latents = masked_image_latents.repeat(
+                    batch_size // masked_image_latents.shape[0], 1, 1, 1
+                )
+
+            masked_image_latents = (
+                torch.cat([masked_image_latents] * 2) if do_classifier_free_guidance else masked_image_latents
+            )
+
+            # aligning device to prevent device errors when concating it with the latent model input
+            masked_image_latents = masked_image_latents.to(device=device, dtype=dtype)
+
+        return mask, masked_image_latents

    def _get_add_time_ids(self, original_size, crops_coords_top_left, target_size, dtype):
        add_time_ids = list(original_size + crops_coords_top_left + target_size)
@@ -930,15 +1241,52 @@ class SDXLLongPromptWeightingPipeline(DiffusionPipeline, FromSingleFileMixin, Lo
            self.vae.decoder.conv_in.to(dtype)
            self.vae.decoder.mid_block.to(dtype)

+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def guidance_rescale(self):
+        return self._guidance_rescale
+
+    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+    # corresponds to doing no classifier free guidance.
+    @property
+    def do_classifier_free_guidance(self):
+        return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None
+
+    @property
+    def cross_attention_kwargs(self):
+        return self._cross_attention_kwargs
+
+    @property
+    def denoising_end(self):
+        return self._denoising_end
+
+    @property
+    def denoising_start(self):
+        return self._denoising_start
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
    @torch.no_grad()
    @replace_example_docstring(EXAMPLE_DOC_STRING)
    def __call__(
        self,
        prompt: str = None,
        prompt_2: Optional[str] = None,
+        image: Optional[PipelineImageInput] = None,
+        mask_image: Optional[PipelineImageInput] = None,
+        masked_image_latents: Optional[torch.FloatTensor] = None,
        height: Optional[int] = None,
        width: Optional[int] = None,
+        strength: float = 0.8,
        num_inference_steps: int = 50,
+        timesteps: List[int] = None,
+        denoising_start: Optional[float] = None,
        denoising_end: Optional[float] = None,
        guidance_scale: float = 5.0,
        negative_prompt: Optional[str] = None,
@@ -971,20 +1319,46 @@ class SDXLLongPromptWeightingPipeline(DiffusionPipeline, FromSingleFileMixin, Lo
            prompt_2 (`str`):
                The prompt to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
                used in both text-encoders
+            image (`PipelineImageInput`, *optional*):
+                `Image`, or tensor representing an image batch, that will be used as the starting point for the
+                process.
+            mask_image (`PipelineImageInput`, *optional*):
+                `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be
+                replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a
+                PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should
+                contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`.
            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
                The height in pixels of the generated image.
            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
                The width in pixels of the generated image.
+            strength (`float`, *optional*, defaults to 0.8):
+                Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1.
+                `image` will be used as a starting point, adding more noise to it the larger the `strength`. The
+                number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added
+                noise will be maximum and the denoising process will run for the full number of iterations specified in
+                `num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
            num_inference_steps (`int`, *optional*, defaults to 50):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
+                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
+                passed will be used. Must be in descending order.
+            denoising_start (`float`, *optional*):
+                When specified, indicates the fraction (between 0.0 and 1.0) of the total denoising process to be
+                bypassed before it is initiated. Consequently, the initial part of the denoising process is skipped and
+                it is assumed that the passed `image` is a partly denoised image. Note that when this is specified,
+                strength will be ignored. The `denoising_start` parameter is particularly beneficial when this pipeline
+                is integrated into a "Mixture of Denoisers" multi-pipeline setup, as detailed in [**Refine Image
+                Quality**](https://huggingface.co/docs/diffusers/using-diffusers/sdxl#refine-image-quality).
            denoising_end (`float`, *optional*):
                When specified, determines the fraction (between 0.0 and 1.0) of the total denoising process to be
                completed before it is intentionally prematurely terminated. As a result, the returned sample will
-                still retain a substantial amount of noise as determined by the discrete timesteps selected by the
-                scheduler. The denoising_end parameter should ideally be utilized when this pipeline forms a part of a
-                "Mixture of Denoisers" multi-pipeline setup, as elaborated in [**Refining the Image
-                Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output)
+                still retain a substantial amount of noise (ca. final 20% of timesteps still needed) and should be
+                denoised by a successor pipeline that has `denoising_start` set to 0.8 so that it only denoises the
+                final 20% of the scheduler. The denoising_end parameter should ideally be utilized when this pipeline
+                forms a part of a "Mixture of Denoisers" multi-pipeline setup, as elaborated in [**Refine Image
+                Quality**](https://huggingface.co/docs/diffusers/using-diffusers/sdxl#refine-image-quality).
            guidance_scale (`float`, *optional*, defaults to 5.0):
                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
                `guidance_scale` is defined as `w` of equation 2. of [Imagen
@@ -1080,6 +1454,7 @@ class SDXLLongPromptWeightingPipeline(DiffusionPipeline, FromSingleFileMixin, Lo
            prompt_2,
            height,
            width,
+            strength,
            callback_steps,
            negative_prompt,
            negative_prompt_2,
@@ -1089,6 +1464,12 @@ class SDXLLongPromptWeightingPipeline(DiffusionPipeline, FromSingleFileMixin, Lo
            negative_pooled_prompt_embeds,
        )

+        self._guidance_scale = guidance_scale
+        self._guidance_rescale = guidance_rescale
+        self._cross_attention_kwargs = cross_attention_kwargs
+        self._denoising_end = denoising_end
+        self._denoising_start = denoising_start
+
        # 2. Define call parameters
        if prompt is not None and isinstance(prompt, str):
            batch_size = 1
@@ -1117,28 +1498,126 @@ class SDXLLongPromptWeightingPipeline(DiffusionPipeline, FromSingleFileMixin, Lo
        ) = get_weighted_text_embeddings_sdxl(
            pipe=self, prompt=prompt, neg_prompt=negative_prompt, num_images_per_prompt=num_images_per_prompt
        )
+        dtype = prompt_embeds.dtype
+
+        if isinstance(image, Image.Image):
+            image = self.image_processor.preprocess(image, height=height, width=width)
+        if image is not None:
+            image = image.to(device=self.device, dtype=dtype)
+
+        if isinstance(mask_image, Image.Image):
+            mask = self.mask_processor.preprocess(mask_image, height=height, width=width)
+        else:
+            mask = mask_image
+        if mask_image is not None:
+            mask = mask.to(device=self.device, dtype=dtype)
+
+            if masked_image_latents is not None:
+                masked_image = masked_image_latents
+            elif image.shape[1] == 4:
+                # if image is in latent space, we can't mask it
+                masked_image = None
+            else:
+                masked_image = image * (mask < 0.5)
+        else:
+            mask = None

        # 4. Prepare timesteps
-        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        def denoising_value_valid(dnv):
+            return isinstance(self.denoising_end, float) and 0 < dnv < 1

-        timesteps = self.scheduler.timesteps
+        timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
+        if image is not None:
+            timesteps, num_inference_steps = self.get_timesteps(
+                num_inference_steps,
+                strength,
+                device,
+                denoising_start=self.denoising_start if denoising_value_valid else None,
+            )
+
+            # check that number of inference steps is not < 1 - as this doesn't make sense
+            if num_inference_steps < 1:
+                raise ValueError(
+                    f"After adjusting the num_inference_steps by strength parameter: {strength}, the number of pipeline"
+                    f"steps is {num_inference_steps} which is < 1 and not appropriate for this pipeline."
+                )
+
+        latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
+        is_strength_max = strength == 1.0
+        add_noise = True if self.denoising_start is None else False

        # 5. Prepare latent variables
-        num_channels_latents = self.unet.config.in_channels
+        num_channels_latents = self.vae.config.latent_channels
+        num_channels_unet = self.unet.config.in_channels
+        return_image_latents = num_channels_unet == 4
+
        latents = self.prepare_latents(
-            batch_size * num_images_per_prompt,
-            num_channels_latents,
-            height,
-            width,
-            prompt_embeds.dtype,
-            device,
-            generator,
-            latents,
+            image=image,
+            mask=mask,
+            width=width,
+            height=height,
+            num_channels_latents=num_channels_unet,
+            timestep=latent_timestep,
+            batch_size=batch_size,
+            num_images_per_prompt=num_images_per_prompt,
+            dtype=prompt_embeds.dtype,
+            device=device,
+            generator=generator,
+            add_noise=add_noise,
+            latents=latents,
+            is_strength_max=is_strength_max,
+            return_noise=True,
+            return_image_latents=return_image_latents,
        )

+        if mask is not None:
+            if return_image_latents:
+                latents, noise, image_latents = latents
+            else:
+                latents, noise = latents
+
+        # 5.1. Prepare mask latent variables
+        if mask is not None:
+            mask, masked_image_latents = self.prepare_mask_latents(
+                mask=mask,
+                masked_image=masked_image,
+                batch_size=batch_size * num_images_per_prompt,
+                height=height,
+                width=width,
+                dtype=prompt_embeds.dtype,
+                device=device,
+                generator=generator,
+                do_classifier_free_guidance=self.do_classifier_free_guidance,
+            )
+
+            # 8. Check that sizes of mask, masked image and latents match
+            if num_channels_unet == 9:
+                # default case for runwayml/stable-diffusion-inpainting
+                num_channels_mask = mask.shape[1]
+                num_channels_masked_image = masked_image_latents.shape[1]
+                if num_channels_latents + num_channels_mask + num_channels_masked_image != num_channels_unet:
+                    raise ValueError(
+                        f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects"
+                        f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +"
+                        f" `num_channels_mask`: {num_channels_mask} + `num_channels_masked_image`: {num_channels_masked_image}"
+                        f" = {num_channels_latents+num_channels_masked_image+num_channels_mask}. Please verify the config of"
+                        " `pipeline.unet` or your `mask_image` or `image` input."
+                    )
+            elif num_channels_unet != 4:
+                raise ValueError(
+                    f"The unet {self.unet.__class__} should have either 4 or 9 input channels, not {self.unet.config.in_channels}."
+                )
+
        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

+        height, width = latents.shape[-2:]
+        height = height * self.vae_scale_factor
+        width = width * self.vae_scale_factor
+
+        original_size = original_size or (height, width)
+        target_size = target_size or (height, width)
+
        # 7. Prepare added time ids & embeddings
        add_text_embeds = pooled_prompt_embeds
        add_time_ids = self._get_add_time_ids(
@@ -1154,20 +1633,41 @@ class SDXLLongPromptWeightingPipeline(DiffusionPipeline, FromSingleFileMixin, Lo
        add_text_embeds = add_text_embeds.to(device)
        add_time_ids = add_time_ids.to(device).repeat(batch_size * num_images_per_prompt, 1)

-        # 8. Denoising loop
        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)

        # 7.1 Apply denoising_end
-        if denoising_end is not None and isinstance(denoising_end, float) and denoising_end > 0 and denoising_end < 1:
+        if (
+            self.denoising_end is not None
+            and self.denoising_start is not None
+            and denoising_value_valid(self.denoising_end)
+            and denoising_value_valid(self.denoising_start)
+            and self.denoising_start >= self.denoising_end
+        ):
+            raise ValueError(
+                f"`denoising_start`: {self.denoising_start} cannot be larger than or equal to `denoising_end`: "
+                + f" {self.denoising_end} when using type float."
+            )
+        elif self.denoising_end is not None and denoising_value_valid(self.denoising_end):
            discrete_timestep_cutoff = int(
                round(
                    self.scheduler.config.num_train_timesteps
-                    - (denoising_end * self.scheduler.config.num_train_timesteps)
+                    - (self.denoising_end * self.scheduler.config.num_train_timesteps)
                )
            )
            num_inference_steps = len(list(filter(lambda ts: ts >= discrete_timestep_cutoff, timesteps)))
            timesteps = timesteps[:num_inference_steps]

+        # 8. Optionally get Guidance Scale Embedding
+        timestep_cond = None
+        if self.unet.config.time_cond_proj_dim is not None:
+            guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt)
+            timestep_cond = self.get_guidance_scale_embedding(
+                guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
+            ).to(device=device, dtype=latents.dtype)
+
+        self._num_timesteps = len(timesteps)
+
+        # 9. Denoising loop
        with self.progress_bar(total=num_inference_steps) as progress_bar:
            for i, t in enumerate(timesteps):
                # expand the latents if we are doing classifier free guidance
@@ -1175,13 +1675,17 @@ class SDXLLongPromptWeightingPipeline(DiffusionPipeline, FromSingleFileMixin, Lo

                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

+                if mask is not None and num_channels_unet == 9:
+                    latent_model_input = torch.cat([latent_model_input, mask, masked_image_latents], dim=1)
+
                # predict the noise residual
                added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
                noise_pred = self.unet(
                    latent_model_input,
                    t,
                    encoder_hidden_states=prompt_embeds,
-                    cross_attention_kwargs=cross_attention_kwargs,
+                    timestep_cond=timestep_cond,
+                    cross_attention_kwargs=self.cross_attention_kwargs,
                    added_cond_kwargs=added_cond_kwargs,
                    return_dict=False,
                )[0]
@@ -1198,6 +1702,22 @@ class SDXLLongPromptWeightingPipeline(DiffusionPipeline, FromSingleFileMixin, Lo
                # compute the previous noisy sample x_t -> x_t-1
                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]

+                if mask is not None and num_channels_unet == 4:
+                    init_latents_proper = image_latents
+
+                    if self.do_classifier_free_guidance:
+                        init_mask, _ = mask.chunk(2)
+                    else:
+                        init_mask = mask
+
+                    if i < len(timesteps) - 1:
+                        noise_timestep = timesteps[i + 1]
+                        init_latents_proper = self.scheduler.add_noise(
+                            init_latents_proper, noise, torch.tensor([noise_timestep])
+                        )
+
+                    latents = (1 - init_mask) * init_latents_proper + init_mask * latents
+
                # call the callback, if provided
                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                    progress_bar.update()
@@ -1237,6 +1757,204 @@ class SDXLLongPromptWeightingPipeline(DiffusionPipeline, FromSingleFileMixin, Lo

        return StableDiffusionXLPipelineOutput(images=image)

+    def text2img(
+        self,
+        prompt: str = None,
+        prompt_2: Optional[str] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        timesteps: List[int] = None,
+        denoising_start: Optional[float] = None,
+        denoising_end: Optional[float] = None,
+        guidance_scale: float = 5.0,
+        negative_prompt: Optional[str] = None,
+        negative_prompt_2: Optional[str] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback_steps: int = 1,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        guidance_rescale: float = 0.0,
+        original_size: Optional[Tuple[int, int]] = None,
+        crops_coords_top_left: Tuple[int, int] = (0, 0),
+        target_size: Optional[Tuple[int, int]] = None,
+    ):
+        return self.__call__(
+            prompt=prompt,
+            prompt_2=prompt_2,
+            height=height,
+            width=width,
+            num_inference_steps=num_inference_steps,
+            timesteps=timesteps,
+            denoising_start=denoising_start,
+            denoising_end=denoising_end,
+            guidance_scale=guidance_scale,
+            negative_prompt=negative_prompt,
+            negative_prompt_2=negative_prompt_2,
+            num_images_per_prompt=num_images_per_prompt,
+            eta=eta,
+            generator=generator,
+            latents=latents,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+            output_type=output_type,
+            return_dict=return_dict,
+            callback=callback,
+            callback_steps=callback_steps,
+            cross_attention_kwargs=cross_attention_kwargs,
+            guidance_rescale=guidance_rescale,
+            original_size=original_size,
+            crops_coords_top_left=crops_coords_top_left,
+            target_size=target_size,
+        )
+
+    def img2img(
+        self,
+        prompt: str = None,
+        prompt_2: Optional[str] = None,
+        image: Optional[PipelineImageInput] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        strength: float = 0.8,
+        num_inference_steps: int = 50,
+        timesteps: List[int] = None,
+        denoising_start: Optional[float] = None,
+        denoising_end: Optional[float] = None,
+        guidance_scale: float = 5.0,
+        negative_prompt: Optional[str] = None,
+        negative_prompt_2: Optional[str] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback_steps: int = 1,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        guidance_rescale: float = 0.0,
+        original_size: Optional[Tuple[int, int]] = None,
+        crops_coords_top_left: Tuple[int, int] = (0, 0),
+        target_size: Optional[Tuple[int, int]] = None,
+    ):
+        return self.__call__(
+            prompt=prompt,
+            prompt_2=prompt_2,
+            image=image,
+            height=height,
+            width=width,
+            strength=strength,
+            num_inference_steps=num_inference_steps,
+            timesteps=timesteps,
+            denoising_start=denoising_start,
+            denoising_end=denoising_end,
+            guidance_scale=guidance_scale,
+            negative_prompt=negative_prompt,
+            negative_prompt_2=negative_prompt_2,
+            num_images_per_prompt=num_images_per_prompt,
+            eta=eta,
+            generator=generator,
+            latents=latents,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+            output_type=output_type,
+            return_dict=return_dict,
+            callback=callback,
+            callback_steps=callback_steps,
+            cross_attention_kwargs=cross_attention_kwargs,
+            guidance_rescale=guidance_rescale,
+            original_size=original_size,
+            crops_coords_top_left=crops_coords_top_left,
+            target_size=target_size,
+        )
+
+    def inpaint(
+        self,
+        prompt: str = None,
+        prompt_2: Optional[str] = None,
+        image: Optional[PipelineImageInput] = None,
+        mask_image: Optional[PipelineImageInput] = None,
+        masked_image_latents: Optional[torch.FloatTensor] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        strength: float = 0.8,
+        num_inference_steps: int = 50,
+        timesteps: List[int] = None,
+        denoising_start: Optional[float] = None,
+        denoising_end: Optional[float] = None,
+        guidance_scale: float = 5.0,
+        negative_prompt: Optional[str] = None,
+        negative_prompt_2: Optional[str] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback_steps: int = 1,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        guidance_rescale: float = 0.0,
+        original_size: Optional[Tuple[int, int]] = None,
+        crops_coords_top_left: Tuple[int, int] = (0, 0),
+        target_size: Optional[Tuple[int, int]] = None,
+    ):
+        return self.__call__(
+            prompt=prompt,
+            prompt_2=prompt_2,
+            image=image,
+            mask_image=mask_image,
+            masked_image_latents=masked_image_latents,
+            height=height,
+            width=width,
+            strength=strength,
+            num_inference_steps=num_inference_steps,
+            timesteps=timesteps,
+            denoising_start=denoising_start,
+            denoising_end=denoising_end,
+            guidance_scale=guidance_scale,
+            negative_prompt=negative_prompt,
+            negative_prompt_2=negative_prompt_2,
+            num_images_per_prompt=num_images_per_prompt,
+            eta=eta,
+            generator=generator,
+            latents=latents,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+            output_type=output_type,
+            return_dict=return_dict,
+            callback=callback,
+            callback_steps=callback_steps,
+            cross_attention_kwargs=cross_attention_kwargs,
+            guidance_rescale=guidance_rescale,
+            original_size=original_size,
+            crops_coords_top_left=crops_coords_top_left,
+            target_size=target_size,
+        )
+
    # Overrride to properly handle the loading and unloading of the additional text encoder.
    def load_lora_weights(self, pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]], **kwargs):
        # We could have accessed the unet config from `lora_state_dict()` too. We pass
--- a/examples/community/marigold_depth_estimation.py
+++ b/examples/community/marigold_depth_estimation.py
@@ -0,0 +1,602 @@
+# Copyright 2023 Bingxin Ke, ETH Zurich and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# --------------------------------------------------------------------------
+# If you find this code useful, we kindly ask you to cite our paper in your work.
+# Please find bibtex at: https://github.com/prs-eth/Marigold#-citation
+# More information about the method can be found at https://marigoldmonodepth.github.io
+# --------------------------------------------------------------------------
+
+
+import math
+from typing import Dict, Union
+
+import matplotlib
+import numpy as np
+import torch
+from PIL import Image
+from scipy.optimize import minimize
+from torch.utils.data import DataLoader, TensorDataset
+from tqdm.auto import tqdm
+from transformers import CLIPTextModel, CLIPTokenizer
+
+from diffusers import (
+    AutoencoderKL,
+    DDIMScheduler,
+    DiffusionPipeline,
+    UNet2DConditionModel,
+)
+from diffusers.utils import BaseOutput, check_min_version
+
+
+# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
+check_min_version("0.20.1.dev0")
+
+
+class MarigoldDepthOutput(BaseOutput):
+    """
+    Output class for Marigold monocular depth prediction pipeline.
+
+    Args:
+        depth_np (`np.ndarray`):
+            Predicted depth map, with depth values in the range of [0, 1].
+        depth_colored (`PIL.Image.Image`):
+            Colorized depth map, with the shape of [3, H, W] and values in [0, 1].
+        uncertainty (`None` or `np.ndarray`):
+            Uncalibrated uncertainty(MAD, median absolute deviation) coming from ensembling.
+    """
+
+    depth_np: np.ndarray
+    depth_colored: Image.Image
+    uncertainty: Union[None, np.ndarray]
+
+
+class MarigoldPipeline(DiffusionPipeline):
+    """
+    Pipeline for monocular depth estimation using Marigold: https://marigoldmonodepth.github.io.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        unet (`UNet2DConditionModel`):
+            Conditional U-Net to denoise the depth latent, conditioned on image latent.
+        vae (`AutoencoderKL`):
+            Variational Auto-Encoder (VAE) Model to encode and decode images and depth maps
+            to and from latent representations.
+        scheduler (`DDIMScheduler`):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents.
+        text_encoder (`CLIPTextModel`):
+            Text-encoder, for empty text embedding.
+        tokenizer (`CLIPTokenizer`):
+            CLIP tokenizer.
+    """
+
+    rgb_latent_scale_factor = 0.18215
+    depth_latent_scale_factor = 0.18215
+
+    def __init__(
+        self,
+        unet: UNet2DConditionModel,
+        vae: AutoencoderKL,
+        scheduler: DDIMScheduler,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            unet=unet,
+            vae=vae,
+            scheduler=scheduler,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+        )
+
+        self.empty_text_embed = None
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        input_image: Image,
+        denoising_steps: int = 10,
+        ensemble_size: int = 10,
+        processing_res: int = 768,
+        match_input_res: bool = True,
+        batch_size: int = 0,
+        color_map: str = "Spectral",
+        show_progress_bar: bool = True,
+        ensemble_kwargs: Dict = None,
+    ) -> MarigoldDepthOutput:
+        """
+        Function invoked when calling the pipeline.
+
+        Args:
+            input_image (`Image`):
+                Input RGB (or gray-scale) image.
+            processing_res (`int`, *optional*, defaults to `768`):
+                Maximum resolution of processing.
+                If set to 0: will not resize at all.
+            match_input_res (`bool`, *optional*, defaults to `True`):
+                Resize depth prediction to match input resolution.
+                Only valid if `limit_input_res` is not None.
+            denoising_steps (`int`, *optional*, defaults to `10`):
+                Number of diffusion denoising steps (DDIM) during inference.
+            ensemble_size (`int`, *optional*, defaults to `10`):
+                Number of predictions to be ensembled.
+            batch_size (`int`, *optional*, defaults to `0`):
+                Inference batch size, no bigger than `num_ensemble`.
+                If set to 0, the script will automatically decide the proper batch size.
+            show_progress_bar (`bool`, *optional*, defaults to `True`):
+                Display a progress bar of diffusion denoising.
+            color_map (`str`, *optional*, defaults to `"Spectral"`):
+                Colormap used to colorize the depth map.
+            ensemble_kwargs (`dict`, *optional*, defaults to `None`):
+                Arguments for detailed ensembling settings.
+        Returns:
+            `MarigoldDepthOutput`: Output class for Marigold monocular depth prediction pipeline, including:
+            - **depth_np** (`np.ndarray`) Predicted depth map, with depth values in the range of [0, 1]
+            - **depth_colored** (`PIL.Image.Image`) Colorized depth map, with the shape of [3, H, W] and values in [0, 1]
+            - **uncertainty** (`None` or `np.ndarray`) Uncalibrated uncertainty(MAD, median absolute deviation)
+                    coming from ensembling. None if `ensemble_size = 1`
+        """
+
+        device = self.device
+        input_size = input_image.size
+
+        if not match_input_res:
+            assert processing_res is not None, "Value error: `resize_output_back` is only valid with "
+        assert processing_res >= 0
+        assert denoising_steps >= 1
+        assert ensemble_size >= 1
+
+        # ----------------- Image Preprocess -----------------
+        # Resize image
+        if processing_res > 0:
+            input_image = self.resize_max_res(input_image, max_edge_resolution=processing_res)
+        # Convert the image to RGB, to 1.remove the alpha channel 2.convert B&W to 3-channel
+        input_image = input_image.convert("RGB")
+        image = np.asarray(input_image)
+
+        # Normalize rgb values
+        rgb = np.transpose(image, (2, 0, 1))  # [H, W, rgb] -> [rgb, H, W]
+        rgb_norm = rgb / 255.0
+        rgb_norm = torch.from_numpy(rgb_norm).to(self.dtype)
+        rgb_norm = rgb_norm.to(device)
+        assert rgb_norm.min() >= 0.0 and rgb_norm.max() <= 1.0
+
+        # ----------------- Predicting depth -----------------
+        # Batch repeated input image
+        duplicated_rgb = torch.stack([rgb_norm] * ensemble_size)
+        single_rgb_dataset = TensorDataset(duplicated_rgb)
+        if batch_size > 0:
+            _bs = batch_size
+        else:
+            _bs = self._find_batch_size(
+                ensemble_size=ensemble_size,
+                input_res=max(rgb_norm.shape[1:]),
+                dtype=self.dtype,
+            )
+
+        single_rgb_loader = DataLoader(single_rgb_dataset, batch_size=_bs, shuffle=False)
+
+        # Predict depth maps (batched)
+        depth_pred_ls = []
+        if show_progress_bar:
+            iterable = tqdm(single_rgb_loader, desc=" " * 2 + "Inference batches", leave=False)
+        else:
+            iterable = single_rgb_loader
+        for batch in iterable:
+            (batched_img,) = batch
+            depth_pred_raw = self.single_infer(
+                rgb_in=batched_img,
+                num_inference_steps=denoising_steps,
+                show_pbar=show_progress_bar,
+            )
+            depth_pred_ls.append(depth_pred_raw.detach().clone())
+        depth_preds = torch.concat(depth_pred_ls, axis=0).squeeze()
+        torch.cuda.empty_cache()  # clear vram cache for ensembling
+
+        # ----------------- Test-time ensembling -----------------
+        if ensemble_size > 1:
+            depth_pred, pred_uncert = self.ensemble_depths(depth_preds, **(ensemble_kwargs or {}))
+        else:
+            depth_pred = depth_preds
+            pred_uncert = None
+
+        # ----------------- Post processing -----------------
+        # Scale prediction to [0, 1]
+        min_d = torch.min(depth_pred)
+        max_d = torch.max(depth_pred)
+        depth_pred = (depth_pred - min_d) / (max_d - min_d)
+
+        # Convert to numpy
+        depth_pred = depth_pred.cpu().numpy().astype(np.float32)
+
+        # Resize back to original resolution
+        if match_input_res:
+            pred_img = Image.fromarray(depth_pred)
+            pred_img = pred_img.resize(input_size)
+            depth_pred = np.asarray(pred_img)
+
+        # Clip output range
+        depth_pred = depth_pred.clip(0, 1)
+
+        # Colorize
+        depth_colored = self.colorize_depth_maps(
+            depth_pred, 0, 1, cmap=color_map
+        ).squeeze()  # [3, H, W], value in (0, 1)
+        depth_colored = (depth_colored * 255).astype(np.uint8)
+        depth_colored_hwc = self.chw2hwc(depth_colored)
+        depth_colored_img = Image.fromarray(depth_colored_hwc)
+        return MarigoldDepthOutput(
+            depth_np=depth_pred,
+            depth_colored=depth_colored_img,
+            uncertainty=pred_uncert,
+        )
+
+    def _encode_empty_text(self):
+        """
+        Encode text embedding for empty prompt.
+        """
+        prompt = ""
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="do_not_pad",
+            max_length=self.tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids.to(self.text_encoder.device)
+        self.empty_text_embed = self.text_encoder(text_input_ids)[0].to(self.dtype)
+
+    @torch.no_grad()
+    def single_infer(self, rgb_in: torch.Tensor, num_inference_steps: int, show_pbar: bool) -> torch.Tensor:
+        """
+        Perform an individual depth prediction without ensembling.
+
+        Args:
+            rgb_in (`torch.Tensor`):
+                Input RGB image.
+            num_inference_steps (`int`):
+                Number of diffusion denoisign steps (DDIM) during inference.
+            show_pbar (`bool`):
+                Display a progress bar of diffusion denoising.
+        Returns:
+            `torch.Tensor`: Predicted depth map.
+        """
+        device = rgb_in.device
+
+        # Set timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps  # [T]
+
+        # Encode image
+        rgb_latent = self._encode_rgb(rgb_in)
+
+        # Initial depth map (noise)
+        depth_latent = torch.randn(rgb_latent.shape, device=device, dtype=self.dtype)  # [B, 4, h, w]
+
+        # Batched empty text embedding
+        if self.empty_text_embed is None:
+            self._encode_empty_text()
+        batch_empty_text_embed = self.empty_text_embed.repeat((rgb_latent.shape[0], 1, 1))  # [B, 2, 1024]
+
+        # Denoising loop
+        if show_pbar:
+            iterable = tqdm(
+                enumerate(timesteps),
+                total=len(timesteps),
+                leave=False,
+                desc=" " * 4 + "Diffusion denoising",
+            )
+        else:
+            iterable = enumerate(timesteps)
+
+        for i, t in iterable:
+            unet_input = torch.cat([rgb_latent, depth_latent], dim=1)  # this order is important
+
+            # predict the noise residual
+            noise_pred = self.unet(unet_input, t, encoder_hidden_states=batch_empty_text_embed).sample  # [B, 4, h, w]
+
+            # compute the previous noisy sample x_t -> x_t-1
+            depth_latent = self.scheduler.step(noise_pred, t, depth_latent).prev_sample
+        torch.cuda.empty_cache()
+        depth = self._decode_depth(depth_latent)
+
+        # clip prediction
+        depth = torch.clip(depth, -1.0, 1.0)
+        # shift to [0, 1]
+        depth = (depth + 1.0) / 2.0
+
+        return depth
+
+    def _encode_rgb(self, rgb_in: torch.Tensor) -> torch.Tensor:
+        """
+        Encode RGB image into latent.
+
+        Args:
+            rgb_in (`torch.Tensor`):
+                Input RGB image to be encoded.
+
+        Returns:
+            `torch.Tensor`: Image latent.
+        """
+        # encode
+        h = self.vae.encoder(rgb_in)
+        moments = self.vae.quant_conv(h)
+        mean, logvar = torch.chunk(moments, 2, dim=1)
+        # scale latent
+        rgb_latent = mean * self.rgb_latent_scale_factor
+        return rgb_latent
+
+    def _decode_depth(self, depth_latent: torch.Tensor) -> torch.Tensor:
+        """
+        Decode depth latent into depth map.
+
+        Args:
+            depth_latent (`torch.Tensor`):
+                Depth latent to be decoded.
+
+        Returns:
+            `torch.Tensor`: Decoded depth map.
+        """
+        # scale latent
+        depth_latent = depth_latent / self.depth_latent_scale_factor
+        # decode
+        z = self.vae.post_quant_conv(depth_latent)
+        stacked = self.vae.decoder(z)
+        # mean of output channels
+        depth_mean = stacked.mean(dim=1, keepdim=True)
+        return depth_mean
+
+    @staticmethod
+    def resize_max_res(img: Image.Image, max_edge_resolution: int) -> Image.Image:
+        """
+        Resize image to limit maximum edge length while keeping aspect ratio.
+
+        Args:
+            img (`Image.Image`):
+                Image to be resized.
+            max_edge_resolution (`int`):
+                Maximum edge length (pixel).
+
+        Returns:
+            `Image.Image`: Resized image.
+        """
+        original_width, original_height = img.size
+        downscale_factor = min(max_edge_resolution / original_width, max_edge_resolution / original_height)
+
+        new_width = int(original_width * downscale_factor)
+        new_height = int(original_height * downscale_factor)
+
+        resized_img = img.resize((new_width, new_height))
+        return resized_img
+
+    @staticmethod
+    def colorize_depth_maps(depth_map, min_depth, max_depth, cmap="Spectral", valid_mask=None):
+        """
+        Colorize depth maps.
+        """
+        assert len(depth_map.shape) >= 2, "Invalid dimension"
+
+        if isinstance(depth_map, torch.Tensor):
+            depth = depth_map.detach().clone().squeeze().numpy()
+        elif isinstance(depth_map, np.ndarray):
+            depth = depth_map.copy().squeeze()
+        # reshape to [ (B,) H, W ]
+        if depth.ndim < 3:
+            depth = depth[np.newaxis, :, :]
+
+        # colorize
+        cm = matplotlib.colormaps[cmap]
+        depth = ((depth - min_depth) / (max_depth - min_depth)).clip(0, 1)
+        img_colored_np = cm(depth, bytes=False)[:, :, :, 0:3]  # value from 0 to 1
+        img_colored_np = np.rollaxis(img_colored_np, 3, 1)
+
+        if valid_mask is not None:
+            if isinstance(depth_map, torch.Tensor):
+                valid_mask = valid_mask.detach().numpy()
+            valid_mask = valid_mask.squeeze()  # [H, W] or [B, H, W]
+            if valid_mask.ndim < 3:
+                valid_mask = valid_mask[np.newaxis, np.newaxis, :, :]
+            else:
+                valid_mask = valid_mask[:, np.newaxis, :, :]
+            valid_mask = np.repeat(valid_mask, 3, axis=1)
+            img_colored_np[~valid_mask] = 0
+
+        if isinstance(depth_map, torch.Tensor):
+            img_colored = torch.from_numpy(img_colored_np).float()
+        elif isinstance(depth_map, np.ndarray):
+            img_colored = img_colored_np
+
+        return img_colored
+
+    @staticmethod
+    def chw2hwc(chw):
+        assert 3 == len(chw.shape)
+        if isinstance(chw, torch.Tensor):
+            hwc = torch.permute(chw, (1, 2, 0))
+        elif isinstance(chw, np.ndarray):
+            hwc = np.moveaxis(chw, 0, -1)
+        return hwc
+
+    @staticmethod
+    def _find_batch_size(ensemble_size: int, input_res: int, dtype: torch.dtype) -> int:
+        """
+        Automatically search for suitable operating batch size.
+
+        Args:
+            ensemble_size (`int`):
+                Number of predictions to be ensembled.
+            input_res (`int`):
+                Operating resolution of the input image.
+
+        Returns:
+            `int`: Operating batch size.
+        """
+        # Search table for suggested max. inference batch size
+        bs_search_table = [
+            # tested on A100-PCIE-80GB
+            {"res": 768, "total_vram": 79, "bs": 35, "dtype": torch.float32},
+            {"res": 1024, "total_vram": 79, "bs": 20, "dtype": torch.float32},
+            # tested on A100-PCIE-40GB
+            {"res": 768, "total_vram": 39, "bs": 15, "dtype": torch.float32},
+            {"res": 1024, "total_vram": 39, "bs": 8, "dtype": torch.float32},
+            {"res": 768, "total_vram": 39, "bs": 30, "dtype": torch.float16},
+            {"res": 1024, "total_vram": 39, "bs": 15, "dtype": torch.float16},
+            # tested on RTX3090, RTX4090
+            {"res": 512, "total_vram": 23, "bs": 20, "dtype": torch.float32},
+            {"res": 768, "total_vram": 23, "bs": 7, "dtype": torch.float32},
+            {"res": 1024, "total_vram": 23, "bs": 3, "dtype": torch.float32},
+            {"res": 512, "total_vram": 23, "bs": 40, "dtype": torch.float16},
+            {"res": 768, "total_vram": 23, "bs": 18, "dtype": torch.float16},
+            {"res": 1024, "total_vram": 23, "bs": 10, "dtype": torch.float16},
+            # tested on GTX1080Ti
+            {"res": 512, "total_vram": 10, "bs": 5, "dtype": torch.float32},
+            {"res": 768, "total_vram": 10, "bs": 2, "dtype": torch.float32},
+            {"res": 512, "total_vram": 10, "bs": 10, "dtype": torch.float16},
+            {"res": 768, "total_vram": 10, "bs": 5, "dtype": torch.float16},
+            {"res": 1024, "total_vram": 10, "bs": 3, "dtype": torch.float16},
+        ]
+
+        if not torch.cuda.is_available():
+            return 1
+
+        total_vram = torch.cuda.mem_get_info()[1] / 1024.0**3
+        filtered_bs_search_table = [s for s in bs_search_table if s["dtype"] == dtype]
+        for settings in sorted(
+            filtered_bs_search_table,
+            key=lambda k: (k["res"], -k["total_vram"]),
+        ):
+            if input_res <= settings["res"] and total_vram >= settings["total_vram"]:
+                bs = settings["bs"]
+                if bs > ensemble_size:
+                    bs = ensemble_size
+                elif bs > math.ceil(ensemble_size / 2) and bs < ensemble_size:
+                    bs = math.ceil(ensemble_size / 2)
+                return bs
+
+        return 1
+
+    @staticmethod
+    def ensemble_depths(
+        input_images: torch.Tensor,
+        regularizer_strength: float = 0.02,
+        max_iter: int = 2,
+        tol: float = 1e-3,
+        reduction: str = "median",
+        max_res: int = None,
+    ):
+        """
+        To ensemble multiple affine-invariant depth images (up to scale and shift),
+            by aligning estimating the scale and shift
+        """
+
+        def inter_distances(tensors: torch.Tensor):
+            """
+            To calculate the distance between each two depth maps.
+            """
+            distances = []
+            for i, j in torch.combinations(torch.arange(tensors.shape[0])):
+                arr1 = tensors[i : i + 1]
+                arr2 = tensors[j : j + 1]
+                distances.append(arr1 - arr2)
+            dist = torch.concatenate(distances, dim=0)
+            return dist
+
+        device = input_images.device
+        dtype = input_images.dtype
+        np_dtype = np.float32
+
+        original_input = input_images.clone()
+        n_img = input_images.shape[0]
+        ori_shape = input_images.shape
+
+        if max_res is not None:
+            scale_factor = torch.min(max_res / torch.tensor(ori_shape[-2:]))
+            if scale_factor < 1:
+                downscaler = torch.nn.Upsample(scale_factor=scale_factor, mode="nearest")
+                input_images = downscaler(torch.from_numpy(input_images)).numpy()
+
+        # init guess
+        _min = np.min(input_images.reshape((n_img, -1)).cpu().numpy(), axis=1)
+        _max = np.max(input_images.reshape((n_img, -1)).cpu().numpy(), axis=1)
+        s_init = 1.0 / (_max - _min).reshape((-1, 1, 1))
+        t_init = (-1 * s_init.flatten() * _min.flatten()).reshape((-1, 1, 1))
+        x = np.concatenate([s_init, t_init]).reshape(-1).astype(np_dtype)
+
+        input_images = input_images.to(device)
+
+        # objective function
+        def closure(x):
+            l = len(x)
+            s = x[: int(l / 2)]
+            t = x[int(l / 2) :]
+            s = torch.from_numpy(s).to(dtype=dtype).to(device)
+            t = torch.from_numpy(t).to(dtype=dtype).to(device)
+
+            transformed_arrays = input_images * s.view((-1, 1, 1)) + t.view((-1, 1, 1))
+            dists = inter_distances(transformed_arrays)
+            sqrt_dist = torch.sqrt(torch.mean(dists**2))
+
+            if "mean" == reduction:
+                pred = torch.mean(transformed_arrays, dim=0)
+            elif "median" == reduction:
+                pred = torch.median(transformed_arrays, dim=0).values
+            else:
+                raise ValueError
+
+            near_err = torch.sqrt((0 - torch.min(pred)) ** 2)
+            far_err = torch.sqrt((1 - torch.max(pred)) ** 2)
+
+            err = sqrt_dist + (near_err + far_err) * regularizer_strength
+            err = err.detach().cpu().numpy().astype(np_dtype)
+            return err
+
+        res = minimize(
+            closure,
+            x,
+            method="BFGS",
+            tol=tol,
+            options={"maxiter": max_iter, "disp": False},
+        )
+        x = res.x
+        l = len(x)
+        s = x[: int(l / 2)]
+        t = x[int(l / 2) :]
+
+        # Prediction
+        s = torch.from_numpy(s).to(dtype=dtype).to(device)
+        t = torch.from_numpy(t).to(dtype=dtype).to(device)
+        transformed_arrays = original_input * s.view(-1, 1, 1) + t.view(-1, 1, 1)
+        if "mean" == reduction:
+            aligned_images = torch.mean(transformed_arrays, dim=0)
+            std = torch.std(transformed_arrays, dim=0)
+            uncertainty = std
+        elif "median" == reduction:
+            aligned_images = torch.median(transformed_arrays, dim=0).values
+            # MAD (median absolute deviation) as uncertainty indicator
+            abs_dev = torch.abs(transformed_arrays - aligned_images)
+            mad = torch.median(abs_dev, dim=0).values
+            uncertainty = mad
+        else:
+            raise ValueError(f"Unknown reduction method: {reduction}")
+
+        # Scale and shift to [0, 1]
+        _min = torch.min(aligned_images)
+        _max = torch.max(aligned_images)
+        aligned_images = (aligned_images - _min) / (_max - _min)
+        uncertainty /= _max - _min
+
+        return aligned_images, uncertainty
--- a/examples/community/pipeline_animatediff_controlnet.py
+++ b/examples/community/pipeline_animatediff_controlnet.py
--- a/examples/community/pipeline_demofusion_sdxl.py
+++ b/examples/community/pipeline_demofusion_sdxl.py
--- a/examples/community/pipeline_stable_diffusion_upscale_ldm3d.py
+++ b/examples/community/pipeline_stable_diffusion_upscale_ldm3d.py
@@ -0,0 +1,772 @@
+# Copyright 2023 The Intel Labs Team Authors and the HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Union
+
+import numpy as np
+import PIL
+import torch
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
+
+from diffusers import DiffusionPipeline
+from diffusers.image_processor import PipelineDepthInput, PipelineImageInput, VaeImageProcessorLDM3D
+from diffusers.loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin
+from diffusers.models import AutoencoderKL, UNet2DConditionModel
+from diffusers.models.lora import adjust_lora_scale_text_encoder
+from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker
+from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_ldm3d import LDM3DPipelineOutput
+from diffusers.schedulers import DDPMScheduler, KarrasDiffusionSchedulers
+from diffusers.utils import (
+    USE_PEFT_BACKEND,
+    deprecate,
+    logging,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from diffusers.utils.torch_utils import randn_tensor
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```python
+        >>> from diffusers import StableDiffusionUpscaleLDM3DPipeline
+        >>> from PIL import Image
+        >>> from io import BytesIO
+        >>> import requests
+
+        >>> pipe = StableDiffusionUpscaleLDM3DPipeline.from_pretrained("Intel/ldm3d-sr")
+        >>> pipe = pipe.to("cuda")
+        >>> rgb_path = "https://huggingface.co/Intel/ldm3d-sr/resolve/main/lemons_ldm3d_rgb.jpg"
+        >>> depth_path = "https://huggingface.co/Intel/ldm3d-sr/resolve/main/lemons_ldm3d_depth.png"
+        >>> low_res_rgb = Image.open(BytesIO(requests.get(rgb_path).content)).convert("RGB")
+        >>> low_res_depth = Image.open(BytesIO(requests.get(depth_path).content)).convert("L")
+        >>> output = pipe(
+        ...     prompt="high quality high resolution uhd 4k image",
+        ...     rgb=low_res_rgb,
+        ...     depth=low_res_depth,
+        ...     num_inference_steps=50,
+        ...     target_res=[1024, 1024],
+        ... )
+        >>> rgb_image, depth_image = output.rgb, output.depth
+        >>> rgb_image[0].save("hr_ldm3d_rgb.jpg")
+        >>> depth_image[0].save("hr_ldm3d_depth.png")
+        ```
+"""
+
+
+class StableDiffusionUpscaleLDM3DPipeline(
+    DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin
+):
+    r"""
+    Pipeline for text-to-image and 3D generation using LDM3D.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    The pipeline also inherits the following loading methods:
+        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
+        - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+        - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
+        text_encoder ([`~transformers.CLIPTextModel`]):
+            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
+        tokenizer ([`~transformers.CLIPTokenizer`]):
+            A `CLIPTokenizer` to tokenize text.
+        unet ([`UNet2DConditionModel`]):
+            A `UNet2DConditionModel` to denoise the encoded image latents.
+        low_res_scheduler ([`SchedulerMixin`]):
+            A scheduler used to add initial noise to the low resolution conditioning image. It must be an instance of
+            [`DDPMScheduler`].
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details
+            about a model's potential harms.
+        feature_extractor ([`~transformers.CLIPImageProcessor`]):
+            A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
+    """
+
+    _optional_components = ["safety_checker", "feature_extractor"]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        low_res_scheduler: DDPMScheduler,
+        scheduler: KarrasDiffusionSchedulers,
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPImageProcessor,
+        requires_safety_checker: bool = True,
+        watermarker: Optional[Any] = None,
+        max_noise_level: int = 350,
+    ):
+        super().__init__()
+
+        if safety_checker is None and requires_safety_checker:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        if safety_checker is not None and feature_extractor is None:
+            raise ValueError(
+                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+            )
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            low_res_scheduler=low_res_scheduler,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            watermarker=watermarker,
+            feature_extractor=feature_extractor,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessorLDM3D(vae_scale_factor=self.vae_scale_factor, resample="bilinear")
+        # self.register_to_config(requires_safety_checker=requires_safety_checker)
+        self.register_to_config(max_noise_level=max_noise_level)
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_ldm3d.StableDiffusionLDM3DPipeline._encode_prompt
+    def _encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        lora_scale: Optional[float] = None,
+        **kwargs,
+    ):
+        deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple."
+        deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False)
+
+        prompt_embeds_tuple = self.encode_prompt(
+            prompt=prompt,
+            device=device,
+            num_images_per_prompt=num_images_per_prompt,
+            do_classifier_free_guidance=do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            lora_scale=lora_scale,
+            **kwargs,
+        )
+
+        # concatenate for backwards comp
+        prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]])
+
+        return prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_ldm3d.StableDiffusionLDM3DPipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            lora_scale (`float`, *optional*):
+                A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        """
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, LoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if not USE_PEFT_BACKEND:
+                adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+            else:
+                scale_lora_layers(self.text_encoder, lora_scale)
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            # textual inversion: procecss multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
+
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(
+                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+                )
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = text_inputs.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            if clip_skip is None:
+                prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
+                prompt_embeds = prompt_embeds[0]
+            else:
+                prompt_embeds = self.text_encoder(
+                    text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
+                )
+                # Access the `hidden_states` first, that contains a tuple of
+                # all the hidden states from the encoder layers. Then index into
+                # the tuple to access the hidden states from the desired layer.
+                prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
+                # We also need to apply the final LayerNorm here to not mess with the
+                # representations. The `last_hidden_states` that we typically use for
+                # obtaining the final prompt representations passes through the LayerNorm
+                # layer.
+                prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
+
+        if self.text_encoder is not None:
+            prompt_embeds_dtype = self.text_encoder.dtype
+        elif self.unet is not None:
+            prompt_embeds_dtype = self.unet.dtype
+        else:
+            prompt_embeds_dtype = prompt_embeds.dtype
+
+        prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            # textual inversion: procecss multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
+
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = uncond_input.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            negative_prompt_embeds = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            negative_prompt_embeds = negative_prompt_embeds[0]
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+            # Retrieve the original scale by scaling back the LoRA layers
+            unscale_lora_layers(self.text_encoder, lora_scale)
+
+        return prompt_embeds, negative_prompt_embeds
+
+    def run_safety_checker(self, image, device, dtype):
+        if self.safety_checker is None:
+            has_nsfw_concept = None
+        else:
+            if torch.is_tensor(image):
+                feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+            else:
+                feature_extractor_input = self.image_processor.numpy_to_pil(image)
+            rgb_feature_extractor_input = feature_extractor_input[0]
+            safety_checker_input = self.feature_extractor(rgb_feature_extractor_input, return_tensors="pt").to(device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+            )
+        return image, has_nsfw_concept
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_inputs(
+        self,
+        prompt,
+        image,
+        noise_level,
+        callback_steps,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        target_res=None,
+    ):
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+        if (
+            not isinstance(image, torch.Tensor)
+            and not isinstance(image, PIL.Image.Image)
+            and not isinstance(image, np.ndarray)
+            and not isinstance(image, list)
+        ):
+            raise ValueError(
+                f"`image` has to be of type `torch.Tensor`, `np.ndarray`, `PIL.Image.Image` or `list` but is {type(image)}"
+            )
+
+        # verify batch size of prompt and image are same if image is a list or tensor or numpy array
+        if isinstance(image, list) or isinstance(image, torch.Tensor) or isinstance(image, np.ndarray):
+            if prompt is not None and isinstance(prompt, str):
+                batch_size = 1
+            elif prompt is not None and isinstance(prompt, list):
+                batch_size = len(prompt)
+            else:
+                batch_size = prompt_embeds.shape[0]
+
+            if isinstance(image, list):
+                image_batch_size = len(image)
+            else:
+                image_batch_size = image.shape[0]
+            if batch_size != image_batch_size:
+                raise ValueError(
+                    f"`prompt` has batch size {batch_size} and `image` has batch size {image_batch_size}."
+                    " Please make sure that passed `prompt` matches the batch size of `image`."
+                )
+
+        # check noise level
+        if noise_level > self.config.max_noise_level:
+            raise ValueError(f"`noise_level` has to be <= {self.config.max_noise_level} but is {noise_level}")
+
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        shape = (batch_size, num_channels_latents, height, width)
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            if latents.shape != shape:
+                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}")
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    # def upcast_vae(self):
+    #     dtype = self.vae.dtype
+    #     self.vae.to(dtype=torch.float32)
+    #     use_torch_2_0_or_xformers = isinstance(
+    #         self.vae.decoder.mid_block.attentions[0].processor,
+    #         (
+    #             AttnProcessor2_0,
+    #             XFormersAttnProcessor,
+    #             LoRAXFormersAttnProcessor,
+    #             LoRAAttnProcessor2_0,
+    #         ),
+    #     )
+    #     # if xformers or torch_2_0 is used attention block does not need
+    #     # to be in float32 which can save lots of memory
+    #     if use_torch_2_0_or_xformers:
+    #         self.vae.post_quant_conv.to(dtype)
+    #         self.vae.decoder.conv_in.to(dtype)
+    #         self.vae.decoder.mid_block.to(dtype)
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        rgb: PipelineImageInput = None,
+        depth: PipelineDepthInput = None,
+        num_inference_steps: int = 75,
+        guidance_scale: float = 9.0,
+        noise_level: int = 20,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback_steps: int = 1,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        target_res: Optional[List[int]] = [1024, 1024],
+    ):
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
+            image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
+                `Image` or tensor representing an image batch to be upscaled.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 5.0):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
+                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            latents (`torch.FloatTensor`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor is generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+                provided, text embeddings are generated from the `prompt` input argument.
+            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that calls every `callback_steps` steps during inference. The function is called with the
+                following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function is called. If not specified, the callback is called at
+                every step.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
+                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
+                otherwise a `tuple` is returned where the first element is a list with the generated images and the
+                second element is a list of `bool`s indicating whether the corresponding generated image contains
+                "not-safe-for-work" (nsfw) content.
+        """
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            rgb,
+            noise_level,
+            callback_steps,
+            negative_prompt,
+            prompt_embeds,
+            negative_prompt_embeds,
+        )
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        # 3. Encode input prompt
+        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+            prompt,
+            device,
+            num_images_per_prompt,
+            do_classifier_free_guidance,
+            negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+        )
+        # For classifier free guidance, we need to do two forward passes.
+        # Here we concatenate the unconditional and text embeddings into a single batch
+        # to avoid doing two forward passes
+        if do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+        # 4. Preprocess image
+        rgb, depth = self.image_processor.preprocess(rgb, depth, target_res=target_res)
+        rgb = rgb.to(dtype=prompt_embeds.dtype, device=device)
+        depth = depth.to(dtype=prompt_embeds.dtype, device=device)
+
+        # 5. set timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps
+
+        # 6. Encode low resolutiom image to latent space
+        image = torch.cat([rgb, depth], axis=1)
+        latent_space_image = self.vae.encode(image).latent_dist.sample(generator)
+        latent_space_image *= self.vae.scaling_factor
+        noise_level = torch.tensor([noise_level], dtype=torch.long, device=device)
+        # noise_rgb = randn_tensor(rgb.shape, generator=generator, device=device, dtype=prompt_embeds.dtype)
+        # rgb = self.low_res_scheduler.add_noise(rgb, noise_rgb, noise_level)
+        # noise_depth = randn_tensor(depth.shape, generator=generator, device=device, dtype=prompt_embeds.dtype)
+        # depth = self.low_res_scheduler.add_noise(depth, noise_depth, noise_level)
+
+        batch_multiplier = 2 if do_classifier_free_guidance else 1
+        latent_space_image = torch.cat([latent_space_image] * batch_multiplier * num_images_per_prompt)
+        noise_level = torch.cat([noise_level] * latent_space_image.shape[0])
+
+        # 7. Prepare latent variables
+        height, width = latent_space_image.shape[2:]
+        num_channels_latents = self.vae.config.latent_channels
+
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 8. Check that sizes of image and latents match
+        num_channels_image = latent_space_image.shape[1]
+        if num_channels_latents + num_channels_image != self.unet.config.in_channels:
+            raise ValueError(
+                f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects"
+                f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +"
+                f" `num_channels_image`: {num_channels_image} "
+                f" = {num_channels_latents+num_channels_image}. Please verify the config of"
+                " `pipeline.unet` or your `image` input."
+            )
+
+        # 9. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 10. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+
+                # concat latents, mask, masked_image_latents in the channel dimension
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+                latent_model_input = torch.cat([latent_model_input, latent_space_image], dim=1)
+
+                # predict the noise residual
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    class_labels=noise_level,
+                    return_dict=False,
+                )[0]
+
+                # perform guidance
+                if do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        callback(i, t, latents)
+
+        if not output_type == "latent":
+            # make sure the VAE is in float32 mode, as it overflows in float16
+            needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast
+
+            if needs_upcasting:
+                self.upcast_vae()
+                latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
+
+            image = self.vae.decode(latents / self.vae.scaling_factor, return_dict=False)[0]
+
+            # cast back to fp16 if needed
+            if needs_upcasting:
+                self.vae.to(dtype=torch.float16)
+
+            image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+
+        else:
+            image = latents
+            has_nsfw_concept = None
+
+        if has_nsfw_concept is None:
+            do_denormalize = [True] * image.shape[0]
+        else:
+            do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
+
+        rgb, depth = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
+
+        # 11. Apply watermark
+        if output_type == "pil" and self.watermarker is not None:
+            rgb = self.watermarker.apply_watermark(rgb)
+
+        # Offload last model to CPU
+        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
+            self.final_offload_hook.offload()
+
+        if not return_dict:
+            return ((rgb, depth), has_nsfw_concept)
+
+        return LDM3DPipelineOutput(rgb=rgb, depth=depth, nsfw_content_detected=has_nsfw_concept)
--- a/examples/community/pipeline_stable_diffusion_xl_controlnet_adapter.py
+++ b/examples/community/pipeline_stable_diffusion_xl_controlnet_adapter.py
--- a/examples/community/pipeline_stable_diffusion_xl_controlnet_adapter_inpaint.py
+++ b/examples/community/pipeline_stable_diffusion_xl_controlnet_adapter_inpaint.py
--- a/examples/community/regional_prompting_stable_diffusion.py
+++ b/examples/community/regional_prompting_stable_diffusion.py
@@ -0,0 +1,620 @@
+import math
+from typing import Dict, Optional
+
+import torch
+import torchvision.transforms.functional as FF
+from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
+
+from diffusers import StableDiffusionPipeline
+from diffusers.models import AutoencoderKL, UNet2DConditionModel
+from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
+from diffusers.schedulers import KarrasDiffusionSchedulers
+from diffusers.utils import USE_PEFT_BACKEND
+
+
+try:
+    from compel import Compel
+except ImportError:
+    Compel = None
+
+KCOMM = "ADDCOMM"
+KBRK = "BREAK"
+
+
+class RegionalPromptingStableDiffusionPipeline(StableDiffusionPipeline):
+    r"""
+    Args for Regional Prompting Pipeline:
+        rp_args:dict
+        Required
+            rp_args["mode"]: cols, rows, prompt, prompt-ex
+        for cols, rows mode
+            rp_args["div"]: ex) 1;1;1(Divide into 3 regions)
+        for prompt, prompt-ex mode
+            rp_args["th"]: ex) 0.5,0.5,0.6 (threshold for prompt mode)
+
+        Optional
+            rp_args["save_mask"]: True/False (save masks in prompt mode)
+
+    Pipeline for text-to-image generation using Stable Diffusion.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder. Stable Diffusion uses the text portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please, refer to the [model card](https://huggingface.co/CompVis/stable-diffusion-v1-4) for details.
+        feature_extractor ([`CLIPImageProcessor`]):
+            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
+    """
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: KarrasDiffusionSchedulers,
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPFeatureExtractor,
+        requires_safety_checker: bool = True,
+    ):
+        super().__init__(
+            vae,
+            text_encoder,
+            tokenizer,
+            unet,
+            scheduler,
+            safety_checker,
+            feature_extractor,
+            requires_safety_checker,
+        )
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+        )
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: str,
+        height: int = 512,
+        width: int = 512,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        negative_prompt: str = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[torch.Generator] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        rp_args: Dict[str, str] = None,
+    ):
+        active = KBRK in prompt[0] if isinstance(prompt, list) else KBRK in prompt
+        if negative_prompt is None:
+            negative_prompt = "" if isinstance(prompt, str) else [""] * len(prompt)
+
+        device = self._execution_device
+        regions = 0
+
+        self.power = int(rp_args["power"]) if "power" in rp_args else 1
+
+        prompts = prompt if isinstance(prompt, list) else [prompt]
+        n_prompts = negative_prompt if isinstance(prompt, str) else [negative_prompt]
+        self.batch = batch = num_images_per_prompt * len(prompts)
+        all_prompts_cn, all_prompts_p = promptsmaker(prompts, num_images_per_prompt)
+        all_n_prompts_cn, _ = promptsmaker(n_prompts, num_images_per_prompt)
+
+        equal = len(all_prompts_cn) == len(all_n_prompts_cn)
+
+        if Compel:
+            compel = Compel(tokenizer=self.tokenizer, text_encoder=self.text_encoder)
+
+            def getcompelembs(prps):
+                embl = []
+                for prp in prps:
+                    embl.append(compel.build_conditioning_tensor(prp))
+                return torch.cat(embl)
+
+            conds = getcompelembs(all_prompts_cn)
+            unconds = getcompelembs(all_n_prompts_cn)
+            embs = getcompelembs(prompts)
+            n_embs = getcompelembs(n_prompts)
+            prompt = negative_prompt = None
+        else:
+            conds = self.encode_prompt(prompts, device, 1, True)[0]
+            unconds = (
+                self.encode_prompt(n_prompts, device, 1, True)[0]
+                if equal
+                else self.encode_prompt(all_n_prompts_cn, device, 1, True)[0]
+            )
+            embs = n_embs = None
+
+        if not active:
+            pcallback = None
+            mode = None
+        else:
+            if any(x in rp_args["mode"].upper() for x in ["COL", "ROW"]):
+                mode = "COL" if "COL" in rp_args["mode"].upper() else "ROW"
+                ocells, icells, regions = make_cells(rp_args["div"])
+
+            elif "PRO" in rp_args["mode"].upper():
+                regions = len(all_prompts_p[0])
+                mode = "PROMPT"
+                reset_attnmaps(self)
+                self.ex = "EX" in rp_args["mode"].upper()
+                self.target_tokens = target_tokens = tokendealer(self, all_prompts_p)
+                thresholds = [float(x) for x in rp_args["th"].split(",")]
+
+            orig_hw = (height, width)
+            revers = True
+
+            def pcallback(s_self, step: int, timestep: int, latents: torch.FloatTensor, selfs=None):
+                if "PRO" in mode:  # in Prompt mode, make masks from sum of attension maps
+                    self.step = step
+
+                    if len(self.attnmaps_sizes) > 3:
+                        self.history[step] = self.attnmaps.copy()
+                        for hw in self.attnmaps_sizes:
+                            allmasks = []
+                            basemasks = [None] * batch
+                            for tt, th in zip(target_tokens, thresholds):
+                                for b in range(batch):
+                                    key = f"{tt}-{b}"
+                                    _, mask, _ = makepmask(self, self.attnmaps[key], hw[0], hw[1], th, step)
+                                    mask = mask.unsqueeze(0).unsqueeze(-1)
+                                    if self.ex:
+                                        allmasks[b::batch] = [x - mask for x in allmasks[b::batch]]
+                                        allmasks[b::batch] = [torch.where(x > 0, 1, 0) for x in allmasks[b::batch]]
+                                    allmasks.append(mask)
+                                    basemasks[b] = mask if basemasks[b] is None else basemasks[b] + mask
+                            basemasks = [1 - mask for mask in basemasks]
+                            basemasks = [torch.where(x > 0, 1, 0) for x in basemasks]
+                            allmasks = basemasks + allmasks
+
+                            self.attnmasks[hw] = torch.cat(allmasks)
+                        self.maskready = True
+                return latents
+
+            def hook_forward(module):
+                # diffusers==0.23.2
+                def forward(
+                    hidden_states: torch.FloatTensor,
+                    encoder_hidden_states: Optional[torch.FloatTensor] = None,
+                    attention_mask: Optional[torch.FloatTensor] = None,
+                    temb: Optional[torch.FloatTensor] = None,
+                    scale: float = 1.0,
+                ) -> torch.Tensor:
+                    attn = module
+                    xshape = hidden_states.shape
+                    self.hw = (h, w) = split_dims(xshape[1], *orig_hw)
+
+                    if revers:
+                        nx, px = hidden_states.chunk(2)
+                    else:
+                        px, nx = hidden_states.chunk(2)
+
+                    if equal:
+                        hidden_states = torch.cat(
+                            [px for i in range(regions)] + [nx for i in range(regions)],
+                            0,
+                        )
+                        encoder_hidden_states = torch.cat([conds] + [unconds])
+                    else:
+                        hidden_states = torch.cat([px for i in range(regions)] + [nx], 0)
+                        encoder_hidden_states = torch.cat([conds] + [unconds])
+
+                    residual = hidden_states
+
+                    args = () if USE_PEFT_BACKEND else (scale,)
+
+                    if attn.spatial_norm is not None:
+                        hidden_states = attn.spatial_norm(hidden_states, temb)
+
+                    input_ndim = hidden_states.ndim
+
+                    if input_ndim == 4:
+                        batch_size, channel, height, width = hidden_states.shape
+                        hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+
+                    batch_size, sequence_length, _ = (
+                        hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+                    )
+
+                    if attention_mask is not None:
+                        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+                        attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
+
+                    if attn.group_norm is not None:
+                        hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+
+                    args = () if USE_PEFT_BACKEND else (scale,)
+                    query = attn.to_q(hidden_states, *args)
+
+                    if encoder_hidden_states is None:
+                        encoder_hidden_states = hidden_states
+                    elif attn.norm_cross:
+                        encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+
+                    key = attn.to_k(encoder_hidden_states, *args)
+                    value = attn.to_v(encoder_hidden_states, *args)
+
+                    inner_dim = key.shape[-1]
+                    head_dim = inner_dim // attn.heads
+
+                    query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+                    key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+                    value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+                    # the output of sdp = (batch, num_heads, seq_len, head_dim)
+                    # TODO: add support for attn.scale when we move to Torch 2.1
+                    hidden_states = scaled_dot_product_attention(
+                        self,
+                        query,
+                        key,
+                        value,
+                        attn_mask=attention_mask,
+                        dropout_p=0.0,
+                        is_causal=False,
+                        getattn="PRO" in mode,
+                    )
+
+                    hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+                    hidden_states = hidden_states.to(query.dtype)
+
+                    # linear proj
+                    hidden_states = attn.to_out[0](hidden_states, *args)
+                    # dropout
+                    hidden_states = attn.to_out[1](hidden_states)
+
+                    if input_ndim == 4:
+                        hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+
+                    if attn.residual_connection:
+                        hidden_states = hidden_states + residual
+
+                    hidden_states = hidden_states / attn.rescale_output_factor
+
+                    #### Regional Prompting Col/Row mode
+                    if any(x in mode for x in ["COL", "ROW"]):
+                        reshaped = hidden_states.reshape(hidden_states.size()[0], h, w, hidden_states.size()[2])
+                        center = reshaped.shape[0] // 2
+                        px = reshaped[0:center] if equal else reshaped[0:-batch]
+                        nx = reshaped[center:] if equal else reshaped[-batch:]
+                        outs = [px, nx] if equal else [px]
+                        for out in outs:
+                            c = 0
+                            for i, ocell in enumerate(ocells):
+                                for icell in icells[i]:
+                                    if "ROW" in mode:
+                                        out[
+                                            0:batch,
+                                            int(h * ocell[0]) : int(h * ocell[1]),
+                                            int(w * icell[0]) : int(w * icell[1]),
+                                            :,
+                                        ] = out[
+                                            c * batch : (c + 1) * batch,
+                                            int(h * ocell[0]) : int(h * ocell[1]),
+                                            int(w * icell[0]) : int(w * icell[1]),
+                                            :,
+                                        ]
+                                    else:
+                                        out[
+                                            0:batch,
+                                            int(h * icell[0]) : int(h * icell[1]),
+                                            int(w * ocell[0]) : int(w * ocell[1]),
+                                            :,
+                                        ] = out[
+                                            c * batch : (c + 1) * batch,
+                                            int(h * icell[0]) : int(h * icell[1]),
+                                            int(w * ocell[0]) : int(w * ocell[1]),
+                                            :,
+                                        ]
+                                    c += 1
+                        px, nx = (px[0:batch], nx[0:batch]) if equal else (px[0:batch], nx)
+                        hidden_states = torch.cat([nx, px], 0) if revers else torch.cat([px, nx], 0)
+                        hidden_states = hidden_states.reshape(xshape)
+
+                    #### Regional Prompting Prompt mode
+                    elif "PRO" in mode:
+                        px, nx = (
+                            torch.chunk(hidden_states) if equal else hidden_states[0:-batch],
+                            hidden_states[-batch:],
+                        )
+
+                        if (h, w) in self.attnmasks and self.maskready:
+
+                            def mask(input):
+                                out = torch.multiply(input, self.attnmasks[(h, w)])
+                                for b in range(batch):
+                                    for r in range(1, regions):
+                                        out[b] = out[b] + out[r * batch + b]
+                                return out
+
+                            px, nx = (mask(px), mask(nx)) if equal else (mask(px), nx)
+                        px, nx = (px[0:batch], nx[0:batch]) if equal else (px[0:batch], nx)
+                        hidden_states = torch.cat([nx, px], 0) if revers else torch.cat([px, nx], 0)
+                    return hidden_states
+
+                return forward
+
+            def hook_forwards(root_module: torch.nn.Module):
+                for name, module in root_module.named_modules():
+                    if "attn2" in name and module.__class__.__name__ == "Attention":
+                        module.forward = hook_forward(module)
+
+            hook_forwards(self.unet)
+
+        output = StableDiffusionPipeline(**self.components)(
+            prompt=prompt,
+            prompt_embeds=embs,
+            negative_prompt=negative_prompt,
+            negative_prompt_embeds=n_embs,
+            height=height,
+            width=width,
+            num_inference_steps=num_inference_steps,
+            guidance_scale=guidance_scale,
+            num_images_per_prompt=num_images_per_prompt,
+            eta=eta,
+            generator=generator,
+            latents=latents,
+            output_type=output_type,
+            return_dict=return_dict,
+            callback_on_step_end=pcallback,
+        )
+
+        if "save_mask" in rp_args:
+            save_mask = rp_args["save_mask"]
+        else:
+            save_mask = False
+
+        if mode == "PROMPT" and save_mask:
+            saveattnmaps(
+                self,
+                output,
+                height,
+                width,
+                thresholds,
+                num_inference_steps // 2,
+                regions,
+            )
+
+        return output
+
+
+### Make prompt list for each regions
+def promptsmaker(prompts, batch):
+    out_p = []
+    plen = len(prompts)
+    for prompt in prompts:
+        add = ""
+        if KCOMM in prompt:
+            add, prompt = prompt.split(KCOMM)
+            add = add + " "
+        prompts = prompt.split(KBRK)
+        out_p.append([add + p for p in prompts])
+    out = [None] * batch * len(out_p[0]) * len(out_p)
+    for p, prs in enumerate(out_p):  # inputs prompts
+        for r, pr in enumerate(prs):  # prompts for regions
+            start = (p + r * plen) * batch
+            out[start : start + batch] = [pr] * batch  # P1R1B1,P1R1B2...,P1R2B1,P1R2B2...,P2R1B1...
+    return out, out_p
+
+
+### make regions from ratios
+### ";" makes outercells, "," makes inner cells
+def make_cells(ratios):
+    if ";" not in ratios and "," in ratios:
+        ratios = ratios.replace(",", ";")
+    ratios = ratios.split(";")
+    ratios = [inratios.split(",") for inratios in ratios]
+
+    icells = []
+    ocells = []
+
+    def startend(cells, array):
+        current_start = 0
+        array = [float(x) for x in array]
+        for value in array:
+            end = current_start + (value / sum(array))
+            cells.append([current_start, end])
+            current_start = end
+
+    startend(ocells, [r[0] for r in ratios])
+
+    for inratios in ratios:
+        if 2 > len(inratios):
+            icells.append([[0, 1]])
+        else:
+            add = []
+            startend(add, inratios[1:])
+            icells.append(add)
+
+    return ocells, icells, sum(len(cell) for cell in icells)
+
+
+def make_emblist(self, prompts):
+    with torch.no_grad():
+        tokens = self.tokenizer(
+            prompts,
+            max_length=self.tokenizer.model_max_length,
+            padding=True,
+            truncation=True,
+            return_tensors="pt",
+        ).input_ids.to(self.device)
+        embs = self.text_encoder(tokens, output_hidden_states=True).last_hidden_state.to(self.device, dtype=self.dtype)
+    return embs
+
+
+def split_dims(xs, height, width):
+    xs = xs
+
+    def repeat_div(x, y):
+        while y > 0:
+            x = math.ceil(x / 2)
+            y = y - 1
+        return x
+
+    scale = math.ceil(math.log2(math.sqrt(height * width / xs)))
+    dsh = repeat_div(height, scale)
+    dsw = repeat_div(width, scale)
+    return dsh, dsw
+
+
+##### for prompt mode
+def get_attn_maps(self, attn):
+    height, width = self.hw
+    target_tokens = self.target_tokens
+    if (height, width) not in self.attnmaps_sizes:
+        self.attnmaps_sizes.append((height, width))
+
+    for b in range(self.batch):
+        for t in target_tokens:
+            power = self.power
+            add = attn[b, :, :, t[0] : t[0] + len(t)] ** (power) * (self.attnmaps_sizes.index((height, width)) + 1)
+            add = torch.sum(add, dim=2)
+            key = f"{t}-{b}"
+            if key not in self.attnmaps:
+                self.attnmaps[key] = add
+            else:
+                if self.attnmaps[key].shape[1] != add.shape[1]:
+                    add = add.view(8, height, width)
+                    add = FF.resize(add, self.attnmaps_sizes[0], antialias=None)
+                    add = add.reshape_as(self.attnmaps[key])
+
+                self.attnmaps[key] = self.attnmaps[key] + add
+
+
+def reset_attnmaps(self):  # init parameters in every batch
+    self.step = 0
+    self.attnmaps = {}  # maked from attention maps
+    self.attnmaps_sizes = []  # height,width set of u-net blocks
+    self.attnmasks = {}  # maked from attnmaps for regions
+    self.maskready = False
+    self.history = {}
+
+
+def saveattnmaps(self, output, h, w, th, step, regions):
+    masks = []
+    for i, mask in enumerate(self.history[step].values()):
+        img, _, mask = makepmask(self, mask, h, w, th[i % len(th)], step)
+        if self.ex:
+            masks = [x - mask for x in masks]
+            masks.append(mask)
+            if len(masks) == regions - 1:
+                output.images.extend([FF.to_pil_image(mask) for mask in masks])
+                masks = []
+        else:
+            output.images.append(img)
+
+
+def makepmask(
+    self, mask, h, w, th, step
+):  # make masks from attention cache return [for preview, for attention, for Latent]
+    th = th - step * 0.005
+    if 0.05 >= th:
+        th = 0.05
+    mask = torch.mean(mask, dim=0)
+    mask = mask / mask.max().item()
+    mask = torch.where(mask > th, 1, 0)
+    mask = mask.float()
+    mask = mask.view(1, *self.attnmaps_sizes[0])
+    img = FF.to_pil_image(mask)
+    img = img.resize((w, h))
+    mask = FF.resize(mask, (h, w), interpolation=FF.InterpolationMode.NEAREST, antialias=None)
+    lmask = mask
+    mask = mask.reshape(h * w)
+    mask = torch.where(mask > 0.1, 1, 0)
+    return img, mask, lmask
+
+
+def tokendealer(self, all_prompts):
+    for prompts in all_prompts:
+        targets = [p.split(",")[-1] for p in prompts[1:]]
+        tt = []
+
+        for target in targets:
+            ptokens = (
+                self.tokenizer(
+                    prompts,
+                    max_length=self.tokenizer.model_max_length,
+                    padding=True,
+                    truncation=True,
+                    return_tensors="pt",
+                ).input_ids
+            )[0]
+            ttokens = (
+                self.tokenizer(
+                    target,
+                    max_length=self.tokenizer.model_max_length,
+                    padding=True,
+                    truncation=True,
+                    return_tensors="pt",
+                ).input_ids
+            )[0]
+
+            tlist = []
+
+            for t in range(ttokens.shape[0] - 2):
+                for p in range(ptokens.shape[0]):
+                    if ttokens[t + 1] == ptokens[p]:
+                        tlist.append(p)
+            if tlist != []:
+                tt.append(tlist)
+
+    return tt
+
+
+def scaled_dot_product_attention(
+    self,
+    query,
+    key,
+    value,
+    attn_mask=None,
+    dropout_p=0.0,
+    is_causal=False,
+    scale=None,
+    getattn=False,
+) -> torch.Tensor:
+    # Efficient implementation equivalent to the following:
+    L, S = query.size(-2), key.size(-2)
+    scale_factor = 1 / math.sqrt(query.size(-1)) if scale is None else scale
+    attn_bias = torch.zeros(L, S, dtype=query.dtype, device=self.device)
+    if is_causal:
+        assert attn_mask is None
+        temp_mask = torch.ones(L, S, dtype=torch.bool).tril(diagonal=0)
+        attn_bias.masked_fill_(temp_mask.logical_not(), float("-inf"))
+        attn_bias.to(query.dtype)
+
+    if attn_mask is not None:
+        if attn_mask.dtype == torch.bool:
+            attn_mask.masked_fill_(attn_mask.logical_not(), float("-inf"))
+        else:
+            attn_bias += attn_mask
+    attn_weight = query @ key.transpose(-2, -1) * scale_factor
+    attn_weight += attn_bias
+    attn_weight = torch.softmax(attn_weight, dim=-1)
+    if getattn:
+        get_attn_maps(self, attn_weight)
+    attn_weight = torch.dropout(attn_weight, dropout_p, train=True)
+    return attn_weight @ value
--- a/examples/community/sde_drag.py
+++ b/examples/community/sde_drag.py
@@ -0,0 +1,594 @@
+import math
+import tempfile
+from typing import List, Optional
+
+import numpy as np
+import PIL.Image
+import torch
+from accelerate import Accelerator
+from torchvision import transforms
+from tqdm.auto import tqdm
+from transformers import CLIPTextModel, CLIPTokenizer
+
+from diffusers import AutoencoderKL, DiffusionPipeline, DPMSolverMultistepScheduler, UNet2DConditionModel
+from diffusers.loaders import AttnProcsLayers, LoraLoaderMixin
+from diffusers.models.attention_processor import (
+    AttnAddedKVProcessor,
+    AttnAddedKVProcessor2_0,
+    LoRAAttnAddedKVProcessor,
+    LoRAAttnProcessor,
+    LoRAAttnProcessor2_0,
+    SlicedAttnAddedKVProcessor,
+)
+from diffusers.optimization import get_scheduler
+
+
+class SdeDragPipeline(DiffusionPipeline):
+    r"""
+    Pipeline for image drag-and-drop editing using stochastic differential equations: https://arxiv.org/abs/2311.01410.
+    Please refer to the [official repository](https://github.com/ML-GSAI/SDE-Drag) for more information.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder. Stable Diffusion uses the text portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Please use
+            [`DDIMScheduler`].
+    """
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: DPMSolverMultistepScheduler,
+    ):
+        super().__init__()
+
+        self.register_modules(vae=vae, text_encoder=text_encoder, tokenizer=tokenizer, unet=unet, scheduler=scheduler)
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: str,
+        image: PIL.Image.Image,
+        mask_image: PIL.Image.Image,
+        source_points: List[List[int]],
+        target_points: List[List[int]],
+        t0: Optional[float] = 0.6,
+        steps: Optional[int] = 200,
+        step_size: Optional[int] = 2,
+        image_scale: Optional[float] = 0.3,
+        adapt_radius: Optional[int] = 5,
+        min_lora_scale: Optional[float] = 0.5,
+        generator: Optional[torch.Generator] = None,
+    ):
+        r"""
+        Function invoked when calling the pipeline for image editing.
+        Args:
+            prompt (`str`, *required*):
+                The prompt to guide the image editing.
+            image (`PIL.Image.Image`, *required*):
+                Which will be edited, parts of the image will be masked out with `mask_image` and edited
+                according to `prompt`.
+            mask_image (`PIL.Image.Image`, *required*):
+                To mask `image`. White pixels in the mask will be edited, while black pixels will be preserved.
+            source_points (`List[List[int]]`, *required*):
+                Used to mark the starting positions of drag editing in the image, with each pixel represented as a
+                `List[int]` of length 2.
+            target_points (`List[List[int]]`, *required*):
+                Used to mark the target positions of drag editing in the image, with each pixel represented as a
+                `List[int]` of length 2.
+            t0 (`float`, *optional*, defaults to 0.6):
+                The time parameter. Higher t0 improves the fidelity while lowering the faithfulness of the edited images
+                and vice versa.
+            steps (`int`, *optional*, defaults to 200):
+                The number of sampling iterations.
+            step_size (`int`, *optional*, defaults to 2):
+                The drag diatance of each drag step.
+            image_scale (`float`, *optional*, defaults to 0.3):
+                To avoid duplicating the content, use image_scale to perturbs the source.
+            adapt_radius (`int`, *optional*, defaults to 5):
+                The size of the region for copy and paste operations during each step of the drag process.
+            min_lora_scale (`float`, *optional*, defaults to 0.5):
+                A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+                min_lora_scale specifies the minimum LoRA scale during the image drag-editing process.
+            generator ('torch.Generator', *optional*, defaults to None):
+                To make generation deterministic(https://pytorch.org/docs/stable/generated/torch.Generator.html).
+        Examples:
+        ```py
+        >>> import PIL
+        >>> import torch
+        >>> from diffusers import DDIMScheduler, DiffusionPipeline
+
+        >>> # Load the pipeline
+        >>> model_path = "runwayml/stable-diffusion-v1-5"
+        >>> scheduler = DDIMScheduler.from_pretrained(model_path, subfolder="scheduler")
+        >>> pipe = DiffusionPipeline.from_pretrained(model_path, scheduler=scheduler, custom_pipeline="sde_drag")
+        >>> pipe.to('cuda')
+
+        >>> # To save GPU memory, torch.float16 can be used, but it may compromise image quality.
+        >>> # If not training LoRA, please avoid using torch.float16
+        >>> # pipe.to(torch.float16)
+
+        >>> # Provide prompt, image, mask image, and the starting and target points for drag editing.
+        >>> prompt = "prompt of the image"
+        >>> image = PIL.Image.open('/path/to/image')
+        >>> mask_image = PIL.Image.open('/path/to/mask_image')
+        >>> source_points = [[123, 456]]
+        >>> target_points = [[234, 567]]
+
+        >>> # train_lora is optional, and in most cases, using train_lora can better preserve consistency with the original image.
+        >>> pipe.train_lora(prompt, image)
+
+        >>> output = pipe(prompt, image, mask_image, source_points, target_points)
+        >>> output_image = PIL.Image.fromarray(output)
+        >>> output_image.save("./output.png")
+        ```
+        """
+
+        self.scheduler.set_timesteps(steps)
+
+        noise_scale = (1 - image_scale**2) ** (0.5)
+
+        text_embeddings = self._get_text_embed(prompt)
+        uncond_embeddings = self._get_text_embed([""])
+        text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
+
+        latent = self._get_img_latent(image)
+
+        mask = mask_image.resize((latent.shape[3], latent.shape[2]))
+        mask = torch.tensor(np.array(mask))
+        mask = mask.unsqueeze(0).expand_as(latent).to(self.device)
+
+        source_points = torch.tensor(source_points).div(torch.tensor([8]), rounding_mode="trunc")
+        target_points = torch.tensor(target_points).div(torch.tensor([8]), rounding_mode="trunc")
+
+        distance = target_points - source_points
+        distance_norm_max = torch.norm(distance.float(), dim=1, keepdim=True).max()
+
+        if distance_norm_max <= step_size:
+            drag_num = 1
+        else:
+            drag_num = distance_norm_max.div(torch.tensor([step_size]), rounding_mode="trunc")
+            if (distance_norm_max / drag_num - step_size).abs() > (
+                distance_norm_max / (drag_num + 1) - step_size
+            ).abs():
+                drag_num += 1
+
+        latents = []
+        for i in tqdm(range(int(drag_num)), desc="SDE Drag"):
+            source_new = source_points + (i / drag_num * distance).to(torch.int)
+            target_new = source_points + ((i + 1) / drag_num * distance).to(torch.int)
+
+            latent, noises, hook_latents, lora_scales, cfg_scales = self._forward(
+                latent, steps, t0, min_lora_scale, text_embeddings, generator
+            )
+            latent = self._copy_and_paste(
+                latent,
+                source_new,
+                target_new,
+                adapt_radius,
+                latent.shape[2] - 1,
+                latent.shape[3] - 1,
+                image_scale,
+                noise_scale,
+                generator,
+            )
+            latent = self._backward(
+                latent, mask, steps, t0, noises, hook_latents, lora_scales, cfg_scales, text_embeddings, generator
+            )
+
+            latents.append(latent)
+
+        result_image = 1 / 0.18215 * latents[-1]
+
+        with torch.no_grad():
+            result_image = self.vae.decode(result_image).sample
+
+        result_image = (result_image / 2 + 0.5).clamp(0, 1)
+        result_image = result_image.cpu().permute(0, 2, 3, 1).numpy()[0]
+        result_image = (result_image * 255).astype(np.uint8)
+
+        return result_image
+
+    def train_lora(self, prompt, image, lora_step=100, lora_rank=16, generator=None):
+        accelerator = Accelerator(gradient_accumulation_steps=1, mixed_precision="fp16")
+
+        self.vae.requires_grad_(False)
+        self.text_encoder.requires_grad_(False)
+        self.unet.requires_grad_(False)
+
+        unet_lora_attn_procs = {}
+        for name, attn_processor in self.unet.attn_processors.items():
+            cross_attention_dim = None if name.endswith("attn1.processor") else self.unet.config.cross_attention_dim
+            if name.startswith("mid_block"):
+                hidden_size = self.unet.config.block_out_channels[-1]
+            elif name.startswith("up_blocks"):
+                block_id = int(name[len("up_blocks.")])
+                hidden_size = list(reversed(self.unet.config.block_out_channels))[block_id]
+            elif name.startswith("down_blocks"):
+                block_id = int(name[len("down_blocks.")])
+                hidden_size = self.unet.config.block_out_channels[block_id]
+            else:
+                raise NotImplementedError("name must start with up_blocks, mid_blocks, or down_blocks")
+
+            if isinstance(attn_processor, (AttnAddedKVProcessor, SlicedAttnAddedKVProcessor, AttnAddedKVProcessor2_0)):
+                lora_attn_processor_class = LoRAAttnAddedKVProcessor
+            else:
+                lora_attn_processor_class = (
+                    LoRAAttnProcessor2_0
+                    if hasattr(torch.nn.functional, "scaled_dot_product_attention")
+                    else LoRAAttnProcessor
+                )
+            unet_lora_attn_procs[name] = lora_attn_processor_class(
+                hidden_size=hidden_size, cross_attention_dim=cross_attention_dim, rank=lora_rank
+            )
+
+        self.unet.set_attn_processor(unet_lora_attn_procs)
+        unet_lora_layers = AttnProcsLayers(self.unet.attn_processors)
+        params_to_optimize = unet_lora_layers.parameters()
+
+        optimizer = torch.optim.AdamW(
+            params_to_optimize,
+            lr=2e-4,
+            betas=(0.9, 0.999),
+            weight_decay=1e-2,
+            eps=1e-08,
+        )
+
+        lr_scheduler = get_scheduler(
+            "constant",
+            optimizer=optimizer,
+            num_warmup_steps=0,
+            num_training_steps=lora_step,
+            num_cycles=1,
+            power=1.0,
+        )
+
+        unet_lora_layers = accelerator.prepare_model(unet_lora_layers)
+        optimizer = accelerator.prepare_optimizer(optimizer)
+        lr_scheduler = accelerator.prepare_scheduler(lr_scheduler)
+
+        with torch.no_grad():
+            text_inputs = self._tokenize_prompt(prompt, tokenizer_max_length=None)
+            text_embedding = self._encode_prompt(
+                text_inputs.input_ids, text_inputs.attention_mask, text_encoder_use_attention_mask=False
+            )
+
+        image_transforms = transforms.Compose(
+            [
+                transforms.ToTensor(),
+                transforms.Normalize([0.5], [0.5]),
+            ]
+        )
+
+        image = image_transforms(image).to(self.device, dtype=self.vae.dtype)
+        image = image.unsqueeze(dim=0)
+        latents_dist = self.vae.encode(image).latent_dist
+
+        for _ in tqdm(range(lora_step), desc="Train LoRA"):
+            self.unet.train()
+            model_input = latents_dist.sample() * self.vae.config.scaling_factor
+
+            # Sample noise that we'll add to the latents
+            noise = torch.randn(
+                model_input.size(),
+                dtype=model_input.dtype,
+                layout=model_input.layout,
+                device=model_input.device,
+                generator=generator,
+            )
+            bsz, channels, height, width = model_input.shape
+
+            # Sample a random timestep for each image
+            timesteps = torch.randint(
+                0, self.scheduler.config.num_train_timesteps, (bsz,), device=model_input.device, generator=generator
+            )
+            timesteps = timesteps.long()
+
+            # Add noise to the model input according to the noise magnitude at each timestep
+            # (this is the forward diffusion process)
+            noisy_model_input = self.scheduler.add_noise(model_input, noise, timesteps)
+
+            # Predict the noise residual
+            model_pred = self.unet(noisy_model_input, timesteps, text_embedding).sample
+
+            # Get the target for loss depending on the prediction type
+            if self.scheduler.config.prediction_type == "epsilon":
+                target = noise
+            elif self.scheduler.config.prediction_type == "v_prediction":
+                target = self.scheduler.get_velocity(model_input, noise, timesteps)
+            else:
+                raise ValueError(f"Unknown prediction type {self.scheduler.config.prediction_type}")
+
+            loss = torch.nn.functional.mse_loss(model_pred.float(), target.float(), reduction="mean")
+            accelerator.backward(loss)
+            optimizer.step()
+            lr_scheduler.step()
+            optimizer.zero_grad()
+
+        with tempfile.TemporaryDirectory() as save_lora_dir:
+            LoraLoaderMixin.save_lora_weights(
+                save_directory=save_lora_dir,
+                unet_lora_layers=unet_lora_layers,
+                text_encoder_lora_layers=None,
+            )
+
+            self.unet.load_attn_procs(save_lora_dir)
+
+    def _tokenize_prompt(self, prompt, tokenizer_max_length=None):
+        if tokenizer_max_length is not None:
+            max_length = tokenizer_max_length
+        else:
+            max_length = self.tokenizer.model_max_length
+
+        text_inputs = self.tokenizer(
+            prompt,
+            truncation=True,
+            padding="max_length",
+            max_length=max_length,
+            return_tensors="pt",
+        )
+
+        return text_inputs
+
+    def _encode_prompt(self, input_ids, attention_mask, text_encoder_use_attention_mask=False):
+        text_input_ids = input_ids.to(self.device)
+
+        if text_encoder_use_attention_mask:
+            attention_mask = attention_mask.to(self.device)
+        else:
+            attention_mask = None
+
+        prompt_embeds = self.text_encoder(
+            text_input_ids,
+            attention_mask=attention_mask,
+        )
+        prompt_embeds = prompt_embeds[0]
+
+        return prompt_embeds
+
+    @torch.no_grad()
+    def _get_text_embed(self, prompt):
+        text_input = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+        text_embeddings = self.text_encoder(text_input.input_ids.to(self.device))[0]
+        return text_embeddings
+
+    def _copy_and_paste(
+        self, latent, source_new, target_new, adapt_radius, max_height, max_width, image_scale, noise_scale, generator
+    ):
+        def adaption_r(source, target, adapt_radius, max_height, max_width):
+            r_x_lower = min(adapt_radius, source[0], target[0])
+            r_x_upper = min(adapt_radius, max_width - source[0], max_width - target[0])
+            r_y_lower = min(adapt_radius, source[1], target[1])
+            r_y_upper = min(adapt_radius, max_height - source[1], max_height - target[1])
+            return r_x_lower, r_x_upper, r_y_lower, r_y_upper
+
+        for source_, target_ in zip(source_new, target_new):
+            r_x_lower, r_x_upper, r_y_lower, r_y_upper = adaption_r(
+                source_, target_, adapt_radius, max_height, max_width
+            )
+
+            source_feature = latent[
+                :, :, source_[1] - r_y_lower : source_[1] + r_y_upper, source_[0] - r_x_lower : source_[0] + r_x_upper
+            ].clone()
+
+            latent[
+                :, :, source_[1] - r_y_lower : source_[1] + r_y_upper, source_[0] - r_x_lower : source_[0] + r_x_upper
+            ] = image_scale * source_feature + noise_scale * torch.randn(
+                latent.shape[0],
+                4,
+                r_y_lower + r_y_upper,
+                r_x_lower + r_x_upper,
+                device=self.device,
+                generator=generator,
+            )
+
+            latent[
+                :, :, target_[1] - r_y_lower : target_[1] + r_y_upper, target_[0] - r_x_lower : target_[0] + r_x_upper
+            ] = source_feature * 1.1
+        return latent
+
+    @torch.no_grad()
+    def _get_img_latent(self, image, height=None, weight=None):
+        data = image.convert("RGB")
+        if height is not None:
+            data = data.resize((weight, height))
+        transform = transforms.ToTensor()
+        data = transform(data).unsqueeze(0)
+        data = (data * 2.0) - 1.0
+        data = data.to(self.device, dtype=self.vae.dtype)
+        latent = self.vae.encode(data).latent_dist.sample()
+        latent = 0.18215 * latent
+        return latent
+
+    @torch.no_grad()
+    def _get_eps(self, latent, timestep, guidance_scale, text_embeddings, lora_scale=None):
+        latent_model_input = torch.cat([latent] * 2) if guidance_scale > 1.0 else latent
+        text_embeddings = text_embeddings if guidance_scale > 1.0 else text_embeddings.chunk(2)[1]
+
+        cross_attention_kwargs = None if lora_scale is None else {"scale": lora_scale}
+
+        with torch.no_grad():
+            noise_pred = self.unet(
+                latent_model_input,
+                timestep,
+                encoder_hidden_states=text_embeddings,
+                cross_attention_kwargs=cross_attention_kwargs,
+            ).sample
+
+        if guidance_scale > 1.0:
+            noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+        elif guidance_scale == 1.0:
+            noise_pred_text = noise_pred
+            noise_pred_uncond = 0.0
+        else:
+            raise NotImplementedError(guidance_scale)
+        noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+        return noise_pred
+
+    def _forward_sde(
+        self, timestep, sample, guidance_scale, text_embeddings, steps, eta=1.0, lora_scale=None, generator=None
+    ):
+        num_train_timesteps = len(self.scheduler)
+        alphas_cumprod = self.scheduler.alphas_cumprod
+        initial_alpha_cumprod = torch.tensor(1.0)
+
+        prev_timestep = timestep + num_train_timesteps // steps
+
+        alpha_prod_t = alphas_cumprod[timestep] if timestep >= 0 else initial_alpha_cumprod
+        alpha_prod_t_prev = alphas_cumprod[prev_timestep]
+
+        beta_prod_t_prev = 1 - alpha_prod_t_prev
+
+        x_prev = (alpha_prod_t_prev / alpha_prod_t) ** (0.5) * sample + (1 - alpha_prod_t_prev / alpha_prod_t) ** (
+            0.5
+        ) * torch.randn(
+            sample.size(), dtype=sample.dtype, layout=sample.layout, device=self.device, generator=generator
+        )
+        eps = self._get_eps(x_prev, prev_timestep, guidance_scale, text_embeddings, lora_scale)
+
+        sigma_t_prev = (
+            eta
+            * (1 - alpha_prod_t) ** (0.5)
+            * (1 - alpha_prod_t_prev / (1 - alpha_prod_t_prev) * (1 - alpha_prod_t) / alpha_prod_t) ** (0.5)
+        )
+
+        pred_original_sample = (x_prev - beta_prod_t_prev ** (0.5) * eps) / alpha_prod_t_prev ** (0.5)
+        pred_sample_direction_coeff = (1 - alpha_prod_t - sigma_t_prev**2) ** (0.5)
+
+        noise = (
+            sample - alpha_prod_t ** (0.5) * pred_original_sample - pred_sample_direction_coeff * eps
+        ) / sigma_t_prev
+
+        return x_prev, noise
+
+    def _sample(
+        self,
+        timestep,
+        sample,
+        guidance_scale,
+        text_embeddings,
+        steps,
+        sde=False,
+        noise=None,
+        eta=1.0,
+        lora_scale=None,
+        generator=None,
+    ):
+        num_train_timesteps = len(self.scheduler)
+        alphas_cumprod = self.scheduler.alphas_cumprod
+        final_alpha_cumprod = torch.tensor(1.0)
+
+        eps = self._get_eps(sample, timestep, guidance_scale, text_embeddings, lora_scale)
+
+        prev_timestep = timestep - num_train_timesteps // steps
+
+        alpha_prod_t = alphas_cumprod[timestep]
+        alpha_prod_t_prev = alphas_cumprod[prev_timestep] if prev_timestep >= 0 else final_alpha_cumprod
+
+        beta_prod_t = 1 - alpha_prod_t
+
+        sigma_t = (
+            eta
+            * ((1 - alpha_prod_t_prev) / (1 - alpha_prod_t)) ** (0.5)
+            * (1 - alpha_prod_t / alpha_prod_t_prev) ** (0.5)
+            if sde
+            else 0
+        )
+
+        pred_original_sample = (sample - beta_prod_t ** (0.5) * eps) / alpha_prod_t ** (0.5)
+        pred_sample_direction_coeff = (1 - alpha_prod_t_prev - sigma_t**2) ** (0.5)
+
+        noise = (
+            torch.randn(
+                sample.size(), dtype=sample.dtype, layout=sample.layout, device=self.device, generator=generator
+            )
+            if noise is None
+            else noise
+        )
+        latent = (
+            alpha_prod_t_prev ** (0.5) * pred_original_sample + pred_sample_direction_coeff * eps + sigma_t * noise
+        )
+
+        return latent
+
+    def _forward(self, latent, steps, t0, lora_scale_min, text_embeddings, generator):
+        def scale_schedule(begin, end, n, length, type="linear"):
+            if type == "constant":
+                return end
+            elif type == "linear":
+                return begin + (end - begin) * n / length
+            elif type == "cos":
+                factor = (1 - math.cos(n * math.pi / length)) / 2
+                return (1 - factor) * begin + factor * end
+            else:
+                raise NotImplementedError(type)
+
+        noises = []
+        latents = []
+        lora_scales = []
+        cfg_scales = []
+        latents.append(latent)
+        t0 = int(t0 * steps)
+        t_begin = steps - t0
+
+        length = len(self.scheduler.timesteps[t_begin - 1 : -1]) - 1
+        index = 1
+        for t in self.scheduler.timesteps[t_begin:].flip(dims=[0]):
+            lora_scale = scale_schedule(1, lora_scale_min, index, length, type="cos")
+            cfg_scale = scale_schedule(1, 3.0, index, length, type="linear")
+            latent, noise = self._forward_sde(
+                t, latent, cfg_scale, text_embeddings, steps, lora_scale=lora_scale, generator=generator
+            )
+
+            noises.append(noise)
+            latents.append(latent)
+            lora_scales.append(lora_scale)
+            cfg_scales.append(cfg_scale)
+            index += 1
+        return latent, noises, latents, lora_scales, cfg_scales
+
+    def _backward(
+        self, latent, mask, steps, t0, noises, hook_latents, lora_scales, cfg_scales, text_embeddings, generator
+    ):
+        t0 = int(t0 * steps)
+        t_begin = steps - t0
+
+        hook_latent = hook_latents.pop()
+        latent = torch.where(mask > 128, latent, hook_latent)
+        for t in self.scheduler.timesteps[t_begin - 1 : -1]:
+            latent = self._sample(
+                t,
+                latent,
+                cfg_scales.pop(),
+                text_embeddings,
+                steps,
+                sde=True,
+                noise=noises.pop(),
+                lora_scale=lora_scales.pop(),
+                generator=generator,
+            )
+            hook_latent = hook_latents.pop()
+            latent = torch.where(mask > 128, latent, hook_latent)
+        return latent
--- a/examples/community/stable_diffusion_ipex.py
+++ b/examples/community/stable_diffusion_ipex.py
@@ -21,7 +21,7 @@ from packaging import version
 from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer

 from diffusers.configuration_utils import FrozenDict
-from diffusers.loaders import TextualInversionLoaderMixin
+from diffusers.loaders import LoraLoaderMixin, TextualInversionLoaderMixin
 from diffusers.models import AutoencoderKL, UNet2DConditionModel
 from diffusers.pipelines.pipeline_utils import DiffusionPipeline
 from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
@@ -62,7 +62,7 @@ EXAMPLE_DOC_STRING = """
 """


-class StableDiffusionIPEXPipeline(DiffusionPipeline, TextualInversionLoaderMixin):
+class StableDiffusionIPEXPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin):
    r"""
    Pipeline for text-to-image generation using Stable Diffusion on IPEX.

--- a/examples/community/stable_diffusion_tensorrt_img2img.py
+++ b/examples/community/stable_diffusion_tensorrt_img2img.py
@@ -28,6 +28,7 @@ import PIL.Image
 import tensorrt as trt
 import torch
 from huggingface_hub import snapshot_download
+from huggingface_hub.utils import validate_hf_hub_args
 from onnx import shape_inference
 from polygraphy import cuda
 from polygraphy.backend.common import bytes_from_path
@@ -41,7 +42,7 @@ from polygraphy.backend.trt import (
    save_engine,
 )
 from polygraphy.backend.trt import util as trt_util
-from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
+from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection

 from diffusers.models import AutoencoderKL, UNet2DConditionModel
 from diffusers.pipelines.stable_diffusion import (
@@ -49,8 +50,9 @@ from diffusers.pipelines.stable_diffusion import (
    StableDiffusionPipelineOutput,
    StableDiffusionSafetyChecker,
 )
+from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img import retrieve_latents
 from diffusers.schedulers import DDIMScheduler
-from diffusers.utils import DIFFUSERS_CACHE, logging
+from diffusers.utils import logging


 """
@@ -607,7 +609,7 @@ class TorchVAEEncoder(torch.nn.Module):
        self.vae_encoder = model

    def forward(self, x):
-        return self.vae_encoder.encode(x).latent_dist.sample()
+        return retrieve_latents(self.vae_encoder.encode(x))


 class VAEEncoder(BaseModel):
@@ -709,6 +711,7 @@ class TensorRTStableDiffusionImg2ImgPipeline(StableDiffusionImg2ImgPipeline):
        scheduler: DDIMScheduler,
        safety_checker: StableDiffusionSafetyChecker,
        feature_extractor: CLIPFeatureExtractor,
+        image_encoder: CLIPVisionModelWithProjection = None,
        requires_safety_checker: bool = True,
        stages=["clip", "unet", "vae", "vae_encoder"],
        image_height: int = 512,
@@ -724,7 +727,15 @@ class TensorRTStableDiffusionImg2ImgPipeline(StableDiffusionImg2ImgPipeline):
        timing_cache: str = "timing_cache",
    ):
        super().__init__(
-            vae, text_encoder, tokenizer, unet, scheduler, safety_checker, feature_extractor, requires_safety_checker
+            vae,
+            text_encoder,
+            tokenizer,
+            unet,
+            scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+            image_encoder=image_encoder,
+            requires_safety_checker=requires_safety_checker,
        )

        self.vae.forward = self.vae.decode
@@ -769,12 +780,13 @@ class TensorRTStableDiffusionImg2ImgPipeline(StableDiffusionImg2ImgPipeline):
            self.models["vae_encoder"] = make_VAEEncoder(self.vae, **models_args)

    @classmethod
+    @validate_hf_hub_args
    def set_cached_folder(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], **kwargs):
-        cache_dir = kwargs.pop("cache_dir", DIFFUSERS_CACHE)
+        cache_dir = kwargs.pop("cache_dir", None)
        resume_download = kwargs.pop("resume_download", False)
        proxies = kwargs.pop("proxies", None)
        local_files_only = kwargs.pop("local_files_only", False)
-        use_auth_token = kwargs.pop("use_auth_token", None)
+        token = kwargs.pop("token", None)
        revision = kwargs.pop("revision", None)

        cls.cached_folder = (
@@ -786,7 +798,7 @@ class TensorRTStableDiffusionImg2ImgPipeline(StableDiffusionImg2ImgPipeline):
                resume_download=resume_download,
                proxies=proxies,
                local_files_only=local_files_only,
-                use_auth_token=use_auth_token,
+                token=token,
                revision=revision,
            )
        )
@@ -993,7 +1005,7 @@ class TensorRTStableDiffusionImg2ImgPipeline(StableDiffusionImg2ImgPipeline):
        """
        self.generator = generator
        self.denoising_steps = num_inference_steps
-        self.guidance_scale = guidance_scale
+        self._guidance_scale = guidance_scale

        # Pre-compute latent input scales and linear multistep coefficients
        self.scheduler.set_timesteps(self.denoising_steps, device=self.torch_device)
--- a/examples/community/stable_diffusion_tensorrt_inpaint.py
+++ b/examples/community/stable_diffusion_tensorrt_inpaint.py
@@ -28,6 +28,7 @@ import PIL.Image
 import tensorrt as trt
 import torch
 from huggingface_hub import snapshot_download
+from huggingface_hub.utils import validate_hf_hub_args
 from onnx import shape_inference
 from polygraphy import cuda
 from polygraphy.backend.common import bytes_from_path
@@ -41,7 +42,7 @@ from polygraphy.backend.trt import (
    save_engine,
 )
 from polygraphy.backend.trt import util as trt_util
-from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
+from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection

 from diffusers.models import AutoencoderKL, UNet2DConditionModel
 from diffusers.pipelines.stable_diffusion import (
@@ -51,7 +52,7 @@ from diffusers.pipelines.stable_diffusion import (
 )
 from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_inpaint import prepare_mask_and_masked_image
 from diffusers.schedulers import DDIMScheduler
-from diffusers.utils import DIFFUSERS_CACHE, logging
+from diffusers.utils import logging


 """
@@ -710,6 +711,7 @@ class TensorRTStableDiffusionInpaintPipeline(StableDiffusionInpaintPipeline):
        scheduler: DDIMScheduler,
        safety_checker: StableDiffusionSafetyChecker,
        feature_extractor: CLIPFeatureExtractor,
+        image_encoder: CLIPVisionModelWithProjection = None,
        requires_safety_checker: bool = True,
        stages=["clip", "unet", "vae", "vae_encoder"],
        image_height: int = 512,
@@ -725,7 +727,15 @@ class TensorRTStableDiffusionInpaintPipeline(StableDiffusionInpaintPipeline):
        timing_cache: str = "timing_cache",
    ):
        super().__init__(
-            vae, text_encoder, tokenizer, unet, scheduler, safety_checker, feature_extractor, requires_safety_checker
+            vae,
+            text_encoder,
+            tokenizer,
+            unet,
+            scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+            image_encoder=image_encoder,
+            requires_safety_checker=requires_safety_checker,
        )

        self.vae.forward = self.vae.decode
@@ -770,12 +780,13 @@ class TensorRTStableDiffusionInpaintPipeline(StableDiffusionInpaintPipeline):
            self.models["vae_encoder"] = make_VAEEncoder(self.vae, **models_args)

    @classmethod
+    @validate_hf_hub_args
    def set_cached_folder(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], **kwargs):
-        cache_dir = kwargs.pop("cache_dir", DIFFUSERS_CACHE)
+        cache_dir = kwargs.pop("cache_dir", None)
        resume_download = kwargs.pop("resume_download", False)
        proxies = kwargs.pop("proxies", None)
        local_files_only = kwargs.pop("local_files_only", False)
-        use_auth_token = kwargs.pop("use_auth_token", None)
+        token = kwargs.pop("token", None)
        revision = kwargs.pop("revision", None)

        cls.cached_folder = (
@@ -787,7 +798,7 @@ class TensorRTStableDiffusionInpaintPipeline(StableDiffusionInpaintPipeline):
                resume_download=resume_download,
                proxies=proxies,
                local_files_only=local_files_only,
-                use_auth_token=use_auth_token,
+                token=token,
                revision=revision,
            )
        )
--- a/examples/community/stable_diffusion_tensorrt_txt2img.py
+++ b/examples/community/stable_diffusion_tensorrt_txt2img.py
@@ -27,6 +27,7 @@ import onnx_graphsurgeon as gs
 import tensorrt as trt
 import torch
 from huggingface_hub import snapshot_download
+from huggingface_hub.utils import validate_hf_hub_args
 from onnx import shape_inference
 from polygraphy import cuda
 from polygraphy.backend.common import bytes_from_path
@@ -40,7 +41,7 @@ from polygraphy.backend.trt import (
    save_engine,
 )
 from polygraphy.backend.trt import util as trt_util
-from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
+from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection

 from diffusers.models import AutoencoderKL, UNet2DConditionModel
 from diffusers.pipelines.stable_diffusion import (
@@ -49,7 +50,7 @@ from diffusers.pipelines.stable_diffusion import (
    StableDiffusionSafetyChecker,
 )
 from diffusers.schedulers import DDIMScheduler
-from diffusers.utils import DIFFUSERS_CACHE, logging
+from diffusers.utils import logging


 """
@@ -624,6 +625,7 @@ class TensorRTStableDiffusionPipeline(StableDiffusionPipeline):
        scheduler: DDIMScheduler,
        safety_checker: StableDiffusionSafetyChecker,
        feature_extractor: CLIPFeatureExtractor,
+        image_encoder: CLIPVisionModelWithProjection = None,
        requires_safety_checker: bool = True,
        stages=["clip", "unet", "vae"],
        image_height: int = 768,
@@ -639,7 +641,15 @@ class TensorRTStableDiffusionPipeline(StableDiffusionPipeline):
        timing_cache: str = "timing_cache",
    ):
        super().__init__(
-            vae, text_encoder, tokenizer, unet, scheduler, safety_checker, feature_extractor, requires_safety_checker
+            vae,
+            text_encoder,
+            tokenizer,
+            unet,
+            scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+            image_encoder=image_encoder,
+            requires_safety_checker=requires_safety_checker,
        )

        self.vae.forward = self.vae.decode
@@ -682,12 +692,13 @@ class TensorRTStableDiffusionPipeline(StableDiffusionPipeline):
            self.models["vae"] = make_VAE(self.vae, **models_args)

    @classmethod
+    @validate_hf_hub_args
    def set_cached_folder(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], **kwargs):
-        cache_dir = kwargs.pop("cache_dir", DIFFUSERS_CACHE)
+        cache_dir = kwargs.pop("cache_dir", None)
        resume_download = kwargs.pop("resume_download", False)
        proxies = kwargs.pop("proxies", None)
        local_files_only = kwargs.pop("local_files_only", False)
-        use_auth_token = kwargs.pop("use_auth_token", None)
+        token = kwargs.pop("token", None)
        revision = kwargs.pop("revision", None)

        cls.cached_folder = (
@@ -699,7 +710,7 @@ class TensorRTStableDiffusionPipeline(StableDiffusionPipeline):
                resume_download=resume_download,
                proxies=proxies,
                local_files_only=local_files_only,
-                use_auth_token=use_auth_token,
+                token=token,
                revision=revision,
            )
        )
--- a/examples/consistency_distillation/README.md
+++ b/examples/consistency_distillation/README.md
@@ -1,6 +1,6 @@
 # Latent Consistency Distillation Example:

-[Latent Consistency Models (LCMs)](https://arxiv.org/abs/2310.04378) is method to distill latent diffusion model to enable swift inference with minimal steps. This example demonstrates how to use the latent consistency distillation to distill stable-diffusion-v1.5 for less timestep inference.
+[Latent Consistency Models (LCMs)](https://arxiv.org/abs/2310.04378) is a method to distill a latent diffusion model to enable swift inference with minimal steps. This example demonstrates how to use latent consistency distillation to distill stable-diffusion-v1.5 for inference with few timesteps.

 ## Full model distillation

@@ -24,7 +24,7 @@ Then cd in the example folder and run
 pip install -r requirements.txt
 ```

-And initialize an [🤗Accelerate](https://github.com/huggingface/accelerate/) environment with:
+And initialize an [🤗 Accelerate](https://github.com/huggingface/accelerate/) environment with:

 ```bash
 accelerate config
@@ -46,12 +46,16 @@ write_basic_config()
 When running `accelerate config`, if we specify torch compile mode to True there can be dramatic speedups.


-#### Example with LAION-A6+ dataset
+#### Example
+
+The following uses the [Conceptual Captions 12M (CC12M) dataset](https://github.com/google-research-datasets/conceptual-12m) as an example, and for illustrative purposes only. For best results you may consider large and high-quality text-image datasets such as [LAION](https://laion.ai/blog/laion-400-open-dataset/). You may also need to search the hyperparameter space according to the dataset you use.

 ```bash
-runwayml/stable-diffusion-v1-5
-PROGRAM="train_lcm_distill_sd_wds.py \
-    --pretrained_teacher_model=$MODEL_DIR \
+export MODEL_NAME="runwayml/stable-diffusion-v1-5"
+export OUTPUT_DIR="path/to/saved/model"
+
+accelerate launch train_lcm_distill_sd_wds.py \
+    --pretrained_teacher_model=$MODEL_NAME \
    --output_dir=$OUTPUT_DIR \
    --mixed_precision=fp16 \
    --resolution=512 \
@@ -59,7 +63,7 @@ PROGRAM="train_lcm_distill_sd_wds.py \
    --max_train_steps=1000 \
    --max_train_samples=4000000 \
    --dataloader_num_workers=8 \
-    --train_shards_path_or_url='pipe:aws s3 cp s3://muse-datasets/laion-aesthetic6plus-min512-data/{00000..01210}.tar -' \
+    --train_shards_path_or_url="pipe:curl -L -s https://huggingface.co/datasets/laion/conceptual-captions-12m-webdataset/resolve/main/data/{00000..01099}.tar?download=true" \
    --validation_steps=200 \
    --checkpointing_steps=200 --checkpoints_total_limit=10 \
    --train_batch_size=12 \
@@ -69,28 +73,32 @@ PROGRAM="train_lcm_distill_sd_wds.py \
    --resume_from_checkpoint=latest \
    --report_to=wandb \
    --seed=453645634 \
-    --push_to_hub \
+    --push_to_hub
 ```

 ## LCM-LoRA

 Instead of fine-tuning the full model, we can also just train a LoRA that can be injected into any SDXL model.

-### Example with LAION-A6+ dataset
-    
+### Example
+
+The following uses the [Conceptual Captions 12M (CC12M) dataset](https://github.com/google-research-datasets/conceptual-12m) as an example. For best results you may consider large and high-quality text-image datasets such as [LAION](https://laion.ai/blog/laion-400-open-dataset/).
+
 ```bash
-runwayml/stable-diffusion-v1-5
-PROGRAM="train_lcm_distill_lora_sd_wds.py \
-    --pretrained_teacher_model=$MODEL_DIR \
+export MODEL_NAME="runwayml/stable-diffusion-v1-5"
+export OUTPUT_DIR="path/to/saved/model"
+
+accelerate launch train_lcm_distill_lora_sd_wds.py \
+    --pretrained_teacher_model=$MODEL_NAME \
    --output_dir=$OUTPUT_DIR \
    --mixed_precision=fp16 \
    --resolution=512 \
    --lora_rank=64 \
-    --learning_rate=1e-6 --loss_type="huber" --adam_weight_decay=0.0 \
+    --learning_rate=1e-4 --loss_type="huber" --adam_weight_decay=0.0 \
    --max_train_steps=1000 \
    --max_train_samples=4000000 \
    --dataloader_num_workers=8 \
-    --train_shards_path_or_url='pipe:aws s3 cp s3://muse-datasets/laion-aesthetic6plus-min512-data/{00000..01210}.tar -' \
+    --train_shards_path_or_url="pipe:curl -L -s https://huggingface.co/datasets/laion/conceptual-captions-12m-webdataset/resolve/main/data/{00000..01099}.tar?download=true" \
    --validation_steps=200 \
    --checkpointing_steps=200 --checkpoints_total_limit=10 \
    --train_batch_size=12 \
--- a/examples/consistency_distillation/README_sdxl.md
+++ b/examples/consistency_distillation/README_sdxl.md
@@ -1,6 +1,6 @@
 # Latent Consistency Distillation Example:

-[Latent Consistency Models (LCMs)](https://arxiv.org/abs/2310.04378) is method to distill latent diffusion model to enable swift inference with minimal steps. This example demonstrates how to use the latent consistency distillation to distill SDXL for less timestep inference.
+[Latent Consistency Models (LCMs)](https://arxiv.org/abs/2310.04378) is a method to distill a latent diffusion model to enable swift inference with minimal steps. This example demonstrates how to use latent consistency distillation to distill SDXL for inference with few timesteps.

 ## Full model distillation

@@ -24,7 +24,7 @@ Then cd in the example folder and run
 pip install -r requirements.txt
 ```

-And initialize an [🤗Accelerate](https://github.com/huggingface/accelerate/) environment with:
+And initialize an [🤗 Accelerate](https://github.com/huggingface/accelerate/) environment with:

 ```bash
 accelerate config
@@ -46,12 +46,16 @@ write_basic_config()
 When running `accelerate config`, if we specify torch compile mode to True there can be dramatic speedups.


-#### Example with LAION-A6+ dataset
+#### Example
+
+The following uses the [Conceptual Captions 12M (CC12M) dataset](https://github.com/google-research-datasets/conceptual-12m) as an example, and for illustrative purposes only. For best results you may consider large and high-quality text-image datasets such as [LAION](https://laion.ai/blog/laion-400-open-dataset/). You may also need to search the hyperparameter space according to the dataset you use.

 ```bash
-export MODEL_DIR="stabilityai/stable-diffusion-xl-base-1.0"
-PROGRAM="train_lcm_distill_sdxl_wds.py \
-    --pretrained_teacher_model=$MODEL_DIR \
+export MODEL_NAME="stabilityai/stable-diffusion-xl-base-1.0"
+export OUTPUT_DIR="path/to/saved/model"
+
+accelerate launch train_lcm_distill_sdxl_wds.py \
+    --pretrained_teacher_model=$MODEL_NAME \
    --pretrained_vae_model_name_or_path=madebyollin/sdxl-vae-fp16-fix \
    --output_dir=$OUTPUT_DIR \
    --mixed_precision=fp16 \
@@ -60,7 +64,7 @@ PROGRAM="train_lcm_distill_sdxl_wds.py \
    --max_train_steps=1000 \
    --max_train_samples=4000000 \
    --dataloader_num_workers=8 \
-    --train_shards_path_or_url='pipe:aws s3 cp s3://muse-datasets/laion-aesthetic6plus-min512-data/{00000..01210}.tar -' \
+    --train_shards_path_or_url="pipe:curl -L -s https://huggingface.co/datasets/laion/conceptual-captions-12m-webdataset/resolve/main/data/{00000..01099}.tar?download=true" \
    --validation_steps=200 \
    --checkpointing_steps=200 --checkpoints_total_limit=10 \
    --train_batch_size=12 \
@@ -77,22 +81,26 @@ PROGRAM="train_lcm_distill_sdxl_wds.py \

 Instead of fine-tuning the full model, we can also just train a LoRA that can be injected into any SDXL model.

-### Example with LAION-A6+ dataset
-    
+### Example
+
+The following uses the [Conceptual Captions 12M (CC12M) dataset](https://github.com/google-research-datasets/conceptual-12m) as an example. For best results you may consider large and high-quality text-image datasets such as [LAION](https://laion.ai/blog/laion-400-open-dataset/).
+
 ```bash
-export MODEL_DIR="stabilityai/stable-diffusion-xl-base-1.0"
-PROGRAM="train_lcm_distill_lora_sdxl_wds.py \
+export MODEL_NAME="stabilityai/stable-diffusion-xl-base-1.0"
+export OUTPUT_DIR="path/to/saved/model"
+
+accelerate launch train_lcm_distill_lora_sdxl_wds.py \
    --pretrained_teacher_model=$MODEL_DIR \
    --pretrained_vae_model_name_or_path=madebyollin/sdxl-vae-fp16-fix \
    --output_dir=$OUTPUT_DIR \
    --mixed_precision=fp16 \
    --resolution=1024 \
    --lora_rank=64 \
-    --learning_rate=1e-6 --loss_type="huber" --use_fix_crop_and_size --adam_weight_decay=0.0 \
+    --learning_rate=1e-4 --loss_type="huber" --use_fix_crop_and_size --adam_weight_decay=0.0 \
    --max_train_steps=1000 \
    --max_train_samples=4000000 \
    --dataloader_num_workers=8 \
-    --train_shards_path_or_url='pipe:aws s3 cp s3://muse-datasets/laion-aesthetic6plus-min512-data/{00000..01210}.tar -' \
+    --train_shards_path_or_url="pipe:curl -L -s https://huggingface.co/datasets/laion/conceptual-captions-12m-webdataset/resolve/main/data/{00000..01099}.tar?download=true" \
    --validation_steps=200 \
    --checkpointing_steps=200 --checkpoints_total_limit=10 \
    --train_batch_size=12 \
@@ -103,4 +111,38 @@ PROGRAM="train_lcm_distill_lora_sdxl_wds.py \
    --report_to=wandb \
    --seed=453645634 \
    --push_to_hub \
-```
+```
+
+We provide another version for LCM LoRA SDXL that follows best practices of `peft` and leverages the `datasets` library for quick experimentation. The script doesn't load two UNets unlike `train_lcm_distill_lora_sdxl_wds.py` which reduces the memory requirements quite a bit. 
+
+Below is an example training command that trains an LCM LoRA on the [Pokemons dataset](https://huggingface.co/datasets/lambdalabs/pokemon-blip-captions):
+
+```bash
+export MODEL_NAME="stabilityai/stable-diffusion-xl-base-1.0"
+export DATASET_NAME="lambdalabs/pokemon-blip-captions"
+export VAE_PATH="madebyollin/sdxl-vae-fp16-fix"
+
+accelerate launch train_lcm_distill_lora_sdxl.py \
+  --pretrained_teacher_model=${MODEL_NAME}  \
+  --pretrained_vae_model_name_or_path=${VAE_PATH} \
+  --output_dir="pokemons-lora-lcm-sdxl" \
+  --mixed_precision="fp16" \
+  --dataset_name=$DATASET_NAME \
+  --resolution=1024 \
+  --train_batch_size=24 \
+  --gradient_accumulation_steps=1 \
+  --gradient_checkpointing \
+  --use_8bit_adam \
+  --lora_rank=64 \
+  --learning_rate=1e-4 \
+  --report_to="wandb" \
+  --lr_scheduler="constant" \
+  --lr_warmup_steps=0 \
+  --max_train_steps=3000 \
+  --checkpointing_steps=500 \
+  --validation_steps=50 \
+  --seed="0" \
+  --report_to="wandb" \
+  --push_to_hub
+```
+
--- a/examples/consistency_distillation/test_lcm_lora.py
+++ b/examples/consistency_distillation/test_lcm_lora.py
@@ -0,0 +1,112 @@
+# coding=utf-8
+# Copyright 2023 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import os
+import sys
+import tempfile
+
+import safetensors
+
+
+sys.path.append("..")
+from test_examples_utils import ExamplesTestsAccelerate, run_command  # noqa: E402
+
+
+logging.basicConfig(level=logging.DEBUG)
+
+logger = logging.getLogger()
+stream_handler = logging.StreamHandler(sys.stdout)
+logger.addHandler(stream_handler)
+
+
+class TextToImageLCM(ExamplesTestsAccelerate):
+    def test_text_to_image_lcm_lora_sdxl(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            test_args = f"""
+                examples/consistency_distillation/train_lcm_distill_lora_sdxl.py
+                --pretrained_teacher_model hf-internal-testing/tiny-stable-diffusion-xl-pipe
+                --dataset_name hf-internal-testing/dummy_image_text_data
+                --resolution 64
+                --lora_rank 4
+                --train_batch_size 1
+                --gradient_accumulation_steps 1
+                --max_train_steps 2
+                --learning_rate 5.0e-04
+                --scale_lr
+                --lr_scheduler constant
+                --lr_warmup_steps 0
+                --output_dir {tmpdir}
+                """.split()
+
+            run_command(self._launch_args + test_args)
+            # save_pretrained smoke test
+            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "pytorch_lora_weights.safetensors")))
+
+            # make sure the state_dict has the correct naming in the parameters.
+            lora_state_dict = safetensors.torch.load_file(os.path.join(tmpdir, "pytorch_lora_weights.safetensors"))
+            is_lora = all("lora" in k for k in lora_state_dict.keys())
+            self.assertTrue(is_lora)
+
+    def test_text_to_image_lcm_lora_sdxl_checkpointing(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            test_args = f"""
+                examples/consistency_distillation/train_lcm_distill_lora_sdxl.py
+                --pretrained_teacher_model hf-internal-testing/tiny-stable-diffusion-xl-pipe
+                --dataset_name hf-internal-testing/dummy_image_text_data
+                --resolution 64
+                --lora_rank 4
+                --train_batch_size 1
+                --gradient_accumulation_steps 1
+                --max_train_steps 7
+                --checkpointing_steps 2
+                --learning_rate 5.0e-04
+                --scale_lr
+                --lr_scheduler constant
+                --lr_warmup_steps 0
+                --output_dir {tmpdir}
+                """.split()
+
+            run_command(self._launch_args + test_args)
+
+            self.assertEqual(
+                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
+                {"checkpoint-2", "checkpoint-4", "checkpoint-6"},
+            )
+
+            test_args = f"""
+                examples/consistency_distillation/train_lcm_distill_lora_sdxl.py
+                --pretrained_teacher_model hf-internal-testing/tiny-stable-diffusion-xl-pipe
+                --dataset_name hf-internal-testing/dummy_image_text_data
+                --resolution 64
+                --lora_rank 4
+                --train_batch_size 1
+                --gradient_accumulation_steps 1
+                --max_train_steps 9
+                --checkpointing_steps 2
+                --resume_from_checkpoint latest
+                --learning_rate 5.0e-04
+                --scale_lr
+                --lr_scheduler constant
+                --lr_warmup_steps 0
+                --output_dir {tmpdir}
+                """.split()
+
+            run_command(self._launch_args + test_args)
+
+            self.assertEqual(
+                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
+                {"checkpoint-2", "checkpoint-4", "checkpoint-6", "checkpoint-8"},
+            )
--- a/examples/consistency_distillation/train_lcm_distill_lora_sd_wds.py
+++ b/examples/consistency_distillation/train_lcm_distill_lora_sd_wds.py
@@ -38,7 +38,7 @@ from accelerate import Accelerator
 from accelerate.logging import get_logger
 from accelerate.utils import ProjectConfiguration, set_seed
 from braceexpand import braceexpand
-from huggingface_hub import create_repo
+from huggingface_hub import create_repo, upload_folder
 from packaging import version
 from peft import LoraConfig, get_peft_model, get_peft_model_state_dict
 from torch.utils.data import default_collate
@@ -71,7 +71,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.18.0.dev0")
+check_min_version("0.25.0.dev0")

 logger = get_logger(__name__)

@@ -156,7 +156,7 @@ class WebdatasetFilter:
            return False


-class Text2ImageDataset:
+class SDText2ImageDataset:
    def __init__(
        self,
        train_shards_path_or_url: Union[str, List[str]],
@@ -359,19 +359,43 @@ def scalings_for_boundary_conditions(timestep, sigma_data=0.5, timestep_scaling=


 # Compare LCMScheduler.step, Step 4
-def predicted_origin(model_output, timesteps, sample, prediction_type, alphas, sigmas):
+def get_predicted_original_sample(model_output, timesteps, sample, prediction_type, alphas, sigmas):
+    alphas = extract_into_tensor(alphas, timesteps, sample.shape)
+    sigmas = extract_into_tensor(sigmas, timesteps, sample.shape)
    if prediction_type == "epsilon":
-        sigmas = extract_into_tensor(sigmas, timesteps, sample.shape)
-        alphas = extract_into_tensor(alphas, timesteps, sample.shape)
        pred_x_0 = (sample - sigmas * model_output) / alphas
+    elif prediction_type == "sample":
+        pred_x_0 = model_output
    elif prediction_type == "v_prediction":
-        pred_x_0 = alphas[timesteps] * sample - sigmas[timesteps] * model_output
+        pred_x_0 = alphas * sample - sigmas * model_output
    else:
-        raise ValueError(f"Prediction type {prediction_type} currently not supported.")
+        raise ValueError(
+            f"Prediction type {prediction_type} is not supported; currently, `epsilon`, `sample`, and `v_prediction`"
+            f" are supported."
+        )

    return pred_x_0


+# Based on step 4 in DDIMScheduler.step
+def get_predicted_noise(model_output, timesteps, sample, prediction_type, alphas, sigmas):
+    alphas = extract_into_tensor(alphas, timesteps, sample.shape)
+    sigmas = extract_into_tensor(sigmas, timesteps, sample.shape)
+    if prediction_type == "epsilon":
+        pred_epsilon = model_output
+    elif prediction_type == "sample":
+        pred_epsilon = (sample - alphas * model_output) / sigmas
+    elif prediction_type == "v_prediction":
+        pred_epsilon = alphas * model_output + sigmas * sample
+    else:
+        raise ValueError(
+            f"Prediction type {prediction_type} is not supported; currently, `epsilon`, `sample`, and `v_prediction`"
+            f" are supported."
+        )
+
+    return pred_epsilon
+
+
 def extract_into_tensor(a, t, x_shape):
    b, *_ = t.shape
    out = a.gather(-1, t)
@@ -423,7 +447,7 @@ def import_model_class_from_model_name_or_path(
    pretrained_model_name_or_path: str, revision: str, subfolder: str = "text_encoder"
 ):
    text_encoder_config = PretrainedConfig.from_pretrained(
-        pretrained_model_name_or_path, subfolder=subfolder, revision=revision, use_auth_token=True
+        pretrained_model_name_or_path, subfolder=subfolder, revision=revision
    )
    model_class = text_encoder_config.architectures[0]

@@ -823,7 +847,7 @@ def main(args):
            os.makedirs(args.output_dir, exist_ok=True)

        if args.push_to_hub:
-            create_repo(
+            repo_id = create_repo(
                repo_id=args.hub_model_id or Path(args.output_dir).name,
                exist_ok=True,
                token=args.hub_token,
@@ -835,34 +859,35 @@ def main(args):
        args.pretrained_teacher_model, subfolder="scheduler", revision=args.teacher_revision
    )

-    # The scheduler calculates the alpha and sigma schedule for us
+    # DDPMScheduler calculates the alpha and sigma noise schedules (based on the alpha bars) for us
    alpha_schedule = torch.sqrt(noise_scheduler.alphas_cumprod)
    sigma_schedule = torch.sqrt(1 - noise_scheduler.alphas_cumprod)
+    # Initialize the DDIM ODE solver for distillation.
    solver = DDIMSolver(
        noise_scheduler.alphas_cumprod.numpy(),
        timesteps=noise_scheduler.config.num_train_timesteps,
        ddim_timesteps=args.num_ddim_timesteps,
    )

-    # 2. Load tokenizers from SD-XL checkpoint.
+    # 2. Load tokenizers from SD 1.X/2.X checkpoint.
    tokenizer = AutoTokenizer.from_pretrained(
        args.pretrained_teacher_model, subfolder="tokenizer", revision=args.teacher_revision, use_fast=False
    )

-    # 3. Load text encoders from SD-1.5 checkpoint.
+    # 3. Load text encoders from SD 1.X/2.X checkpoint.
    # import correct text encoder classes
    text_encoder = CLIPTextModel.from_pretrained(
        args.pretrained_teacher_model, subfolder="text_encoder", revision=args.teacher_revision
    )

-    # 4. Load VAE from SD-XL checkpoint (or more stable VAE)
+    # 4. Load VAE from SD 1.X/2.X checkpoint
    vae = AutoencoderKL.from_pretrained(
        args.pretrained_teacher_model,
        subfolder="vae",
        revision=args.teacher_revision,
    )

-    # 5. Load teacher U-Net from SD-XL checkpoint
+    # 5. Load teacher U-Net from SD 1.X/2.X checkpoint
    teacher_unet = UNet2DConditionModel.from_pretrained(
        args.pretrained_teacher_model, subfolder="unet", revision=args.teacher_revision
    )
@@ -872,7 +897,7 @@ def main(args):
    text_encoder.requires_grad_(False)
    teacher_unet.requires_grad_(False)

-    # 7. Create online (`unet`) student U-Nets.
+    # 7. Create online student U-Net.
    unet = UNet2DConditionModel.from_pretrained(
        args.pretrained_teacher_model, subfolder="unet", revision=args.teacher_revision
    )
@@ -935,6 +960,7 @@ def main(args):
    # Also move the alpha and sigma noise schedules to accelerator.device.
    alpha_schedule = alpha_schedule.to(accelerator.device)
    sigma_schedule = sigma_schedule.to(accelerator.device)
+    # Move the ODE solver to accelerator.device.
    solver = solver.to(accelerator.device)

    # 10. Handle saving and loading of checkpoints
@@ -1011,13 +1037,14 @@ def main(args):
        eps=args.adam_epsilon,
    )

+    # 13. Dataset creation and data processing
    # Here, we compute not just the text embeddings but also the additional embeddings
    # needed for the SD XL UNet to operate.
    def compute_embeddings(prompt_batch, proportion_empty_prompts, text_encoder, tokenizer, is_train=True):
        prompt_embeds = encode_prompt(prompt_batch, text_encoder, tokenizer, proportion_empty_prompts, is_train)
        return {"prompt_embeds": prompt_embeds}

-    dataset = Text2ImageDataset(
+    dataset = SDText2ImageDataset(
        train_shards_path_or_url=args.train_shards_path_or_url,
        num_train_examples=args.max_train_samples,
        per_gpu_batch_size=args.train_batch_size,
@@ -1037,6 +1064,7 @@ def main(args):
        tokenizer=tokenizer,
    )

+    # 14. LR Scheduler creation
    # Scheduler and math around the number of training steps.
    overrode_max_train_steps = False
    num_update_steps_per_epoch = math.ceil(train_dataloader.num_batches / args.gradient_accumulation_steps)
@@ -1051,6 +1079,7 @@ def main(args):
        num_training_steps=args.max_train_steps,
    )

+    # 15. Prepare for training
    # Prepare everything with our `accelerator`.
    unet, optimizer, lr_scheduler = accelerator.prepare(unet, optimizer, lr_scheduler)

@@ -1072,7 +1101,7 @@ def main(args):
    ).input_ids.to(accelerator.device)
    uncond_prompt_embeds = text_encoder(uncond_input_ids)[0]

-    # Train!
+    # 16. Train!
    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps

    logger.info("***** Running training *****")
@@ -1123,7 +1152,8 @@ def main(args):
    for epoch in range(first_epoch, args.num_train_epochs):
        for step, batch in enumerate(train_dataloader):
            with accelerator.accumulate(unet):
-                image, text, _, _ = batch
+                # 1. Load and process the image and text conditioning
+                image, text = batch

                image = image.to(accelerator.device, non_blocking=True)
                encoded_text = compute_embeddings_fn(text)
@@ -1140,37 +1170,37 @@ def main(args):

                latents = latents * vae.config.scaling_factor
                latents = latents.to(weight_dtype)
-
-                # Sample noise that we'll add to the latents
-                noise = torch.randn_like(latents)
                bsz = latents.shape[0]

-                # Sample a random timestep for each image t_n ~ U[0, N - k - 1] without bias.
+                # 2. Sample a random timestep for each image t_n from the ODE solver timesteps without bias.
+                # For the DDIM solver, the timestep schedule is [T - 1, T - k - 1, T - 2 * k - 1, ...]
                topk = noise_scheduler.config.num_train_timesteps // args.num_ddim_timesteps
                index = torch.randint(0, args.num_ddim_timesteps, (bsz,), device=latents.device).long()
                start_timesteps = solver.ddim_timesteps[index]
                timesteps = start_timesteps - topk
                timesteps = torch.where(timesteps < 0, torch.zeros_like(timesteps), timesteps)

-                # 20.4.4. Get boundary scalings for start_timesteps and (end) timesteps.
+                # 3. Get boundary scalings for start_timesteps and (end) timesteps.
                c_skip_start, c_out_start = scalings_for_boundary_conditions(start_timesteps)
                c_skip_start, c_out_start = [append_dims(x, latents.ndim) for x in [c_skip_start, c_out_start]]
                c_skip, c_out = scalings_for_boundary_conditions(timesteps)
                c_skip, c_out = [append_dims(x, latents.ndim) for x in [c_skip, c_out]]

-                # 20.4.5. Add noise to the latents according to the noise magnitude at each timestep
-                # (this is the forward diffusion process) [z_{t_{n + k}} in Algorithm 1]
+                # 4. Sample noise from the prior and add it to the latents according to the noise magnitude at each
+                # timestep (this is the forward diffusion process) [z_{t_{n + k}} in Algorithm 1]
+                noise = torch.randn_like(latents)
                noisy_model_input = noise_scheduler.add_noise(latents, noise, start_timesteps)

-                # 20.4.6. Sample a random guidance scale w from U[w_min, w_max] and embed it
+                # 5. Sample a random guidance scale w from U[w_min, w_max]
+                # Note that for LCM-LoRA distillation it is not necessary to use a guidance scale embedding
                w = (args.w_max - args.w_min) * torch.rand((bsz,)) + args.w_min
                w = w.reshape(bsz, 1, 1, 1)
                w = w.to(device=latents.device, dtype=latents.dtype)

-                # 20.4.8. Prepare prompt embeds and unet_added_conditions
+                # 6. Prepare prompt embeds and unet_added_conditions
                prompt_embeds = encoded_text.pop("prompt_embeds")

-                # 20.4.9. Get online LCM prediction on z_{t_{n + k}}, w, c, t_{n + k}
+                # 7. Get online LCM prediction on z_{t_{n + k}} (noisy_model_input), w, c, t_{n + k} (start_timesteps)
                noise_pred = unet(
                    noisy_model_input,
                    start_timesteps,
@@ -1179,7 +1209,7 @@ def main(args):
                    added_cond_kwargs=encoded_text,
                ).sample

-                pred_x_0 = predicted_origin(
+                pred_x_0 = get_predicted_original_sample(
                    noise_pred,
                    start_timesteps,
                    noisy_model_input,
@@ -1190,17 +1220,27 @@ def main(args):

                model_pred = c_skip_start * noisy_model_input + c_out_start * pred_x_0

-                # 20.4.10. Use the ODE solver to predict the kth step in the augmented PF-ODE trajectory after
-                # noisy_latents with both the conditioning embedding c and unconditional embedding 0
-                # Get teacher model prediction on noisy_latents and conditional embedding
+                # 8. Compute the conditional and unconditional teacher model predictions to get CFG estimates of the
+                # predicted noise eps_0 and predicted original sample x_0, then run the ODE solver using these
+                # estimates to predict the data point in the augmented PF-ODE trajectory corresponding to the next ODE
+                # solver timestep.
                with torch.no_grad():
                    with torch.autocast("cuda"):
+                        # 1. Get teacher model prediction on noisy_model_input z_{t_{n + k}} and conditional embedding c
                        cond_teacher_output = teacher_unet(
                            noisy_model_input.to(weight_dtype),
                            start_timesteps,
                            encoder_hidden_states=prompt_embeds.to(weight_dtype),
                        ).sample
-                        cond_pred_x0 = predicted_origin(
+                        cond_pred_x0 = get_predicted_original_sample(
+                            cond_teacher_output,
+                            start_timesteps,
+                            noisy_model_input,
+                            noise_scheduler.config.prediction_type,
+                            alpha_schedule,
+                            sigma_schedule,
+                        )
+                        cond_pred_noise = get_predicted_noise(
                            cond_teacher_output,
                            start_timesteps,
                            noisy_model_input,
@@ -1209,13 +1249,21 @@ def main(args):
                            sigma_schedule,
                        )

-                        # Get teacher model prediction on noisy_latents and unconditional embedding
+                        # 2. Get teacher model prediction on noisy_model_input z_{t_{n + k}} and unconditional embedding 0
                        uncond_teacher_output = teacher_unet(
                            noisy_model_input.to(weight_dtype),
                            start_timesteps,
                            encoder_hidden_states=uncond_prompt_embeds.to(weight_dtype),
                        ).sample
-                        uncond_pred_x0 = predicted_origin(
+                        uncond_pred_x0 = get_predicted_original_sample(
+                            uncond_teacher_output,
+                            start_timesteps,
+                            noisy_model_input,
+                            noise_scheduler.config.prediction_type,
+                            alpha_schedule,
+                            sigma_schedule,
+                        )
+                        uncond_pred_noise = get_predicted_noise(
                            uncond_teacher_output,
                            start_timesteps,
                            noisy_model_input,
@@ -1224,12 +1272,17 @@ def main(args):
                            sigma_schedule,
                        )

-                        # 20.4.11. Perform "CFG" to get x_prev estimate (using the LCM paper's CFG formulation)
+                        # 3. Calculate the CFG estimate of x_0 (pred_x0) and eps_0 (pred_noise)
+                        # Note that this uses the LCM paper's CFG formulation rather than the Imagen CFG formulation
                        pred_x0 = cond_pred_x0 + w * (cond_pred_x0 - uncond_pred_x0)
-                        pred_noise = cond_teacher_output + w * (cond_teacher_output - uncond_teacher_output)
+                        pred_noise = cond_pred_noise + w * (cond_pred_noise - uncond_pred_noise)
+                        # 4. Run one step of the ODE solver to estimate the next point x_prev on the
+                        # augmented PF-ODE trajectory (solving backward in time)
+                        # Note that the DDIM step depends on both the predicted x_0 and source noise eps_0.
                        x_prev = solver.ddim_step(pred_x0, pred_noise, index)

-                # 20.4.12. Get target LCM prediction on x_prev, w, c, t_n
+                # 9. Get target LCM prediction on x_prev, w, c, t_n (timesteps)
+                # Note that we do not use a separate target network for LCM-LoRA distillation.
                with torch.no_grad():
                    with torch.autocast("cuda", dtype=weight_dtype):
                        target_noise_pred = unet(
@@ -1238,7 +1291,7 @@ def main(args):
                            timestep_cond=None,
                            encoder_hidden_states=prompt_embeds.float(),
                        ).sample
-                    pred_x_0 = predicted_origin(
+                    pred_x_0 = get_predicted_original_sample(
                        target_noise_pred,
                        timesteps,
                        x_prev,
@@ -1248,7 +1301,7 @@ def main(args):
                    )
                    target = c_skip * x_prev + c_out * pred_x_0

-                # 20.4.13. Calculate loss
+                # 10. Calculate loss
                if args.loss_type == "l2":
                    loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
                elif args.loss_type == "huber":
@@ -1256,7 +1309,7 @@ def main(args):
                        torch.sqrt((model_pred.float() - target.float()) ** 2 + args.huber_c**2) - args.huber_c
                    )

-                # 20.4.14. Backpropagate on the online student model (`unet`)
+                # 11. Backpropagate on the online student model (`unet`)
                accelerator.backward(loss)
                if accelerator.sync_gradients:
                    accelerator.clip_grad_norm_(unet.parameters(), args.max_grad_norm)
@@ -1313,6 +1366,14 @@ def main(args):
        lora_state_dict = get_peft_model_state_dict(unet, adapter_name="default")
        StableDiffusionPipeline.save_lora_weights(os.path.join(args.output_dir, "unet_lora"), lora_state_dict)

+        if args.push_to_hub:
+            upload_folder(
+                repo_id=repo_id,
+                folder_path=args.output_dir,
+                commit_message="End of training",
+                ignore_patterns=["step_*", "epoch_*"],
+            )
+
    accelerator.end_training()


--- a/examples/consistency_distillation/train_lcm_distill_lora_sdxl.py
+++ b/examples/consistency_distillation/train_lcm_distill_lora_sdxl.py
--- a/examples/consistency_distillation/train_lcm_distill_lora_sdxl_wds.py
+++ b/examples/consistency_distillation/train_lcm_distill_lora_sdxl_wds.py
@@ -39,7 +39,7 @@ from accelerate import Accelerator
 from accelerate.logging import get_logger
 from accelerate.utils import ProjectConfiguration, set_seed
 from braceexpand import braceexpand
-from huggingface_hub import create_repo
+from huggingface_hub import create_repo, upload_folder
 from packaging import version
 from peft import LoraConfig, get_peft_model, get_peft_model_state_dict
 from torch.utils.data import default_collate
@@ -68,11 +68,16 @@ from diffusers.utils.import_utils import is_xformers_available

 MAX_SEQ_LENGTH = 77

+# Adjust for your dataset
+WDS_JSON_WIDTH = "width"  # original_width for LAION
+WDS_JSON_HEIGHT = "height"  # original_height for LAION
+MIN_SIZE = 700  # ~960 for LAION, ideal: 1024 if the dataset contains large images
+
 if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.18.0.dev0")
+check_min_version("0.25.0.dev0")

 logger = get_logger(__name__)

@@ -146,10 +151,10 @@ class WebdatasetFilter:
        try:
            if "json" in x:
                x_json = json.loads(x["json"])
-                filter_size = (x_json.get("original_width", 0.0) or 0.0) >= self.min_size and x_json.get(
-                    "original_height", 0
+                filter_size = (x_json.get(WDS_JSON_WIDTH, 0.0) or 0.0) >= self.min_size and x_json.get(
+                    WDS_JSON_HEIGHT, 0
                ) >= self.min_size
-                filter_watermark = (x_json.get("pwatermark", 1.0) or 1.0) <= self.max_pwatermark
+                filter_watermark = (x_json.get("pwatermark", 0.0) or 0.0) <= self.max_pwatermark
                return filter_size and filter_watermark
            else:
                return False
@@ -157,7 +162,7 @@ class WebdatasetFilter:
            return False


-class Text2ImageDataset:
+class SDXLText2ImageDataset:
    def __init__(
        self,
        train_shards_path_or_url: Union[str, List[str]],
@@ -180,7 +185,7 @@ class Text2ImageDataset:
            if use_fix_crop_and_size:
                return (resolution, resolution)
            else:
-                return (int(json.get("original_width", 0.0)), int(json.get("original_height", 0.0)))
+                return (int(json.get(WDS_JSON_WIDTH, 0.0)), int(json.get(WDS_JSON_HEIGHT, 0.0)))

        def transform(example):
            # resize image
@@ -212,7 +217,7 @@ class Text2ImageDataset:
        pipeline = [
            wds.ResampledShards(train_shards_path_or_url),
            tarfile_to_samples_nothrow,
-            wds.select(WebdatasetFilter(min_size=960)),
+            wds.select(WebdatasetFilter(min_size=MIN_SIZE)),
            wds.shuffle(shuffle_buffer_size),
            *processing_pipeline,
            wds.batched(per_gpu_batch_size, partial=False, collation_fn=default_collate),
@@ -341,19 +346,43 @@ def scalings_for_boundary_conditions(timestep, sigma_data=0.5, timestep_scaling=


 # Compare LCMScheduler.step, Step 4
-def predicted_origin(model_output, timesteps, sample, prediction_type, alphas, sigmas):
+def get_predicted_original_sample(model_output, timesteps, sample, prediction_type, alphas, sigmas):
+    alphas = extract_into_tensor(alphas, timesteps, sample.shape)
+    sigmas = extract_into_tensor(sigmas, timesteps, sample.shape)
    if prediction_type == "epsilon":
-        sigmas = extract_into_tensor(sigmas, timesteps, sample.shape)
-        alphas = extract_into_tensor(alphas, timesteps, sample.shape)
        pred_x_0 = (sample - sigmas * model_output) / alphas
+    elif prediction_type == "sample":
+        pred_x_0 = model_output
    elif prediction_type == "v_prediction":
-        pred_x_0 = alphas[timesteps] * sample - sigmas[timesteps] * model_output
+        pred_x_0 = alphas * sample - sigmas * model_output
    else:
-        raise ValueError(f"Prediction type {prediction_type} currently not supported.")
+        raise ValueError(
+            f"Prediction type {prediction_type} is not supported; currently, `epsilon`, `sample`, and `v_prediction`"
+            f" are supported."
+        )

    return pred_x_0


+# Based on step 4 in DDIMScheduler.step
+def get_predicted_noise(model_output, timesteps, sample, prediction_type, alphas, sigmas):
+    alphas = extract_into_tensor(alphas, timesteps, sample.shape)
+    sigmas = extract_into_tensor(sigmas, timesteps, sample.shape)
+    if prediction_type == "epsilon":
+        pred_epsilon = model_output
+    elif prediction_type == "sample":
+        pred_epsilon = (sample - alphas * model_output) / sigmas
+    elif prediction_type == "v_prediction":
+        pred_epsilon = alphas * model_output + sigmas * sample
+    else:
+        raise ValueError(
+            f"Prediction type {prediction_type} is not supported; currently, `epsilon`, `sample`, and `v_prediction`"
+            f" are supported."
+        )
+
+    return pred_epsilon
+
+
 def extract_into_tensor(a, t, x_shape):
    b, *_ = t.shape
    out = a.gather(-1, t)
@@ -392,7 +421,7 @@ def import_model_class_from_model_name_or_path(
    pretrained_model_name_or_path: str, revision: str, subfolder: str = "text_encoder"
 ):
    text_encoder_config = PretrainedConfig.from_pretrained(
-        pretrained_model_name_or_path, subfolder=subfolder, revision=revision, use_auth_token=True
+        pretrained_model_name_or_path, subfolder=subfolder, revision=revision
    )
    model_class = text_encoder_config.architectures[0]

@@ -813,7 +842,7 @@ def main(args):
            os.makedirs(args.output_dir, exist_ok=True)

        if args.push_to_hub:
-            create_repo(
+            repo_id = create_repo(
                repo_id=args.hub_model_id or Path(args.output_dir).name,
                exist_ok=True,
                token=args.hub_token,
@@ -825,9 +854,10 @@ def main(args):
        args.pretrained_teacher_model, subfolder="scheduler", revision=args.teacher_revision
    )

-    # The scheduler calculates the alpha and sigma schedule for us
+    # DDPMScheduler calculates the alpha and sigma noise schedules (based on the alpha bars) for us
    alpha_schedule = torch.sqrt(noise_scheduler.alphas_cumprod)
    sigma_schedule = torch.sqrt(1 - noise_scheduler.alphas_cumprod)
+    # Initialize the DDIM ODE solver for distillation.
    solver = DDIMSolver(
        noise_scheduler.alphas_cumprod.numpy(),
        timesteps=noise_scheduler.config.num_train_timesteps,
@@ -881,7 +911,7 @@ def main(args):
    text_encoder_two.requires_grad_(False)
    teacher_unet.requires_grad_(False)

-    # 7. Create online (`unet`) student U-Nets.
+    # 7. Create online student U-Net.
    unet = UNet2DConditionModel.from_pretrained(
        args.pretrained_teacher_model, subfolder="unet", revision=args.teacher_revision
    )
@@ -945,6 +975,7 @@ def main(args):
    # Also move the alpha and sigma noise schedules to accelerator.device.
    alpha_schedule = alpha_schedule.to(accelerator.device)
    sigma_schedule = sigma_schedule.to(accelerator.device)
+    # Move the ODE solver to accelerator.device.
    solver = solver.to(accelerator.device)

    # 10. Handle saving and loading of checkpoints
@@ -1052,7 +1083,7 @@ def main(args):

        return {"prompt_embeds": prompt_embeds, **unet_added_cond_kwargs}

-    dataset = Text2ImageDataset(
+    dataset = SDXLText2ImageDataset(
        train_shards_path_or_url=args.train_shards_path_or_url,
        num_train_examples=args.max_train_samples,
        per_gpu_batch_size=args.train_batch_size,
@@ -1170,6 +1201,7 @@ def main(args):
    for epoch in range(first_epoch, args.num_train_epochs):
        for step, batch in enumerate(train_dataloader):
            with accelerator.accumulate(unet):
+                # 1. Load and process the image, text, and micro-conditioning (original image size, crop coordinates)
                image, text, orig_size, crop_coords = batch

                image = image.to(accelerator.device, non_blocking=True)
@@ -1191,37 +1223,37 @@ def main(args):
                latents = latents * vae.config.scaling_factor
                if args.pretrained_vae_model_name_or_path is None:
                    latents = latents.to(weight_dtype)
-
-                # Sample noise that we'll add to the latents
-                noise = torch.randn_like(latents)
                bsz = latents.shape[0]

-                # Sample a random timestep for each image t_n ~ U[0, N - k - 1] without bias.
+                # 2. Sample a random timestep for each image t_n from the ODE solver timesteps without bias.
+                # For the DDIM solver, the timestep schedule is [T - 1, T - k - 1, T - 2 * k - 1, ...]
                topk = noise_scheduler.config.num_train_timesteps // args.num_ddim_timesteps
                index = torch.randint(0, args.num_ddim_timesteps, (bsz,), device=latents.device).long()
                start_timesteps = solver.ddim_timesteps[index]
                timesteps = start_timesteps - topk
                timesteps = torch.where(timesteps < 0, torch.zeros_like(timesteps), timesteps)

-                # 20.4.4. Get boundary scalings for start_timesteps and (end) timesteps.
+                # 3. Get boundary scalings for start_timesteps and (end) timesteps.
                c_skip_start, c_out_start = scalings_for_boundary_conditions(start_timesteps)
                c_skip_start, c_out_start = [append_dims(x, latents.ndim) for x in [c_skip_start, c_out_start]]
                c_skip, c_out = scalings_for_boundary_conditions(timesteps)
                c_skip, c_out = [append_dims(x, latents.ndim) for x in [c_skip, c_out]]

-                # 20.4.5. Add noise to the latents according to the noise magnitude at each timestep
-                # (this is the forward diffusion process) [z_{t_{n + k}} in Algorithm 1]
+                # 4. Sample noise from the prior and add it to the latents according to the noise magnitude at each
+                # timestep (this is the forward diffusion process) [z_{t_{n + k}} in Algorithm 1]
+                noise = torch.randn_like(latents)
                noisy_model_input = noise_scheduler.add_noise(latents, noise, start_timesteps)

-                # 20.4.6. Sample a random guidance scale w from U[w_min, w_max] and embed it
+                # 5. Sample a random guidance scale w from U[w_min, w_max]
+                # Note that for LCM-LoRA distillation it is not necessary to use a guidance scale embedding
                w = (args.w_max - args.w_min) * torch.rand((bsz,)) + args.w_min
                w = w.reshape(bsz, 1, 1, 1)
                w = w.to(device=latents.device, dtype=latents.dtype)

-                # 20.4.8. Prepare prompt embeds and unet_added_conditions
+                # 6. Prepare prompt embeds and unet_added_conditions
                prompt_embeds = encoded_text.pop("prompt_embeds")

-                # 20.4.9. Get online LCM prediction on z_{t_{n + k}}, w, c, t_{n + k}
+                # 7. Get online LCM prediction on z_{t_{n + k}} (noisy_model_input), w, c, t_{n + k} (start_timesteps)
                noise_pred = unet(
                    noisy_model_input,
                    start_timesteps,
@@ -1230,7 +1262,7 @@ def main(args):
                    added_cond_kwargs=encoded_text,
                ).sample

-                pred_x_0 = predicted_origin(
+                pred_x_0 = get_predicted_original_sample(
                    noise_pred,
                    start_timesteps,
                    noisy_model_input,
@@ -1241,18 +1273,28 @@ def main(args):

                model_pred = c_skip_start * noisy_model_input + c_out_start * pred_x_0

-                # 20.4.10. Use the ODE solver to predict the kth step in the augmented PF-ODE trajectory after
-                # noisy_latents with both the conditioning embedding c and unconditional embedding 0
-                # Get teacher model prediction on noisy_latents and conditional embedding
+                # 8. Compute the conditional and unconditional teacher model predictions to get CFG estimates of the
+                # predicted noise eps_0 and predicted original sample x_0, then run the ODE solver using these
+                # estimates to predict the data point in the augmented PF-ODE trajectory corresponding to the next ODE
+                # solver timestep.
                with torch.no_grad():
                    with torch.autocast("cuda"):
+                        # 1. Get teacher model prediction on noisy_model_input z_{t_{n + k}} and conditional embedding c
                        cond_teacher_output = teacher_unet(
                            noisy_model_input.to(weight_dtype),
                            start_timesteps,
                            encoder_hidden_states=prompt_embeds.to(weight_dtype),
                            added_cond_kwargs={k: v.to(weight_dtype) for k, v in encoded_text.items()},
                        ).sample
-                        cond_pred_x0 = predicted_origin(
+                        cond_pred_x0 = get_predicted_original_sample(
+                            cond_teacher_output,
+                            start_timesteps,
+                            noisy_model_input,
+                            noise_scheduler.config.prediction_type,
+                            alpha_schedule,
+                            sigma_schedule,
+                        )
+                        cond_pred_noise = get_predicted_noise(
                            cond_teacher_output,
                            start_timesteps,
                            noisy_model_input,
@@ -1261,7 +1303,7 @@ def main(args):
                            sigma_schedule,
                        )

-                        # Get teacher model prediction on noisy_latents and unconditional embedding
+                        # 2. Get teacher model prediction on noisy_model_input z_{t_{n + k}} and unconditional embedding 0
                        uncond_added_conditions = copy.deepcopy(encoded_text)
                        uncond_added_conditions["text_embeds"] = uncond_pooled_prompt_embeds
                        uncond_teacher_output = teacher_unet(
@@ -1270,7 +1312,15 @@ def main(args):
                            encoder_hidden_states=uncond_prompt_embeds.to(weight_dtype),
                            added_cond_kwargs={k: v.to(weight_dtype) for k, v in uncond_added_conditions.items()},
                        ).sample
-                        uncond_pred_x0 = predicted_origin(
+                        uncond_pred_x0 = get_predicted_original_sample(
+                            uncond_teacher_output,
+                            start_timesteps,
+                            noisy_model_input,
+                            noise_scheduler.config.prediction_type,
+                            alpha_schedule,
+                            sigma_schedule,
+                        )
+                        uncond_pred_noise = get_predicted_noise(
                            uncond_teacher_output,
                            start_timesteps,
                            noisy_model_input,
@@ -1279,12 +1329,17 @@ def main(args):
                            sigma_schedule,
                        )

-                        # 20.4.11. Perform "CFG" to get x_prev estimate (using the LCM paper's CFG formulation)
+                        # 3. Calculate the CFG estimate of x_0 (pred_x0) and eps_0 (pred_noise)
+                        # Note that this uses the LCM paper's CFG formulation rather than the Imagen CFG formulation
                        pred_x0 = cond_pred_x0 + w * (cond_pred_x0 - uncond_pred_x0)
-                        pred_noise = cond_teacher_output + w * (cond_teacher_output - uncond_teacher_output)
+                        pred_noise = cond_pred_noise + w * (cond_pred_noise - uncond_pred_noise)
+                        # 4. Run one step of the ODE solver to estimate the next point x_prev on the
+                        # augmented PF-ODE trajectory (solving backward in time)
+                        # Note that the DDIM step depends on both the predicted x_0 and source noise eps_0.
                        x_prev = solver.ddim_step(pred_x0, pred_noise, index)

-                # 20.4.12. Get target LCM prediction on x_prev, w, c, t_n
+                # 9. Get target LCM prediction on x_prev, w, c, t_n (timesteps)
+                # Note that we do not use a separate target network for LCM-LoRA distillation.
                with torch.no_grad():
                    with torch.autocast("cuda", enabled=True, dtype=weight_dtype):
                        target_noise_pred = unet(
@@ -1294,7 +1349,7 @@ def main(args):
                            encoder_hidden_states=prompt_embeds.float(),
                            added_cond_kwargs=encoded_text,
                        ).sample
-                    pred_x_0 = predicted_origin(
+                    pred_x_0 = get_predicted_original_sample(
                        target_noise_pred,
                        timesteps,
                        x_prev,
@@ -1304,7 +1359,7 @@ def main(args):
                    )
                    target = c_skip * x_prev + c_out * pred_x_0

-                # 20.4.13. Calculate loss
+                # 10. Calculate loss
                if args.loss_type == "l2":
                    loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
                elif args.loss_type == "huber":
@@ -1312,7 +1367,7 @@ def main(args):
                        torch.sqrt((model_pred.float() - target.float()) ** 2 + args.huber_c**2) - args.huber_c
                    )

-                # 20.4.14. Backpropagate on the online student model (`unet`)
+                # 11. Backpropagate on the online student model (`unet`)
                accelerator.backward(loss)
                if accelerator.sync_gradients:
                    accelerator.clip_grad_norm_(unet.parameters(), args.max_grad_norm)
@@ -1369,6 +1424,14 @@ def main(args):
        lora_state_dict = get_peft_model_state_dict(unet, adapter_name="default")
        StableDiffusionXLPipeline.save_lora_weights(os.path.join(args.output_dir, "unet_lora"), lora_state_dict)

+        if args.push_to_hub:
+            upload_folder(
+                repo_id=repo_id,
+                folder_path=args.output_dir,
+                commit_message="End of training",
+                ignore_patterns=["step_*", "epoch_*"],
+            )
+
    accelerator.end_training()


--- a/examples/consistency_distillation/train_lcm_distill_sd_wds.py
+++ b/examples/consistency_distillation/train_lcm_distill_sd_wds.py
@@ -38,7 +38,7 @@ from accelerate import Accelerator
 from accelerate.logging import get_logger
 from accelerate.utils import ProjectConfiguration, set_seed
 from braceexpand import braceexpand
-from huggingface_hub import create_repo
+from huggingface_hub import create_repo, upload_folder
 from packaging import version
 from torch.utils.data import default_collate
 from torchvision import transforms
@@ -70,7 +70,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.18.0.dev0")
+check_min_version("0.25.0.dev0")

 logger = get_logger(__name__)

@@ -138,7 +138,7 @@ class WebdatasetFilter:
            return False


-class Text2ImageDataset:
+class SDText2ImageDataset:
    def __init__(
        self,
        train_shards_path_or_url: Union[str, List[str]],
@@ -336,19 +336,43 @@ def scalings_for_boundary_conditions(timestep, sigma_data=0.5, timestep_scaling=


 # Compare LCMScheduler.step, Step 4
-def predicted_origin(model_output, timesteps, sample, prediction_type, alphas, sigmas):
+def get_predicted_original_sample(model_output, timesteps, sample, prediction_type, alphas, sigmas):
+    alphas = extract_into_tensor(alphas, timesteps, sample.shape)
+    sigmas = extract_into_tensor(sigmas, timesteps, sample.shape)
    if prediction_type == "epsilon":
-        sigmas = extract_into_tensor(sigmas, timesteps, sample.shape)
-        alphas = extract_into_tensor(alphas, timesteps, sample.shape)
        pred_x_0 = (sample - sigmas * model_output) / alphas
+    elif prediction_type == "sample":
+        pred_x_0 = model_output
    elif prediction_type == "v_prediction":
-        pred_x_0 = alphas[timesteps] * sample - sigmas[timesteps] * model_output
+        pred_x_0 = alphas * sample - sigmas * model_output
    else:
-        raise ValueError(f"Prediction type {prediction_type} currently not supported.")
+        raise ValueError(
+            f"Prediction type {prediction_type} is not supported; currently, `epsilon`, `sample`, and `v_prediction`"
+            f" are supported."
+        )

    return pred_x_0


+# Based on step 4 in DDIMScheduler.step
+def get_predicted_noise(model_output, timesteps, sample, prediction_type, alphas, sigmas):
+    alphas = extract_into_tensor(alphas, timesteps, sample.shape)
+    sigmas = extract_into_tensor(sigmas, timesteps, sample.shape)
+    if prediction_type == "epsilon":
+        pred_epsilon = model_output
+    elif prediction_type == "sample":
+        pred_epsilon = (sample - alphas * model_output) / sigmas
+    elif prediction_type == "v_prediction":
+        pred_epsilon = alphas * model_output + sigmas * sample
+    else:
+        raise ValueError(
+            f"Prediction type {prediction_type} is not supported; currently, `epsilon`, `sample`, and `v_prediction`"
+            f" are supported."
+        )
+
+    return pred_epsilon
+
+
 def extract_into_tensor(a, t, x_shape):
    b, *_ = t.shape
    out = a.gather(-1, t)
@@ -400,7 +424,7 @@ def import_model_class_from_model_name_or_path(
    pretrained_model_name_or_path: str, revision: str, subfolder: str = "text_encoder"
 ):
    text_encoder_config = PretrainedConfig.from_pretrained(
-        pretrained_model_name_or_path, subfolder=subfolder, revision=revision, use_auth_token=True
+        pretrained_model_name_or_path, subfolder=subfolder, revision=revision
    )
    model_class = text_encoder_config.architectures[0]

@@ -657,6 +681,15 @@ def parse_args():
        default=0.001,
        help="The huber loss parameter. Only used if `--loss_type=huber`.",
    )
+    parser.add_argument(
+        "--unet_time_cond_proj_dim",
+        type=int,
+        default=256,
+        help=(
+            "The dimension of the guidance scale embedding in the U-Net, which will be used if the teacher U-Net"
+            " does not have `time_cond_proj_dim` set."
+        ),
+    )
    # ----Exponential Moving Average (EMA)----
    parser.add_argument(
        "--ema_decay",
@@ -802,7 +835,7 @@ def main(args):
            os.makedirs(args.output_dir, exist_ok=True)

        if args.push_to_hub:
-            create_repo(
+            repo_id = create_repo(
                repo_id=args.hub_model_id or Path(args.output_dir).name,
                exist_ok=True,
                token=args.hub_token,
@@ -814,34 +847,35 @@ def main(args):
        args.pretrained_teacher_model, subfolder="scheduler", revision=args.teacher_revision
    )

-    # The scheduler calculates the alpha and sigma schedule for us
+    # DDPMScheduler calculates the alpha and sigma noise schedules (based on the alpha bars) for us
    alpha_schedule = torch.sqrt(noise_scheduler.alphas_cumprod)
    sigma_schedule = torch.sqrt(1 - noise_scheduler.alphas_cumprod)
+    # Initialize the DDIM ODE solver for distillation.
    solver = DDIMSolver(
        noise_scheduler.alphas_cumprod.numpy(),
        timesteps=noise_scheduler.config.num_train_timesteps,
        ddim_timesteps=args.num_ddim_timesteps,
    )

-    # 2. Load tokenizers from SD-XL checkpoint.
+    # 2. Load tokenizers from SD 1.X/2.X checkpoint.
    tokenizer = AutoTokenizer.from_pretrained(
        args.pretrained_teacher_model, subfolder="tokenizer", revision=args.teacher_revision, use_fast=False
    )

-    # 3. Load text encoders from SD-1.5 checkpoint.
+    # 3. Load text encoders from SD 1.X/2.X checkpoint.
    # import correct text encoder classes
    text_encoder = CLIPTextModel.from_pretrained(
        args.pretrained_teacher_model, subfolder="text_encoder", revision=args.teacher_revision
    )

-    # 4. Load VAE from SD-XL checkpoint (or more stable VAE)
+    # 4. Load VAE from SD 1.X/2.X checkpoint
    vae = AutoencoderKL.from_pretrained(
        args.pretrained_teacher_model,
        subfolder="vae",
        revision=args.teacher_revision,
    )

-    # 5. Load teacher U-Net from SD-XL checkpoint
+    # 5. Load teacher U-Net from SD 1.X/2.X checkpoint
    teacher_unet = UNet2DConditionModel.from_pretrained(
        args.pretrained_teacher_model, subfolder="unet", revision=args.teacher_revision
    )
@@ -851,17 +885,18 @@ def main(args):
    text_encoder.requires_grad_(False)
    teacher_unet.requires_grad_(False)

-    # 8. Create online (`unet`) student U-Nets. This will be updated by the optimizer (e.g. via backpropagation.)
+    # 7. Create online student U-Net. This will be updated by the optimizer (e.g. via backpropagation.)
    # Add `time_cond_proj_dim` to the student U-Net if `teacher_unet.config.time_cond_proj_dim` is None
    if teacher_unet.config.time_cond_proj_dim is None:
        teacher_unet.config["time_cond_proj_dim"] = args.unet_time_cond_proj_dim
+    time_cond_proj_dim = teacher_unet.config.time_cond_proj_dim
    unet = UNet2DConditionModel(**teacher_unet.config)
    # load teacher_unet weights into unet
    unet.load_state_dict(teacher_unet.state_dict(), strict=False)
    unet.train()

-    # 9. Create target (`ema_unet`) student U-Net parameters. This will be updated via EMA updates (polyak averaging).
-    # Initialize from unet
+    # 8. Create target student U-Net. This will be updated via EMA updates (polyak averaging).
+    # Initialize from (online) unet
    target_unet = UNet2DConditionModel(**teacher_unet.config)
    target_unet.load_state_dict(unet.state_dict())
    target_unet.train()
@@ -878,7 +913,7 @@ def main(args):
            f"Controlnet loaded as datatype {accelerator.unwrap_model(unet).dtype}. {low_precision_error_string}"
        )

-    # 10. Handle mixed precision and device placement
+    # 9. Handle mixed precision and device placement
    # For mixed precision training we cast all non-trainable weigths to half-precision
    # as these weights are only used for inference, keeping weights in full precision is not required.
    weight_dtype = torch.float32
@@ -905,7 +940,7 @@ def main(args):
    sigma_schedule = sigma_schedule.to(accelerator.device)
    solver = solver.to(accelerator.device)

-    # 11. Handle saving and loading of checkpoints
+    # 10. Handle saving and loading of checkpoints
    # `accelerate` 0.16.0 will have better support for customized saving
    if version.parse(accelerate.__version__) >= version.parse("0.16.0"):
        # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format
@@ -939,7 +974,7 @@ def main(args):
        accelerator.register_save_state_pre_hook(save_model_hook)
        accelerator.register_load_state_pre_hook(load_model_hook)

-    # 12. Enable optimizations
+    # 11. Enable optimizations
    if args.enable_xformers_memory_efficient_attention:
        if is_xformers_available():
            import xformers
@@ -985,13 +1020,14 @@ def main(args):
        eps=args.adam_epsilon,
    )

+    # 13. Dataset creation and data processing
    # Here, we compute not just the text embeddings but also the additional embeddings
    # needed for the SD XL UNet to operate.
    def compute_embeddings(prompt_batch, proportion_empty_prompts, text_encoder, tokenizer, is_train=True):
        prompt_embeds = encode_prompt(prompt_batch, text_encoder, tokenizer, proportion_empty_prompts, is_train)
        return {"prompt_embeds": prompt_embeds}

-    dataset = Text2ImageDataset(
+    dataset = SDText2ImageDataset(
        train_shards_path_or_url=args.train_shards_path_or_url,
        num_train_examples=args.max_train_samples,
        per_gpu_batch_size=args.train_batch_size,
@@ -1011,6 +1047,7 @@ def main(args):
        tokenizer=tokenizer,
    )

+    # 14. LR Scheduler creation
    # Scheduler and math around the number of training steps.
    overrode_max_train_steps = False
    num_update_steps_per_epoch = math.ceil(train_dataloader.num_batches / args.gradient_accumulation_steps)
@@ -1025,6 +1062,7 @@ def main(args):
        num_training_steps=args.max_train_steps,
    )

+    # 15. Prepare for training
    # Prepare everything with our `accelerator`.
    unet, optimizer, lr_scheduler = accelerator.prepare(unet, optimizer, lr_scheduler)

@@ -1046,7 +1084,7 @@ def main(args):
    ).input_ids.to(accelerator.device)
    uncond_prompt_embeds = text_encoder(uncond_input_ids)[0]

-    # Train!
+    # 16. Train!
    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps

    logger.info("***** Running training *****")
@@ -1097,7 +1135,8 @@ def main(args):
    for epoch in range(first_epoch, args.num_train_epochs):
        for step, batch in enumerate(train_dataloader):
            with accelerator.accumulate(unet):
-                image, text, _, _ = batch
+                # 1. Load and process the image and text conditioning
+                image, text = batch

                image = image.to(accelerator.device, non_blocking=True)
                encoded_text = compute_embeddings_fn(text)
@@ -1114,40 +1153,39 @@ def main(args):

                latents = latents * vae.config.scaling_factor
                latents = latents.to(weight_dtype)
-
-                # Sample noise that we'll add to the latents
-                noise = torch.randn_like(latents)
                bsz = latents.shape[0]

-                # Sample a random timestep for each image t_n ~ U[0, N - k - 1] without bias.
+                # 2. Sample a random timestep for each image t_n from the ODE solver timesteps without bias.
+                # For the DDIM solver, the timestep schedule is [T - 1, T - k - 1, T - 2 * k - 1, ...]
                topk = noise_scheduler.config.num_train_timesteps // args.num_ddim_timesteps
                index = torch.randint(0, args.num_ddim_timesteps, (bsz,), device=latents.device).long()
                start_timesteps = solver.ddim_timesteps[index]
                timesteps = start_timesteps - topk
                timesteps = torch.where(timesteps < 0, torch.zeros_like(timesteps), timesteps)

-                # 20.4.4. Get boundary scalings for start_timesteps and (end) timesteps.
+                # 3. Get boundary scalings for start_timesteps and (end) timesteps.
                c_skip_start, c_out_start = scalings_for_boundary_conditions(start_timesteps)
                c_skip_start, c_out_start = [append_dims(x, latents.ndim) for x in [c_skip_start, c_out_start]]
                c_skip, c_out = scalings_for_boundary_conditions(timesteps)
                c_skip, c_out = [append_dims(x, latents.ndim) for x in [c_skip, c_out]]

-                # 20.4.5. Add noise to the latents according to the noise magnitude at each timestep
-                # (this is the forward diffusion process) [z_{t_{n + k}} in Algorithm 1]
+                # 4. Sample noise from the prior and add it to the latents according to the noise magnitude at each
+                # timestep (this is the forward diffusion process) [z_{t_{n + k}} in Algorithm 1]
+                noise = torch.randn_like(latents)
                noisy_model_input = noise_scheduler.add_noise(latents, noise, start_timesteps)

-                # 20.4.6. Sample a random guidance scale w from U[w_min, w_max] and embed it
+                # 5. Sample a random guidance scale w from U[w_min, w_max] and embed it
                w = (args.w_max - args.w_min) * torch.rand((bsz,)) + args.w_min
-                w_embedding = guidance_scale_embedding(w, embedding_dim=args.unet_time_cond_proj_dim)
+                w_embedding = guidance_scale_embedding(w, embedding_dim=time_cond_proj_dim)
                w = w.reshape(bsz, 1, 1, 1)
                # Move to U-Net device and dtype
                w = w.to(device=latents.device, dtype=latents.dtype)
                w_embedding = w_embedding.to(device=latents.device, dtype=latents.dtype)

-                # 20.4.8. Prepare prompt embeds and unet_added_conditions
+                # 6. Prepare prompt embeds and unet_added_conditions
                prompt_embeds = encoded_text.pop("prompt_embeds")

-                # 20.4.9. Get online LCM prediction on z_{t_{n + k}}, w, c, t_{n + k}
+                # 7. Get online LCM prediction on z_{t_{n + k}} (noisy_model_input), w, c, t_{n + k} (start_timesteps)
                noise_pred = unet(
                    noisy_model_input,
                    start_timesteps,
@@ -1156,7 +1194,7 @@ def main(args):
                    added_cond_kwargs=encoded_text,
                ).sample

-                pred_x_0 = predicted_origin(
+                pred_x_0 = get_predicted_original_sample(
                    noise_pred,
                    start_timesteps,
                    noisy_model_input,
@@ -1167,17 +1205,27 @@ def main(args):

                model_pred = c_skip_start * noisy_model_input + c_out_start * pred_x_0

-                # 20.4.10. Use the ODE solver to predict the kth step in the augmented PF-ODE trajectory after
-                # noisy_latents with both the conditioning embedding c and unconditional embedding 0
-                # Get teacher model prediction on noisy_latents and conditional embedding
+                # 8. Compute the conditional and unconditional teacher model predictions to get CFG estimates of the
+                # predicted noise eps_0 and predicted original sample x_0, then run the ODE solver using these
+                # estimates to predict the data point in the augmented PF-ODE trajectory corresponding to the next ODE
+                # solver timestep.
                with torch.no_grad():
                    with torch.autocast("cuda"):
+                        # 1. Get teacher model prediction on noisy_model_input z_{t_{n + k}} and conditional embedding c
                        cond_teacher_output = teacher_unet(
                            noisy_model_input.to(weight_dtype),
                            start_timesteps,
                            encoder_hidden_states=prompt_embeds.to(weight_dtype),
                        ).sample
-                        cond_pred_x0 = predicted_origin(
+                        cond_pred_x0 = get_predicted_original_sample(
+                            cond_teacher_output,
+                            start_timesteps,
+                            noisy_model_input,
+                            noise_scheduler.config.prediction_type,
+                            alpha_schedule,
+                            sigma_schedule,
+                        )
+                        cond_pred_noise = get_predicted_noise(
                            cond_teacher_output,
                            start_timesteps,
                            noisy_model_input,
@@ -1186,13 +1234,21 @@ def main(args):
                            sigma_schedule,
                        )

-                        # Get teacher model prediction on noisy_latents and unconditional embedding
+                        # 2. Get teacher model prediction on noisy_model_input z_{t_{n + k}} and unconditional embedding 0
                        uncond_teacher_output = teacher_unet(
                            noisy_model_input.to(weight_dtype),
                            start_timesteps,
                            encoder_hidden_states=uncond_prompt_embeds.to(weight_dtype),
                        ).sample
-                        uncond_pred_x0 = predicted_origin(
+                        uncond_pred_x0 = get_predicted_original_sample(
+                            uncond_teacher_output,
+                            start_timesteps,
+                            noisy_model_input,
+                            noise_scheduler.config.prediction_type,
+                            alpha_schedule,
+                            sigma_schedule,
+                        )
+                        uncond_pred_noise = get_predicted_noise(
                            uncond_teacher_output,
                            start_timesteps,
                            noisy_model_input,
@@ -1201,12 +1257,16 @@ def main(args):
                            sigma_schedule,
                        )

-                        # 20.4.11. Perform "CFG" to get x_prev estimate (using the LCM paper's CFG formulation)
+                        # 3. Calculate the CFG estimate of x_0 (pred_x0) and eps_0 (pred_noise)
+                        # Note that this uses the LCM paper's CFG formulation rather than the Imagen CFG formulation
                        pred_x0 = cond_pred_x0 + w * (cond_pred_x0 - uncond_pred_x0)
-                        pred_noise = cond_teacher_output + w * (cond_teacher_output - uncond_teacher_output)
+                        pred_noise = cond_pred_noise + w * (cond_pred_noise - uncond_pred_noise)
+                        # 4. Run one step of the ODE solver to estimate the next point x_prev on the
+                        # augmented PF-ODE trajectory (solving backward in time)
+                        # Note that the DDIM step depends on both the predicted x_0 and source noise eps_0.
                        x_prev = solver.ddim_step(pred_x0, pred_noise, index)

-                # 20.4.12. Get target LCM prediction on x_prev, w, c, t_n
+                # 9. Get target LCM prediction on x_prev, w, c, t_n (timesteps)
                with torch.no_grad():
                    with torch.autocast("cuda", dtype=weight_dtype):
                        target_noise_pred = target_unet(
@@ -1215,7 +1275,7 @@ def main(args):
                            timestep_cond=w_embedding,
                            encoder_hidden_states=prompt_embeds.float(),
                        ).sample
-                    pred_x_0 = predicted_origin(
+                    pred_x_0 = get_predicted_original_sample(
                        target_noise_pred,
                        timesteps,
                        x_prev,
@@ -1225,7 +1285,7 @@ def main(args):
                    )
                    target = c_skip * x_prev + c_out * pred_x_0

-                # 20.4.13. Calculate loss
+                # 10. Calculate loss
                if args.loss_type == "l2":
                    loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
                elif args.loss_type == "huber":
@@ -1233,7 +1293,7 @@ def main(args):
                        torch.sqrt((model_pred.float() - target.float()) ** 2 + args.huber_c**2) - args.huber_c
                    )

-                # 20.4.14. Backpropagate on the online student model (`unet`)
+                # 11. Backpropagate on the online student model (`unet`)
                accelerator.backward(loss)
                if accelerator.sync_gradients:
                    accelerator.clip_grad_norm_(unet.parameters(), args.max_grad_norm)
@@ -1243,7 +1303,7 @@ def main(args):

            # Checks if the accelerator has performed an optimization step behind the scenes
            if accelerator.sync_gradients:
-                # 20.4.15. Make EMA update to target student model parameters
+                # 12. Make EMA update to target student model parameters (`target_unet`)
                update_ema(target_unet.parameters(), unet.parameters(), args.ema_decay)
                progress_bar.update(1)
                global_step += 1
@@ -1294,6 +1354,14 @@ def main(args):
        target_unet = accelerator.unwrap_model(target_unet)
        target_unet.save_pretrained(os.path.join(args.output_dir, "unet_target"))

+        if args.push_to_hub:
+            upload_folder(
+                repo_id=repo_id,
+                folder_path=args.output_dir,
+                commit_message="End of training",
+                ignore_patterns=["step_*", "epoch_*"],
+            )
+
    accelerator.end_training()


--- a/examples/consistency_distillation/train_lcm_distill_sdxl_wds.py
+++ b/examples/consistency_distillation/train_lcm_distill_sdxl_wds.py
@@ -39,7 +39,7 @@ from accelerate import Accelerator
 from accelerate.logging import get_logger
 from accelerate.utils import ProjectConfiguration, set_seed
 from braceexpand import braceexpand
-from huggingface_hub import create_repo
+from huggingface_hub import create_repo, upload_folder
 from packaging import version
 from torch.utils.data import default_collate
 from torchvision import transforms
@@ -67,11 +67,16 @@ from diffusers.utils.import_utils import is_xformers_available

 MAX_SEQ_LENGTH = 77

+# Adjust for your dataset
+WDS_JSON_WIDTH = "width"  # original_width for LAION
+WDS_JSON_HEIGHT = "height"  # original_height for LAION
+MIN_SIZE = 700  # ~960 for LAION, ideal: 1024 if the dataset contains large images
+
 if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.18.0.dev0")
+check_min_version("0.25.0.dev0")

 logger = get_logger(__name__)

@@ -128,10 +133,10 @@ class WebdatasetFilter:
        try:
            if "json" in x:
                x_json = json.loads(x["json"])
-                filter_size = (x_json.get("original_width", 0.0) or 0.0) >= self.min_size and x_json.get(
-                    "original_height", 0
+                filter_size = (x_json.get(WDS_JSON_WIDTH, 0.0) or 0.0) >= self.min_size and x_json.get(
+                    WDS_JSON_HEIGHT, 0
                ) >= self.min_size
-                filter_watermark = (x_json.get("pwatermark", 1.0) or 1.0) <= self.max_pwatermark
+                filter_watermark = (x_json.get("pwatermark", 0.0) or 0.0) <= self.max_pwatermark
                return filter_size and filter_watermark
            else:
                return False
@@ -139,7 +144,7 @@ class WebdatasetFilter:
            return False


-class Text2ImageDataset:
+class SDXLText2ImageDataset:
    def __init__(
        self,
        train_shards_path_or_url: Union[str, List[str]],
@@ -162,7 +167,7 @@ class Text2ImageDataset:
            if use_fix_crop_and_size:
                return (resolution, resolution)
            else:
-                return (int(json.get("original_width", 0.0)), int(json.get("original_height", 0.0)))
+                return (int(json.get(WDS_JSON_WIDTH, 0.0)), int(json.get(WDS_JSON_HEIGHT, 0.0)))

        def transform(example):
            # resize image
@@ -194,7 +199,7 @@ class Text2ImageDataset:
        pipeline = [
            wds.ResampledShards(train_shards_path_or_url),
            tarfile_to_samples_nothrow,
-            wds.select(WebdatasetFilter(min_size=960)),
+            wds.select(WebdatasetFilter(min_size=MIN_SIZE)),
            wds.shuffle(shuffle_buffer_size),
            *processing_pipeline,
            wds.batched(per_gpu_batch_size, partial=False, collation_fn=default_collate),
@@ -319,19 +324,43 @@ def scalings_for_boundary_conditions(timestep, sigma_data=0.5, timestep_scaling=


 # Compare LCMScheduler.step, Step 4
-def predicted_origin(model_output, timesteps, sample, prediction_type, alphas, sigmas):
+def get_predicted_original_sample(model_output, timesteps, sample, prediction_type, alphas, sigmas):
+    alphas = extract_into_tensor(alphas, timesteps, sample.shape)
+    sigmas = extract_into_tensor(sigmas, timesteps, sample.shape)
    if prediction_type == "epsilon":
-        sigmas = extract_into_tensor(sigmas, timesteps, sample.shape)
-        alphas = extract_into_tensor(alphas, timesteps, sample.shape)
        pred_x_0 = (sample - sigmas * model_output) / alphas
+    elif prediction_type == "sample":
+        pred_x_0 = model_output
    elif prediction_type == "v_prediction":
-        pred_x_0 = alphas[timesteps] * sample - sigmas[timesteps] * model_output
+        pred_x_0 = alphas * sample - sigmas * model_output
    else:
-        raise ValueError(f"Prediction type {prediction_type} currently not supported.")
+        raise ValueError(
+            f"Prediction type {prediction_type} is not supported; currently, `epsilon`, `sample`, and `v_prediction`"
+            f" are supported."
+        )

    return pred_x_0


+# Based on step 4 in DDIMScheduler.step
+def get_predicted_noise(model_output, timesteps, sample, prediction_type, alphas, sigmas):
+    alphas = extract_into_tensor(alphas, timesteps, sample.shape)
+    sigmas = extract_into_tensor(sigmas, timesteps, sample.shape)
+    if prediction_type == "epsilon":
+        pred_epsilon = model_output
+    elif prediction_type == "sample":
+        pred_epsilon = (sample - alphas * model_output) / sigmas
+    elif prediction_type == "v_prediction":
+        pred_epsilon = alphas * model_output + sigmas * sample
+    else:
+        raise ValueError(
+            f"Prediction type {prediction_type} is not supported; currently, `epsilon`, `sample`, and `v_prediction`"
+            f" are supported."
+        )
+
+    return pred_epsilon
+
+
 def extract_into_tensor(a, t, x_shape):
    b, *_ = t.shape
    out = a.gather(-1, t)
@@ -414,7 +443,7 @@ def import_model_class_from_model_name_or_path(
    pretrained_model_name_or_path: str, revision: str, subfolder: str = "text_encoder"
 ):
    text_encoder_config = PretrainedConfig.from_pretrained(
-        pretrained_model_name_or_path, subfolder=subfolder, revision=revision, use_auth_token=True
+        pretrained_model_name_or_path, subfolder=subfolder, revision=revision
    )
    model_class = text_encoder_config.architectures[0]

@@ -677,6 +706,15 @@ def parse_args():
        default=0.001,
        help="The huber loss parameter. Only used if `--loss_type=huber`.",
    )
+    parser.add_argument(
+        "--unet_time_cond_proj_dim",
+        type=int,
+        default=256,
+        help=(
+            "The dimension of the guidance scale embedding in the U-Net, which will be used if the teacher U-Net"
+            " does not have `time_cond_proj_dim` set."
+        ),
+    )
    # ----Exponential Moving Average (EMA)----
    parser.add_argument(
        "--ema_decay",
@@ -837,7 +875,7 @@ def main(args):
            os.makedirs(args.output_dir, exist_ok=True)

        if args.push_to_hub:
-            create_repo(
+            repo_id = create_repo(
                repo_id=args.hub_model_id or Path(args.output_dir).name,
                exist_ok=True,
                token=args.hub_token,
@@ -849,9 +887,10 @@ def main(args):
        args.pretrained_teacher_model, subfolder="scheduler", revision=args.teacher_revision
    )

-    # The scheduler calculates the alpha and sigma schedule for us
+    # DDPMScheduler calculates the alpha and sigma noise schedules (based on the alpha bars) for us
    alpha_schedule = torch.sqrt(noise_scheduler.alphas_cumprod)
    sigma_schedule = torch.sqrt(1 - noise_scheduler.alphas_cumprod)
+    # Initialize the DDIM ODE solver for distillation.
    solver = DDIMSolver(
        noise_scheduler.alphas_cumprod.numpy(),
        timesteps=noise_scheduler.config.num_train_timesteps,
@@ -905,17 +944,18 @@ def main(args):
    text_encoder_two.requires_grad_(False)
    teacher_unet.requires_grad_(False)

-    # 8. Create online (`unet`) student U-Nets. This will be updated by the optimizer (e.g. via backpropagation.)
+    # 7. Create online student U-Net. This will be updated by the optimizer (e.g. via backpropagation.)
    # Add `time_cond_proj_dim` to the student U-Net if `teacher_unet.config.time_cond_proj_dim` is None
    if teacher_unet.config.time_cond_proj_dim is None:
        teacher_unet.config["time_cond_proj_dim"] = args.unet_time_cond_proj_dim
+    time_cond_proj_dim = teacher_unet.config.time_cond_proj_dim
    unet = UNet2DConditionModel(**teacher_unet.config)
    # load teacher_unet weights into unet
    unet.load_state_dict(teacher_unet.state_dict(), strict=False)
    unet.train()

-    # 9. Create target (`ema_unet`) student U-Net parameters. This will be updated via EMA updates (polyak averaging).
-    # Initialize from unet
+    # 8. Create target student U-Net. This will be updated via EMA updates (polyak averaging).
+    # Initialize from (online) unet
    target_unet = UNet2DConditionModel(**teacher_unet.config)
    target_unet.load_state_dict(unet.state_dict())
    target_unet.train()
@@ -957,6 +997,7 @@ def main(args):
    # Also move the alpha and sigma noise schedules to accelerator.device.
    alpha_schedule = alpha_schedule.to(accelerator.device)
    sigma_schedule = sigma_schedule.to(accelerator.device)
+    # Move the ODE solver to accelerator.device.
    solver = solver.to(accelerator.device)

    # 10. Handle saving and loading of checkpoints
@@ -1070,7 +1111,7 @@ def main(args):

        return {"prompt_embeds": prompt_embeds, **unet_added_cond_kwargs}

-    dataset = Text2ImageDataset(
+    dataset = SDXLText2ImageDataset(
        train_shards_path_or_url=args.train_shards_path_or_url,
        num_train_examples=args.max_train_samples,
        per_gpu_batch_size=args.train_batch_size,
@@ -1188,6 +1229,7 @@ def main(args):
    for epoch in range(first_epoch, args.num_train_epochs):
        for step, batch in enumerate(train_dataloader):
            with accelerator.accumulate(unet):
+                # 1. Load and process the image, text, and micro-conditioning (original image size, crop coordinates)
                image, text, orig_size, crop_coords = batch

                image = image.to(accelerator.device, non_blocking=True)
@@ -1209,46 +1251,48 @@ def main(args):
                latents = latents * vae.config.scaling_factor
                if args.pretrained_vae_model_name_or_path is None:
                    latents = latents.to(weight_dtype)
-
-                # Sample noise that we'll add to the latents
-                noise = torch.randn_like(latents)
                bsz = latents.shape[0]

-                # Sample a random timestep for each image t_n ~ U[0, N - k - 1] without bias.
+                # 2. Sample a random timestep for each image t_n from the ODE solver timesteps without bias.
+                # For the DDIM solver, the timestep schedule is [T - 1, T - k - 1, T - 2 * k - 1, ...]
                topk = noise_scheduler.config.num_train_timesteps // args.num_ddim_timesteps
                index = torch.randint(0, args.num_ddim_timesteps, (bsz,), device=latents.device).long()
                start_timesteps = solver.ddim_timesteps[index]
                timesteps = start_timesteps - topk
                timesteps = torch.where(timesteps < 0, torch.zeros_like(timesteps), timesteps)

-                # 20.4.4. Get boundary scalings for start_timesteps and (end) timesteps.
+                # 3. Get boundary scalings for start_timesteps and (end) timesteps.
                c_skip_start, c_out_start = scalings_for_boundary_conditions(start_timesteps)
                c_skip_start, c_out_start = [append_dims(x, latents.ndim) for x in [c_skip_start, c_out_start]]
                c_skip, c_out = scalings_for_boundary_conditions(timesteps)
                c_skip, c_out = [append_dims(x, latents.ndim) for x in [c_skip, c_out]]

-                # 20.4.5. Add noise to the latents according to the noise magnitude at each timestep
-                # (this is the forward diffusion process) [z_{t_{n + k}} in Algorithm 1]
+                # 4. Sample noise from the prior and add it to the latents according to the noise magnitude at each
+                # timestep (this is the forward diffusion process) [z_{t_{n + k}} in Algorithm 1]
+                noise = torch.randn_like(latents)
                noisy_model_input = noise_scheduler.add_noise(latents, noise, start_timesteps)

-                # 20.4.6. Sample a random guidance scale w from U[w_min, w_max] and embed it
+                # 5. Sample a random guidance scale w from U[w_min, w_max] and embed it
                w = (args.w_max - args.w_min) * torch.rand((bsz,)) + args.w_min
+                w_embedding = guidance_scale_embedding(w, embedding_dim=time_cond_proj_dim)
                w = w.reshape(bsz, 1, 1, 1)
+                # Move to U-Net device and dtype
                w = w.to(device=latents.device, dtype=latents.dtype)
+                w_embedding = w_embedding.to(device=latents.device, dtype=latents.dtype)

-                # 20.4.8. Prepare prompt embeds and unet_added_conditions
+                # 6. Prepare prompt embeds and unet_added_conditions
                prompt_embeds = encoded_text.pop("prompt_embeds")

-                # 20.4.9. Get online LCM prediction on z_{t_{n + k}}, w, c, t_{n + k}
+                # 7. Get online LCM prediction on z_{t_{n + k}} (noisy_model_input), w, c, t_{n + k} (start_timesteps)
                noise_pred = unet(
                    noisy_model_input,
                    start_timesteps,
-                    timestep_cond=None,
+                    timestep_cond=w_embedding,
                    encoder_hidden_states=prompt_embeds.float(),
                    added_cond_kwargs=encoded_text,
                ).sample

-                pred_x_0 = predicted_origin(
+                pred_x_0 = get_predicted_original_sample(
                    noise_pred,
                    start_timesteps,
                    noisy_model_input,
@@ -1259,18 +1303,28 @@ def main(args):

                model_pred = c_skip_start * noisy_model_input + c_out_start * pred_x_0

-                # 20.4.10. Use the ODE solver to predict the kth step in the augmented PF-ODE trajectory after
-                # noisy_latents with both the conditioning embedding c and unconditional embedding 0
-                # Get teacher model prediction on noisy_latents and conditional embedding
+                # 8. Compute the conditional and unconditional teacher model predictions to get CFG estimates of the
+                # predicted noise eps_0 and predicted original sample x_0, then run the ODE solver using these
+                # estimates to predict the data point in the augmented PF-ODE trajectory corresponding to the next ODE
+                # solver timestep.
                with torch.no_grad():
                    with torch.autocast("cuda"):
+                        # 1. Get teacher model prediction on noisy_model_input z_{t_{n + k}} and conditional embedding c
                        cond_teacher_output = teacher_unet(
                            noisy_model_input.to(weight_dtype),
                            start_timesteps,
                            encoder_hidden_states=prompt_embeds.to(weight_dtype),
                            added_cond_kwargs={k: v.to(weight_dtype) for k, v in encoded_text.items()},
                        ).sample
-                        cond_pred_x0 = predicted_origin(
+                        cond_pred_x0 = get_predicted_original_sample(
+                            cond_teacher_output,
+                            start_timesteps,
+                            noisy_model_input,
+                            noise_scheduler.config.prediction_type,
+                            alpha_schedule,
+                            sigma_schedule,
+                        )
+                        cond_pred_noise = get_predicted_noise(
                            cond_teacher_output,
                            start_timesteps,
                            noisy_model_input,
@@ -1279,7 +1333,7 @@ def main(args):
                            sigma_schedule,
                        )

-                        # Get teacher model prediction on noisy_latents and unconditional embedding
+                        # 2. Get teacher model prediction on noisy_model_input z_{t_{n + k}} and unconditional embedding 0
                        uncond_added_conditions = copy.deepcopy(encoded_text)
                        uncond_added_conditions["text_embeds"] = uncond_pooled_prompt_embeds
                        uncond_teacher_output = teacher_unet(
@@ -1288,7 +1342,15 @@ def main(args):
                            encoder_hidden_states=uncond_prompt_embeds.to(weight_dtype),
                            added_cond_kwargs={k: v.to(weight_dtype) for k, v in uncond_added_conditions.items()},
                        ).sample
-                        uncond_pred_x0 = predicted_origin(
+                        uncond_pred_x0 = get_predicted_original_sample(
+                            uncond_teacher_output,
+                            start_timesteps,
+                            noisy_model_input,
+                            noise_scheduler.config.prediction_type,
+                            alpha_schedule,
+                            sigma_schedule,
+                        )
+                        uncond_pred_noise = get_predicted_noise(
                            uncond_teacher_output,
                            start_timesteps,
                            noisy_model_input,
@@ -1297,22 +1359,26 @@ def main(args):
                            sigma_schedule,
                        )

-                        # 20.4.11. Perform "CFG" to get x_prev estimate (using the LCM paper's CFG formulation)
+                        # 3. Calculate the CFG estimate of x_0 (pred_x0) and eps_0 (pred_noise)
+                        # Note that this uses the LCM paper's CFG formulation rather than the Imagen CFG formulation
                        pred_x0 = cond_pred_x0 + w * (cond_pred_x0 - uncond_pred_x0)
-                        pred_noise = cond_teacher_output + w * (cond_teacher_output - uncond_teacher_output)
+                        pred_noise = cond_pred_noise + w * (cond_pred_noise - uncond_pred_noise)
+                        # 4. Run one step of the ODE solver to estimate the next point x_prev on the
+                        # augmented PF-ODE trajectory (solving backward in time)
+                        # Note that the DDIM step depends on both the predicted x_0 and source noise eps_0.
                        x_prev = solver.ddim_step(pred_x0, pred_noise, index)

-                # 20.4.12. Get target LCM prediction on x_prev, w, c, t_n
+                # 9. Get target LCM prediction on x_prev, w, c, t_n (timesteps)
                with torch.no_grad():
                    with torch.autocast("cuda", dtype=weight_dtype):
                        target_noise_pred = target_unet(
                            x_prev.float(),
                            timesteps,
-                            timestep_cond=None,
+                            timestep_cond=w_embedding,
                            encoder_hidden_states=prompt_embeds.float(),
                            added_cond_kwargs=encoded_text,
                        ).sample
-                    pred_x_0 = predicted_origin(
+                    pred_x_0 = get_predicted_original_sample(
                        target_noise_pred,
                        timesteps,
                        x_prev,
@@ -1322,7 +1388,7 @@ def main(args):
                    )
                    target = c_skip * x_prev + c_out * pred_x_0

-                # 20.4.13. Calculate loss
+                # 10. Calculate loss
                if args.loss_type == "l2":
                    loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
                elif args.loss_type == "huber":
@@ -1330,7 +1396,7 @@ def main(args):
                        torch.sqrt((model_pred.float() - target.float()) ** 2 + args.huber_c**2) - args.huber_c
                    )

-                # 20.4.14. Backpropagate on the online student model (`unet`)
+                # 11. Backpropagate on the online student model (`unet`)
                accelerator.backward(loss)
                if accelerator.sync_gradients:
                    accelerator.clip_grad_norm_(unet.parameters(), args.max_grad_norm)
@@ -1340,7 +1406,7 @@ def main(args):

            # Checks if the accelerator has performed an optimization step behind the scenes
            if accelerator.sync_gradients:
-                # 20.4.15. Make EMA update to target student model parameters
+                # 12. Make EMA update to target student model parameters (`target_unet`)
                update_ema(target_unet.parameters(), unet.parameters(), args.ema_decay)
                progress_bar.update(1)
                global_step += 1
@@ -1391,6 +1457,14 @@ def main(args):
        target_unet = accelerator.unwrap_model(target_unet)
        target_unet.save_pretrained(os.path.join(args.output_dir, "unet_target"))

+        if args.push_to_hub:
+            upload_folder(
+                repo_id=repo_id,
+                folder_path=args.output_dir,
+                commit_message="End of training",
+                ignore_patterns=["step_*", "epoch_*"],
+            )
+
    accelerator.end_training()


--- a/examples/controlnet/test_controlnet.py
+++ b/examples/controlnet/test_controlnet.py
@@ -0,0 +1,117 @@
+# coding=utf-8
+# Copyright 2023 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import os
+import sys
+import tempfile
+
+
+sys.path.append("..")
+from test_examples_utils import ExamplesTestsAccelerate, run_command  # noqa: E402
+
+
+logging.basicConfig(level=logging.DEBUG)
+
+logger = logging.getLogger()
+stream_handler = logging.StreamHandler(sys.stdout)
+logger.addHandler(stream_handler)
+
+
+class ControlNet(ExamplesTestsAccelerate):
+    def test_controlnet_checkpointing_checkpoints_total_limit(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            test_args = f"""
+            examples/controlnet/train_controlnet.py
+            --pretrained_model_name_or_path=hf-internal-testing/tiny-stable-diffusion-pipe
+            --dataset_name=hf-internal-testing/fill10
+            --output_dir={tmpdir}
+            --resolution=64
+            --train_batch_size=1
+            --gradient_accumulation_steps=1
+            --max_train_steps=6
+            --checkpoints_total_limit=2
+            --checkpointing_steps=2
+            --controlnet_model_name_or_path=hf-internal-testing/tiny-controlnet
+            """.split()
+
+            run_command(self._launch_args + test_args)
+
+            self.assertEqual(
+                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
+                {"checkpoint-4", "checkpoint-6"},
+            )
+
+    def test_controlnet_checkpointing_checkpoints_total_limit_removes_multiple_checkpoints(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            test_args = f"""
+            examples/controlnet/train_controlnet.py
+            --pretrained_model_name_or_path=hf-internal-testing/tiny-stable-diffusion-pipe
+            --dataset_name=hf-internal-testing/fill10
+            --output_dir={tmpdir}
+            --resolution=64
+            --train_batch_size=1
+            --gradient_accumulation_steps=1
+            --controlnet_model_name_or_path=hf-internal-testing/tiny-controlnet
+            --max_train_steps=6
+            --checkpointing_steps=2
+            """.split()
+
+            run_command(self._launch_args + test_args)
+
+            self.assertEqual(
+                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
+                {"checkpoint-2", "checkpoint-4", "checkpoint-6"},
+            )
+
+            resume_run_args = f"""
+            examples/controlnet/train_controlnet.py
+            --pretrained_model_name_or_path=hf-internal-testing/tiny-stable-diffusion-pipe
+            --dataset_name=hf-internal-testing/fill10
+            --output_dir={tmpdir}
+            --resolution=64
+            --train_batch_size=1
+            --gradient_accumulation_steps=1
+            --controlnet_model_name_or_path=hf-internal-testing/tiny-controlnet
+            --max_train_steps=8
+            --checkpointing_steps=2
+            --resume_from_checkpoint=checkpoint-6
+            --checkpoints_total_limit=2
+            """.split()
+
+            run_command(self._launch_args + resume_run_args)
+
+            self.assertEqual({x for x in os.listdir(tmpdir) if "checkpoint" in x}, {"checkpoint-6", "checkpoint-8"})
+
+
+class ControlNetSDXL(ExamplesTestsAccelerate):
+    def test_controlnet_sdxl(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            test_args = f"""
+            examples/controlnet/train_controlnet_sdxl.py
+            --pretrained_model_name_or_path=hf-internal-testing/tiny-stable-diffusion-xl-pipe
+            --dataset_name=hf-internal-testing/fill10
+            --output_dir={tmpdir}
+            --resolution=64
+            --train_batch_size=1
+            --gradient_accumulation_steps=1
+            --controlnet_model_name_or_path=hf-internal-testing/tiny-controlnet-sdxl
+            --max_train_steps=4
+            --checkpointing_steps=2
+            """.split()
+
+            run_command(self._launch_args + test_args)
+
+            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "diffusion_pytorch_model.safetensors")))
--- a/Show More
+++ b/Show More