Compare commits

..

3 Commits

Author SHA1 Message Date
Dhruv Nair
644846aa14 update 2023-12-08 12:28:50 +00:00
Dhruv Nair
6a90d8a023 Merge branch 'main' into compile-test-fix 2023-12-08 11:10:18 +00:00
Dhruv Nair
7270cbf421 update 2023-12-08 09:58:13 +00:00
56 changed files with 245 additions and 2042 deletions

View File

@@ -1,52 +0,0 @@
name: Benchmarking tests
on:
schedule:
- cron: "30 1 1,15 * *" # every 2 weeks on the 1st and the 15th of every month at 1:30 AM
env:
DIFFUSERS_IS_CI: yes
HF_HOME: /mnt/cache
OMP_NUM_THREADS: 8
MKL_NUM_THREADS: 8
jobs:
torch_pipelines_cuda_benchmark_tests:
name: Torch Core Pipelines CUDA Benchmarking Tests
strategy:
fail-fast: false
max-parallel: 1
runs-on: [single-gpu, nvidia-gpu, a10, ci]
container:
image: diffusers/diffusers-pytorch-cuda
options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0
steps:
- name: Checkout diffusers
uses: actions/checkout@v3
with:
fetch-depth: 2
- name: NVIDIA-SMI
run: |
nvidia-smi
- name: Install dependencies
run: |
apt-get update && apt-get install libsndfile1-dev libgl1 -y
python -m pip install -e .[quality,test]
python -m pip install pandas
- name: Environment
run: |
python utils/print_env.py
- name: Diffusers Benchmarking
env:
HUGGING_FACE_HUB_TOKEN: ${{ secrets.DIFFUSERS_BOT_TOKEN }}
BASE_PATH: benchmark_outputs
run: |
export TOTAL_GPU_MEMORY=$(python -c "import torch; print(torch.cuda.get_device_properties(0).total_memory / (1024**3))")
cd benchmarks && mkdir ${BASE_PATH} && python run_all.py && python push_results.py
- name: Test suite reports artifacts
if: ${{ always() }}
uses: actions/upload-artifact@v2
with:
name: benchmark_test_reports
path: benchmarks/benchmark_outputs

View File

@@ -3,7 +3,7 @@
# make sure to test the local checkout in scripts and not the pre-installed one (don't use quotes!)
export PYTHONPATH = src
check_dirs := examples scripts src tests utils benchmarks
check_dirs := examples scripts src tests utils
modified_only_fixup:
$(eval modified_py_files := $(shell python utils/get_modified_files.py $(check_dirs)))

View File

@@ -77,7 +77,7 @@ Please refer to the [How to use Stable Diffusion in Apple Silicon](https://huggi
## Quickstart
Generating outputs is super easy with 🤗 Diffusers. To generate an image from text, use the `from_pretrained` method to load any pretrained diffusion model (browse the [Hub](https://huggingface.co/models?library=diffusers&sort=downloads) for 16000+ checkpoints):
Generating outputs is super easy with 🤗 Diffusers. To generate an image from text, use the `from_pretrained` method to load any pretrained diffusion model (browse the [Hub](https://huggingface.co/models?library=diffusers&sort=downloads) for 15000+ checkpoints):
```python
from diffusers import DiffusionPipeline
@@ -219,7 +219,7 @@ Also, say 👋 in our public Discord channel <a href="https://discord.gg/G7tWnz9
- https://github.com/deep-floyd/IF
- https://github.com/bentoml/BentoML
- https://github.com/bmaltais/kohya_ss
- +7000 other amazing GitHub repositories 💪
- +6000 other amazing GitHub repositories 💪
Thank you for using us ❤️.

View File

@@ -1,297 +0,0 @@
import os
import sys
import torch
from diffusers import (
AutoPipelineForImage2Image,
AutoPipelineForInpainting,
AutoPipelineForText2Image,
ControlNetModel,
LCMScheduler,
StableDiffusionAdapterPipeline,
StableDiffusionControlNetPipeline,
StableDiffusionXLAdapterPipeline,
StableDiffusionXLControlNetPipeline,
T2IAdapter,
WuerstchenCombinedPipeline,
)
from diffusers.utils import load_image
sys.path.append(".")
from utils import ( # noqa: E402
BASE_PATH,
PROMPT,
BenchmarkInfo,
benchmark_fn,
bytes_to_giga_bytes,
flush,
generate_csv_dict,
write_to_csv,
)
RESOLUTION_MAPPING = {
"runwayml/stable-diffusion-v1-5": (512, 512),
"lllyasviel/sd-controlnet-canny": (512, 512),
"diffusers/controlnet-canny-sdxl-1.0": (1024, 1024),
"TencentARC/t2iadapter_canny_sd14v1": (512, 512),
"TencentARC/t2i-adapter-canny-sdxl-1.0": (1024, 1024),
"stabilityai/stable-diffusion-2-1": (768, 768),
"stabilityai/stable-diffusion-xl-base-1.0": (1024, 1024),
"stabilityai/stable-diffusion-xl-refiner-1.0": (1024, 1024),
"stabilityai/sdxl-turbo": (512, 512),
}
class BaseBenchmak:
pipeline_class = None
def __init__(self, args):
super().__init__()
def run_inference(self, args):
raise NotImplementedError
def benchmark(self, args):
raise NotImplementedError
def get_result_filepath(self, args):
pipeline_class_name = str(self.pipe.__class__.__name__)
name = (
args.ckpt.replace("/", "_")
+ "_"
+ pipeline_class_name
+ f"-bs@{args.batch_size}-steps@{args.num_inference_steps}-mco@{args.model_cpu_offload}-compile@{args.run_compile}.csv"
)
filepath = os.path.join(BASE_PATH, name)
return filepath
class TextToImageBenchmark(BaseBenchmak):
pipeline_class = AutoPipelineForText2Image
def __init__(self, args):
pipe = self.pipeline_class.from_pretrained(args.ckpt, torch_dtype=torch.float16)
pipe = pipe.to("cuda")
if args.run_compile:
if not isinstance(pipe, WuerstchenCombinedPipeline):
pipe.unet.to(memory_format=torch.channels_last)
print("Run torch compile")
pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
if hasattr(pipe, "movq") and getattr(pipe, "movq", None) is not None:
pipe.movq.to(memory_format=torch.channels_last)
pipe.movq = torch.compile(pipe.movq, mode="reduce-overhead", fullgraph=True)
else:
print("Run torch compile")
pipe.decoder = torch.compile(pipe.decoder, mode="reduce-overhead", fullgraph=True)
pipe.vqgan = torch.compile(pipe.vqgan, mode="reduce-overhead", fullgraph=True)
pipe.set_progress_bar_config(disable=True)
self.pipe = pipe
def run_inference(self, pipe, args):
_ = pipe(
prompt=PROMPT,
num_inference_steps=args.num_inference_steps,
num_images_per_prompt=args.batch_size,
)
def benchmark(self, args):
flush()
print(f"[INFO] {self.pipe.__class__.__name__}: Running benchmark with: {vars(args)}\n")
time = benchmark_fn(self.run_inference, self.pipe, args) # in seconds.
memory = bytes_to_giga_bytes(torch.cuda.max_memory_allocated()) # in GBs.
benchmark_info = BenchmarkInfo(time=time, memory=memory)
pipeline_class_name = str(self.pipe.__class__.__name__)
flush()
csv_dict = generate_csv_dict(
pipeline_cls=pipeline_class_name, ckpt=args.ckpt, args=args, benchmark_info=benchmark_info
)
filepath = self.get_result_filepath(args)
write_to_csv(filepath, csv_dict)
print(f"Logs written to: {filepath}")
flush()
class TurboTextToImageBenchmark(TextToImageBenchmark):
def __init__(self, args):
super().__init__(args)
def run_inference(self, pipe, args):
_ = pipe(
prompt=PROMPT,
num_inference_steps=args.num_inference_steps,
num_images_per_prompt=args.batch_size,
guidance_scale=0.0,
)
class LCMLoRATextToImageBenchmark(TextToImageBenchmark):
lora_id = "latent-consistency/lcm-lora-sdxl"
def __init__(self, args):
super().__init__(args)
self.pipe.load_lora_weights(self.lora_id)
self.pipe.fuse_lora()
self.pipe.scheduler = LCMScheduler.from_config(self.pipe.scheduler.config)
def get_result_filepath(self, args):
pipeline_class_name = str(self.pipe.__class__.__name__)
name = (
self.lora_id.replace("/", "_")
+ "_"
+ pipeline_class_name
+ f"-bs@{args.batch_size}-steps@{args.num_inference_steps}-mco@{args.model_cpu_offload}-compile@{args.run_compile}.csv"
)
filepath = os.path.join(BASE_PATH, name)
return filepath
def run_inference(self, pipe, args):
_ = pipe(
prompt=PROMPT,
num_inference_steps=args.num_inference_steps,
num_images_per_prompt=args.batch_size,
guidance_scale=1.0,
)
class ImageToImageBenchmark(TextToImageBenchmark):
pipeline_class = AutoPipelineForImage2Image
url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/benchmarking/1665_Girl_with_a_Pearl_Earring.jpg"
image = load_image(url).convert("RGB")
def __init__(self, args):
super().__init__(args)
self.image = self.image.resize(RESOLUTION_MAPPING[args.ckpt])
def run_inference(self, pipe, args):
_ = pipe(
prompt=PROMPT,
image=self.image,
num_inference_steps=args.num_inference_steps,
num_images_per_prompt=args.batch_size,
)
class TurboImageToImageBenchmark(ImageToImageBenchmark):
def __init__(self, args):
super().__init__(args)
def run_inference(self, pipe, args):
_ = pipe(
prompt=PROMPT,
image=self.image,
num_inference_steps=args.num_inference_steps,
num_images_per_prompt=args.batch_size,
guidance_scale=0.0,
strength=0.5,
)
class InpaintingBenchmark(ImageToImageBenchmark):
pipeline_class = AutoPipelineForInpainting
mask_url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/benchmarking/overture-creations-5sI6fQgYIuo_mask.png"
mask = load_image(mask_url).convert("RGB")
def __init__(self, args):
super().__init__(args)
self.image = self.image.resize(RESOLUTION_MAPPING[args.ckpt])
self.mask = self.mask.resize(RESOLUTION_MAPPING[args.ckpt])
def run_inference(self, pipe, args):
_ = pipe(
prompt=PROMPT,
image=self.image,
mask_image=self.mask,
num_inference_steps=args.num_inference_steps,
num_images_per_prompt=args.batch_size,
)
class ControlNetBenchmark(TextToImageBenchmark):
pipeline_class = StableDiffusionControlNetPipeline
aux_network_class = ControlNetModel
root_ckpt = "runwayml/stable-diffusion-v1-5"
url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/benchmarking/canny_image_condition.png"
image = load_image(url).convert("RGB")
def __init__(self, args):
aux_network = self.aux_network_class.from_pretrained(args.ckpt, torch_dtype=torch.float16)
pipe = self.pipeline_class.from_pretrained(self.root_ckpt, controlnet=aux_network, torch_dtype=torch.float16)
pipe = pipe.to("cuda")
pipe.set_progress_bar_config(disable=True)
self.pipe = pipe
if args.run_compile:
pipe.unet.to(memory_format=torch.channels_last)
pipe.controlnet.to(memory_format=torch.channels_last)
print("Run torch compile")
pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
pipe.controlnet = torch.compile(pipe.controlnet, mode="reduce-overhead", fullgraph=True)
self.image = self.image.resize(RESOLUTION_MAPPING[args.ckpt])
def run_inference(self, pipe, args):
_ = pipe(
prompt=PROMPT,
image=self.image,
num_inference_steps=args.num_inference_steps,
num_images_per_prompt=args.batch_size,
)
class ControlNetSDXLBenchmark(ControlNetBenchmark):
pipeline_class = StableDiffusionXLControlNetPipeline
root_ckpt = "stabilityai/stable-diffusion-xl-base-1.0"
def __init__(self, args):
super().__init__(args)
class T2IAdapterBenchmark(ControlNetBenchmark):
pipeline_class = StableDiffusionAdapterPipeline
aux_network_class = T2IAdapter
root_ckpt = "CompVis/stable-diffusion-v1-4"
url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/benchmarking/canny_for_adapter.png"
image = load_image(url).convert("L")
def __init__(self, args):
aux_network = self.aux_network_class.from_pretrained(args.ckpt, torch_dtype=torch.float16)
pipe = self.pipeline_class.from_pretrained(self.root_ckpt, adapter=aux_network, torch_dtype=torch.float16)
pipe = pipe.to("cuda")
pipe.set_progress_bar_config(disable=True)
self.pipe = pipe
if args.run_compile:
pipe.unet.to(memory_format=torch.channels_last)
pipe.adapter.to(memory_format=torch.channels_last)
print("Run torch compile")
pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
pipe.adapter = torch.compile(pipe.adapter, mode="reduce-overhead", fullgraph=True)
self.image = self.image.resize(RESOLUTION_MAPPING[args.ckpt])
class T2IAdapterSDXLBenchmark(T2IAdapterBenchmark):
pipeline_class = StableDiffusionXLAdapterPipeline
root_ckpt = "stabilityai/stable-diffusion-xl-base-1.0"
url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/benchmarking/canny_for_adapter_sdxl.png"
image = load_image(url)
def __init__(self, args):
super().__init__(args)

View File

@@ -1,26 +0,0 @@
import argparse
import sys
sys.path.append(".")
from base_classes import ControlNetBenchmark, ControlNetSDXLBenchmark # noqa: E402
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--ckpt",
type=str,
default="lllyasviel/sd-controlnet-canny",
choices=["lllyasviel/sd-controlnet-canny", "diffusers/controlnet-canny-sdxl-1.0"],
)
parser.add_argument("--batch_size", type=int, default=1)
parser.add_argument("--num_inference_steps", type=int, default=50)
parser.add_argument("--model_cpu_offload", action="store_true")
parser.add_argument("--run_compile", action="store_true")
args = parser.parse_args()
benchmark_pipe = (
ControlNetBenchmark(args) if args.ckpt == "lllyasviel/sd-controlnet-canny" else ControlNetSDXLBenchmark(args)
)
benchmark_pipe.benchmark(args)

View File

@@ -1,29 +0,0 @@
import argparse
import sys
sys.path.append(".")
from base_classes import ImageToImageBenchmark, TurboImageToImageBenchmark # noqa: E402
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--ckpt",
type=str,
default="runwayml/stable-diffusion-v1-5",
choices=[
"runwayml/stable-diffusion-v1-5",
"stabilityai/stable-diffusion-2-1",
"stabilityai/stable-diffusion-xl-refiner-1.0",
"stabilityai/sdxl-turbo",
],
)
parser.add_argument("--batch_size", type=int, default=1)
parser.add_argument("--num_inference_steps", type=int, default=50)
parser.add_argument("--model_cpu_offload", action="store_true")
parser.add_argument("--run_compile", action="store_true")
args = parser.parse_args()
benchmark_pipe = ImageToImageBenchmark(args) if "turbo" not in args.ckpt else TurboImageToImageBenchmark(args)
benchmark_pipe.benchmark(args)

View File

@@ -1,28 +0,0 @@
import argparse
import sys
sys.path.append(".")
from base_classes import InpaintingBenchmark # noqa: E402
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--ckpt",
type=str,
default="runwayml/stable-diffusion-v1-5",
choices=[
"runwayml/stable-diffusion-v1-5",
"stabilityai/stable-diffusion-2-1",
"stabilityai/stable-diffusion-xl-base-1.0",
],
)
parser.add_argument("--batch_size", type=int, default=1)
parser.add_argument("--num_inference_steps", type=int, default=50)
parser.add_argument("--model_cpu_offload", action="store_true")
parser.add_argument("--run_compile", action="store_true")
args = parser.parse_args()
benchmark_pipe = InpaintingBenchmark(args)
benchmark_pipe.benchmark(args)

View File

@@ -1,28 +0,0 @@
import argparse
import sys
sys.path.append(".")
from base_classes import T2IAdapterBenchmark, T2IAdapterSDXLBenchmark # noqa: E402
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--ckpt",
type=str,
default="TencentARC/t2iadapter_canny_sd14v1",
choices=["TencentARC/t2iadapter_canny_sd14v1", "TencentARC/t2i-adapter-canny-sdxl-1.0"],
)
parser.add_argument("--batch_size", type=int, default=1)
parser.add_argument("--num_inference_steps", type=int, default=50)
parser.add_argument("--model_cpu_offload", action="store_true")
parser.add_argument("--run_compile", action="store_true")
args = parser.parse_args()
benchmark_pipe = (
T2IAdapterBenchmark(args)
if args.ckpt == "TencentARC/t2iadapter_canny_sd14v1"
else T2IAdapterSDXLBenchmark(args)
)
benchmark_pipe.benchmark(args)

View File

@@ -1,23 +0,0 @@
import argparse
import sys
sys.path.append(".")
from base_classes import LCMLoRATextToImageBenchmark # noqa: E402
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--ckpt",
type=str,
default="stabilityai/stable-diffusion-xl-base-1.0",
)
parser.add_argument("--batch_size", type=int, default=1)
parser.add_argument("--num_inference_steps", type=int, default=4)
parser.add_argument("--model_cpu_offload", action="store_true")
parser.add_argument("--run_compile", action="store_true")
args = parser.parse_args()
benchmark_pipe = LCMLoRATextToImageBenchmark(args)
benchmark_pipe.benchmark(args)

View File

@@ -1,40 +0,0 @@
import argparse
import sys
sys.path.append(".")
from base_classes import TextToImageBenchmark, TurboTextToImageBenchmark # noqa: E402
ALL_T2I_CKPTS = [
"runwayml/stable-diffusion-v1-5",
"segmind/SSD-1B",
"stabilityai/stable-diffusion-xl-base-1.0",
"kandinsky-community/kandinsky-2-2-decoder",
"warp-ai/wuerstchen",
"stabilityai/sdxl-turbo",
]
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--ckpt",
type=str,
default="runwayml/stable-diffusion-v1-5",
choices=ALL_T2I_CKPTS,
)
parser.add_argument("--batch_size", type=int, default=1)
parser.add_argument("--num_inference_steps", type=int, default=50)
parser.add_argument("--model_cpu_offload", action="store_true")
parser.add_argument("--run_compile", action="store_true")
args = parser.parse_args()
benchmark_cls = None
if "turbo" in args.ckpt:
benchmark_cls = TurboTextToImageBenchmark
else:
benchmark_cls = TextToImageBenchmark
benchmark_pipe = benchmark_cls(args)
benchmark_pipe.benchmark(args)

View File

@@ -1,72 +0,0 @@
import glob
import sys
import pandas as pd
from huggingface_hub import hf_hub_download, upload_file
from huggingface_hub.utils._errors import EntryNotFoundError
sys.path.append(".")
from utils import BASE_PATH, FINAL_CSV_FILE, GITHUB_SHA, REPO_ID, collate_csv # noqa: E402
def has_previous_benchmark() -> str:
csv_path = None
try:
csv_path = hf_hub_download(repo_id=REPO_ID, repo_type="dataset", filename=FINAL_CSV_FILE)
except EntryNotFoundError:
csv_path = None
return csv_path
def filter_float(value):
if isinstance(value, str):
return float(value.split()[0])
return value
def push_to_hf_dataset():
all_csvs = sorted(glob.glob(f"{BASE_PATH}/*.csv"))
collate_csv(all_csvs, FINAL_CSV_FILE)
# If there's an existing benchmark file, we should report the changes.
csv_path = has_previous_benchmark()
if csv_path is not None:
current_results = pd.read_csv(FINAL_CSV_FILE)
previous_results = pd.read_csv(csv_path)
numeric_columns = current_results.select_dtypes(include=["float64", "int64"]).columns
numeric_columns = [
c for c in numeric_columns if c not in ["batch_size", "num_inference_steps", "actual_gpu_memory (gbs)"]
]
for column in numeric_columns:
previous_results[column] = previous_results[column].map(lambda x: filter_float(x))
# Calculate the percentage change
current_results[column] = current_results[column].astype(float)
previous_results[column] = previous_results[column].astype(float)
percent_change = ((current_results[column] - previous_results[column]) / previous_results[column]) * 100
# Format the values with '+' or '-' sign and append to original values
current_results[column] = current_results[column].map(str) + percent_change.map(
lambda x: f" ({'+' if x > 0 else ''}{x:.2f}%)"
)
# There might be newly added rows. So, filter out the NaNs.
current_results[column] = current_results[column].map(lambda x: x.replace(" (nan%)", ""))
# Overwrite the current result file.
current_results.to_csv(FINAL_CSV_FILE, index=False)
commit_message = f"upload from sha: {GITHUB_SHA}" if GITHUB_SHA is not None else "upload benchmark results"
upload_file(
repo_id=REPO_ID,
path_in_repo=FINAL_CSV_FILE,
path_or_fileobj=FINAL_CSV_FILE,
repo_type="dataset",
commit_message=commit_message,
)
if __name__ == "__main__":
push_to_hf_dataset()

View File

@@ -1,97 +0,0 @@
import glob
import subprocess
import sys
from typing import List
sys.path.append(".")
from benchmark_text_to_image import ALL_T2I_CKPTS # noqa: E402
PATTERN = "benchmark_*.py"
class SubprocessCallException(Exception):
pass
# Taken from `test_examples_utils.py`
def run_command(command: List[str], return_stdout=False):
"""
Runs `command` with `subprocess.check_output` and will potentially return the `stdout`. Will also properly capture
if an error occurred while running `command`
"""
try:
output = subprocess.check_output(command, stderr=subprocess.STDOUT)
if return_stdout:
if hasattr(output, "decode"):
output = output.decode("utf-8")
return output
except subprocess.CalledProcessError as e:
raise SubprocessCallException(
f"Command `{' '.join(command)}` failed with the following error:\n\n{e.output.decode()}"
) from e
def main():
python_files = glob.glob(PATTERN)
for file in python_files:
print(f"****** Running file: {file} ******")
# Run with canonical settings.
if file != "benchmark_text_to_image.py":
command = f"python {file}"
run_command(command.split())
command += " --run_compile"
run_command(command.split())
# Run variants.
for file in python_files:
if file == "benchmark_text_to_image.py":
for ckpt in ALL_T2I_CKPTS:
command = f"python {file} --ckpt {ckpt}"
if "turbo" in ckpt:
command += " --num_inference_steps 1"
run_command(command.split())
command += " --run_compile"
run_command(command.split())
elif file == "benchmark_sd_img.py":
for ckpt in ["stabilityai/stable-diffusion-xl-refiner-1.0", "stabilityai/sdxl-turbo"]:
command = f"python {file} --ckpt {ckpt}"
if ckpt == "stabilityai/sdxl-turbo":
command += " --num_inference_steps 2"
run_command(command.split())
command += " --run_compile"
run_command(command.split())
elif file == "benchmark_sd_inpainting.py":
sdxl_ckpt = "stabilityai/stable-diffusion-xl-base-1.0"
command = f"python {file} --ckpt {sdxl_ckpt}"
run_command(command.split())
command += " --run_compile"
run_command(command.split())
elif file in ["benchmark_controlnet.py", "benchmark_t2i_adapter.py"]:
sdxl_ckpt = (
"diffusers/controlnet-canny-sdxl-1.0"
if "controlnet" in file
else "TencentARC/t2i-adapter-canny-sdxl-1.0"
)
command = f"python {file} --ckpt {sdxl_ckpt}"
run_command(command.split())
command += " --run_compile"
run_command(command.split())
if __name__ == "__main__":
main()

View File

@@ -1,98 +0,0 @@
import argparse
import csv
import gc
import os
from dataclasses import dataclass
from typing import Dict, List, Union
import torch
import torch.utils.benchmark as benchmark
GITHUB_SHA = os.getenv("GITHUB_SHA", None)
BENCHMARK_FIELDS = [
"pipeline_cls",
"ckpt_id",
"batch_size",
"num_inference_steps",
"model_cpu_offload",
"run_compile",
"time (secs)",
"memory (gbs)",
"actual_gpu_memory (gbs)",
"github_sha",
]
PROMPT = "ghibli style, a fantasy landscape with castles"
BASE_PATH = os.getenv("BASE_PATH", ".")
TOTAL_GPU_MEMORY = float(os.getenv("TOTAL_GPU_MEMORY", torch.cuda.get_device_properties(0).total_memory / (1024**3)))
REPO_ID = "diffusers/benchmarks"
FINAL_CSV_FILE = "collated_results.csv"
@dataclass
class BenchmarkInfo:
time: float
memory: float
def flush():
"""Wipes off memory."""
gc.collect()
torch.cuda.empty_cache()
torch.cuda.reset_max_memory_allocated()
torch.cuda.reset_peak_memory_stats()
def bytes_to_giga_bytes(bytes):
return f"{(bytes / 1024 / 1024 / 1024):.3f}"
def benchmark_fn(f, *args, **kwargs):
t0 = benchmark.Timer(
stmt="f(*args, **kwargs)",
globals={"args": args, "kwargs": kwargs, "f": f},
num_threads=torch.get_num_threads(),
)
return f"{(t0.blocked_autorange().mean):.3f}"
def generate_csv_dict(
pipeline_cls: str, ckpt: str, args: argparse.Namespace, benchmark_info: BenchmarkInfo
) -> Dict[str, Union[str, bool, float]]:
"""Packs benchmarking data into a dictionary for latter serialization."""
data_dict = {
"pipeline_cls": pipeline_cls,
"ckpt_id": ckpt,
"batch_size": args.batch_size,
"num_inference_steps": args.num_inference_steps,
"model_cpu_offload": args.model_cpu_offload,
"run_compile": args.run_compile,
"time (secs)": benchmark_info.time,
"memory (gbs)": benchmark_info.memory,
"actual_gpu_memory (gbs)": f"{(TOTAL_GPU_MEMORY):.3f}",
"github_sha": GITHUB_SHA,
}
return data_dict
def write_to_csv(file_name: str, data_dict: Dict[str, Union[str, bool, float]]):
"""Serializes a dictionary into a CSV file."""
with open(file_name, mode="w", newline="") as csvfile:
writer = csv.DictWriter(csvfile, fieldnames=BENCHMARK_FIELDS)
writer.writeheader()
writer.writerow(data_dict)
def collate_csv(input_files: List[str], output_file: str):
"""Collates multiple identically structured CSVs into a single CSV file."""
with open(output_file, mode="w", newline="") as outfile:
writer = csv.DictWriter(outfile, fieldnames=BENCHMARK_FIELDS)
writer.writeheader()
for file in input_files:
with open(file, mode="r") as infile:
reader = csv.DictReader(infile)
for row in reader:
writer.writerow(row)

View File

@@ -18,7 +18,8 @@ limitations under the License.
Diffusers examples are a collection of scripts to demonstrate how to effectively use the `diffusers` library
for a variety of use cases involving training or fine-tuning.
**Note**: If you are looking for **official** examples on how to use `diffusers` for inference, please have a look at [src/diffusers/pipelines](https://github.com/huggingface/diffusers/tree/main/src/diffusers/pipelines).
**Note**: If you are looking for **official** examples on how to use `diffusers` for inference,
please have a look at [src/diffusers/pipelines](https://github.com/huggingface/diffusers/tree/main/src/diffusers/pipelines).
Our examples aspire to be **self-contained**, **easy-to-tweak**, **beginner-friendly** and for **one-purpose-only**.
More specifically, this means:
@@ -26,7 +27,8 @@ More specifically, this means:
- **Self-contained**: An example script shall only depend on "pip-install-able" Python packages that can be found in a `requirements.txt` file. Example scripts shall **not** depend on any local files. This means that one can simply download an example script, *e.g.* [train_unconditional.py](https://github.com/huggingface/diffusers/blob/main/examples/unconditional_image_generation/train_unconditional.py), install the required dependencies, *e.g.* [requirements.txt](https://github.com/huggingface/diffusers/blob/main/examples/unconditional_image_generation/requirements.txt) and execute the example script.
- **Easy-to-tweak**: While we strive to present as many use cases as possible, the example scripts are just that - examples. It is expected that they won't work out-of-the box on your specific problem and that you will be required to change a few lines of code to adapt them to your needs. To help you with that, most of the examples fully expose the preprocessing of the data and the training loop to allow you to tweak and edit them as required.
- **Beginner-friendly**: We do not aim for providing state-of-the-art training scripts for the newest models, but rather examples that can be used as a way to better understand diffusion models and how to use them with the `diffusers` library. We often purposefully leave out certain state-of-the-art methods if we consider them too complex for beginners.
- **One-purpose-only**: Examples should show one task and one task only. Even if a task is from a modeling point of view very similar, *e.g.* image super-resolution and image modification tend to use the same model and training method, we want examples to showcase only one task to keep them as readable and easy-to-understand as possible.
- **One-purpose-only**: Examples should show one task and one task only. Even if a task is from a modeling
point of view very similar, *e.g.* image super-resolution and image modification tend to use the same model and training method, we want examples to showcase only one task to keep them as readable and easy-to-understand as possible.
We provide **official** examples that cover the most popular tasks of diffusion models.
*Official* examples are **actively** maintained by the `diffusers` maintainers and we try to rigorously follow our example philosophy as defined above.

View File

@@ -135,8 +135,8 @@ from safetensors.torch import load_file
"""
diffusers_example_pivotal = f"""embedding_path = hf_hub_download(repo_id='{repo_id}', filename="embeddings.safetensors", repo_type="model")
state_dict = load_file(embedding_path)
pipeline.load_textual_inversion(state_dict["clip_l"], token=["<s0>", "<s1>"], text_encoder=pipeline.text_encoder, tokenizer=pipeline.tokenizer)
pipeline.load_textual_inversion(state_dict["clip_g"], token=["<s0>", "<s1>"], text_encoder=pipeline.text_encoder_2, tokenizer=pipeline.tokenizer_2)
pipeline.load_textual_inversion(state_dict["clip_l"], token=["<s0>", "<s1>"], text_encoder=pipe.text_encoder, tokenizer=pipe.tokenizer)
pipeline.load_textual_inversion(state_dict["clip_g"], token=["<s0>", "<s1>"], text_encoder=pipe.text_encoder_2, tokenizer=pipe.tokenizer_2)
"""
if token_abstraction_dict:
for key, value in token_abstraction_dict.items():
@@ -157,8 +157,6 @@ tags:
base_model: {base_model}
instance_prompt: {instance_prompt}
license: openrail++
widget:
- text: '{validation_prompt if validation_prompt else instance_prompt}'
---
"""

View File

@@ -48,7 +48,6 @@ prompt-to-prompt | change parts of a prompt and retain image structure (see [pap
| Latent Consistency Pipeline | Implementation of [Latent Consistency Models: Synthesizing High-Resolution Images with Few-Step Inference](https://arxiv.org/abs/2310.04378) | [Latent Consistency Pipeline](#latent-consistency-pipeline) | - | [Simian Luo](https://github.com/luosiallen) |
| Latent Consistency Img2img Pipeline | Img2img pipeline for Latent Consistency Models | [Latent Consistency Img2Img Pipeline](#latent-consistency-img2img-pipeline) | - | [Logan Zoellner](https://github.com/nagolinc) |
| Latent Consistency Interpolation Pipeline | Interpolate the latent space of Latent Consistency Models with multiple prompts | [Latent Consistency Interpolation Pipeline](#latent-consistency-interpolation-pipeline) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1pK3NrLWJSiJsBynLns1K1-IDTW9zbPvl?usp=sharing) | [Aryan V S](https://github.com/a-r-r-o-w) |
| SDE Drag Pipeline | The pipeline supports drag editing of images using stochastic differential equations | [SDE Drag Pipeline](#sde-drag-pipeline) | - | [NieShen](https://github.com/NieShenRuc) [Fengqi Zhu](https://github.com/Monohydroxides) |
| Regional Prompting Pipeline | Assign multiple prompts for different regions | [Regional Prompting Pipeline](#regional-prompting-pipeline) | - | [hako-mikan](https://github.com/hako-mikan) |
| LDM3D-sr (LDM3D upscaler) | Upscale low resolution RGB and depth inputs to high resolution | [StableDiffusionUpscaleLDM3D Pipeline](https://github.com/estelleafl/diffusers/tree/ldm3d_upscaler_community/examples/community#stablediffusionupscaleldm3d-pipeline) | - | [Estelle Aflalo](https://github.com/estelleafl) |
| AnimateDiff ControlNet Pipeline | Combines AnimateDiff with precise motion control using ControlNets | [AnimateDiff ControlNet Pipeline](#animatediff-controlnet-pipeline) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1SKboYeGjEQmQPWoFC0aLYpBlYdHXkvAu?usp=sharing) | [Aryan V S](https://github.com/a-r-r-o-w) and [Edoardo Botta](https://github.com/EdoardoBotta) |
@@ -2931,7 +2930,7 @@ The original repo can be found at [repo](https://github.com/PRIS-CV/DemoFusion).
- `show_image` (`bool`, defaults to False):
Determine whether to show intermediate results during generation.
```py
```
from diffusers import DiffusionPipeline
pipe = DiffusionPipeline.from_pretrained(
@@ -2963,7 +2962,7 @@ images = pipe(
)
```
You can display and save the generated images as:
```py
```
def image_grid(imgs, save_path=None):
w = 0
@@ -2987,42 +2986,3 @@ def image_grid(imgs, save_path=None):
image_grid(images, save_path="./outputs/")
```
![output_example](https://github.com/PRIS-CV/DemoFusion/blob/main/output_example.png)
### SDE Drag pipeline
This pipeline provides drag-and-drop image editing using stochastic differential equations. It enables image editing by inputting prompt, image, mask_image, source_points, and target_points.
![SDE Drag Image](https://github.com/huggingface/diffusers/assets/75928535/bd54f52f-f002-4951-9934-b2a4592771a5)
See [paper](https://arxiv.org/abs/2311.01410), [paper page](https://ml-gsai.github.io/SDE-Drag-demo/), [original repo](https://github.com/ML-GSAI/SDE-Drag) for more infomation.
```py
import PIL
import torch
from diffusers import DDIMScheduler, DiffusionPipeline
# Load the pipeline
model_path = "runwayml/stable-diffusion-v1-5"
scheduler = DDIMScheduler.from_pretrained(model_path, subfolder="scheduler")
pipe = DiffusionPipeline.from_pretrained(model_path, scheduler=scheduler, custom_pipeline="sde_drag")
pipe.to('cuda')
# To save GPU memory, torch.float16 can be used, but it may compromise image quality.
# If not training LoRA, please avoid using torch.float16
# pipe.to(torch.float16)
# Provide prompt, image, mask image, and the starting and target points for drag editing.
prompt = "prompt of the image"
image = PIL.Image.open('/path/to/image')
mask_image = PIL.Image.open('/path/to/mask_image')
source_points = [[123, 456]]
target_points = [[234, 567]]
# train_lora is optional, and in most cases, using train_lora can better preserve consistency with the original image.
pipe.train_lora(prompt, image)
output = pipe(prompt, image, mask_image, source_points, target_points)
output_image = PIL.Image.fromarray(output)
output_image.save("./output.png")
```

View File

@@ -1,594 +0,0 @@
import math
import tempfile
from typing import List, Optional
import numpy as np
import PIL.Image
import torch
from accelerate import Accelerator
from torchvision import transforms
from tqdm.auto import tqdm
from transformers import CLIPTextModel, CLIPTokenizer
from diffusers import AutoencoderKL, DiffusionPipeline, DPMSolverMultistepScheduler, UNet2DConditionModel
from diffusers.loaders import AttnProcsLayers, LoraLoaderMixin
from diffusers.models.attention_processor import (
AttnAddedKVProcessor,
AttnAddedKVProcessor2_0,
LoRAAttnAddedKVProcessor,
LoRAAttnProcessor,
LoRAAttnProcessor2_0,
SlicedAttnAddedKVProcessor,
)
from diffusers.optimization import get_scheduler
class SdeDragPipeline(DiffusionPipeline):
r"""
Pipeline for image drag-and-drop editing using stochastic differential equations: https://arxiv.org/abs/2311.01410.
Please refer to the [official repository](https://github.com/ML-GSAI/SDE-Drag) for more information.
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
Args:
vae ([`AutoencoderKL`]):
Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
text_encoder ([`CLIPTextModel`]):
Frozen text-encoder. Stable Diffusion uses the text portion of
[CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
tokenizer (`CLIPTokenizer`):
Tokenizer of class
[CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
scheduler ([`SchedulerMixin`]):
A scheduler to be used in combination with `unet` to denoise the encoded image latents. Please use
[`DDIMScheduler`].
"""
def __init__(
self,
vae: AutoencoderKL,
text_encoder: CLIPTextModel,
tokenizer: CLIPTokenizer,
unet: UNet2DConditionModel,
scheduler: DPMSolverMultistepScheduler,
):
super().__init__()
self.register_modules(vae=vae, text_encoder=text_encoder, tokenizer=tokenizer, unet=unet, scheduler=scheduler)
@torch.no_grad()
def __call__(
self,
prompt: str,
image: PIL.Image.Image,
mask_image: PIL.Image.Image,
source_points: List[List[int]],
target_points: List[List[int]],
t0: Optional[float] = 0.6,
steps: Optional[int] = 200,
step_size: Optional[int] = 2,
image_scale: Optional[float] = 0.3,
adapt_radius: Optional[int] = 5,
min_lora_scale: Optional[float] = 0.5,
generator: Optional[torch.Generator] = None,
):
r"""
Function invoked when calling the pipeline for image editing.
Args:
prompt (`str`, *required*):
The prompt to guide the image editing.
image (`PIL.Image.Image`, *required*):
Which will be edited, parts of the image will be masked out with `mask_image` and edited
according to `prompt`.
mask_image (`PIL.Image.Image`, *required*):
To mask `image`. White pixels in the mask will be edited, while black pixels will be preserved.
source_points (`List[List[int]]`, *required*):
Used to mark the starting positions of drag editing in the image, with each pixel represented as a
`List[int]` of length 2.
target_points (`List[List[int]]`, *required*):
Used to mark the target positions of drag editing in the image, with each pixel represented as a
`List[int]` of length 2.
t0 (`float`, *optional*, defaults to 0.6):
The time parameter. Higher t0 improves the fidelity while lowering the faithfulness of the edited images
and vice versa.
steps (`int`, *optional*, defaults to 200):
The number of sampling iterations.
step_size (`int`, *optional*, defaults to 2):
The drag diatance of each drag step.
image_scale (`float`, *optional*, defaults to 0.3):
To avoid duplicating the content, use image_scale to perturbs the source.
adapt_radius (`int`, *optional*, defaults to 5):
The size of the region for copy and paste operations during each step of the drag process.
min_lora_scale (`float`, *optional*, defaults to 0.5):
A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
min_lora_scale specifies the minimum LoRA scale during the image drag-editing process.
generator ('torch.Generator', *optional*, defaults to None):
To make generation deterministic(https://pytorch.org/docs/stable/generated/torch.Generator.html).
Examples:
```py
>>> import PIL
>>> import torch
>>> from diffusers import DDIMScheduler, DiffusionPipeline
>>> # Load the pipeline
>>> model_path = "runwayml/stable-diffusion-v1-5"
>>> scheduler = DDIMScheduler.from_pretrained(model_path, subfolder="scheduler")
>>> pipe = DiffusionPipeline.from_pretrained(model_path, scheduler=scheduler, custom_pipeline="sde_drag")
>>> pipe.to('cuda')
>>> # To save GPU memory, torch.float16 can be used, but it may compromise image quality.
>>> # If not training LoRA, please avoid using torch.float16
>>> # pipe.to(torch.float16)
>>> # Provide prompt, image, mask image, and the starting and target points for drag editing.
>>> prompt = "prompt of the image"
>>> image = PIL.Image.open('/path/to/image')
>>> mask_image = PIL.Image.open('/path/to/mask_image')
>>> source_points = [[123, 456]]
>>> target_points = [[234, 567]]
>>> # train_lora is optional, and in most cases, using train_lora can better preserve consistency with the original image.
>>> pipe.train_lora(prompt, image)
>>> output = pipe(prompt, image, mask_image, source_points, target_points)
>>> output_image = PIL.Image.fromarray(output)
>>> output_image.save("./output.png")
```
"""
self.scheduler.set_timesteps(steps)
noise_scale = (1 - image_scale**2) ** (0.5)
text_embeddings = self._get_text_embed(prompt)
uncond_embeddings = self._get_text_embed([""])
text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
latent = self._get_img_latent(image)
mask = mask_image.resize((latent.shape[3], latent.shape[2]))
mask = torch.tensor(np.array(mask))
mask = mask.unsqueeze(0).expand_as(latent).to(self.device)
source_points = torch.tensor(source_points).div(torch.tensor([8]), rounding_mode="trunc")
target_points = torch.tensor(target_points).div(torch.tensor([8]), rounding_mode="trunc")
distance = target_points - source_points
distance_norm_max = torch.norm(distance.float(), dim=1, keepdim=True).max()
if distance_norm_max <= step_size:
drag_num = 1
else:
drag_num = distance_norm_max.div(torch.tensor([step_size]), rounding_mode="trunc")
if (distance_norm_max / drag_num - step_size).abs() > (
distance_norm_max / (drag_num + 1) - step_size
).abs():
drag_num += 1
latents = []
for i in tqdm(range(int(drag_num)), desc="SDE Drag"):
source_new = source_points + (i / drag_num * distance).to(torch.int)
target_new = source_points + ((i + 1) / drag_num * distance).to(torch.int)
latent, noises, hook_latents, lora_scales, cfg_scales = self._forward(
latent, steps, t0, min_lora_scale, text_embeddings, generator
)
latent = self._copy_and_paste(
latent,
source_new,
target_new,
adapt_radius,
latent.shape[2] - 1,
latent.shape[3] - 1,
image_scale,
noise_scale,
generator,
)
latent = self._backward(
latent, mask, steps, t0, noises, hook_latents, lora_scales, cfg_scales, text_embeddings, generator
)
latents.append(latent)
result_image = 1 / 0.18215 * latents[-1]
with torch.no_grad():
result_image = self.vae.decode(result_image).sample
result_image = (result_image / 2 + 0.5).clamp(0, 1)
result_image = result_image.cpu().permute(0, 2, 3, 1).numpy()[0]
result_image = (result_image * 255).astype(np.uint8)
return result_image
def train_lora(self, prompt, image, lora_step=100, lora_rank=16, generator=None):
accelerator = Accelerator(gradient_accumulation_steps=1, mixed_precision="fp16")
self.vae.requires_grad_(False)
self.text_encoder.requires_grad_(False)
self.unet.requires_grad_(False)
unet_lora_attn_procs = {}
for name, attn_processor in self.unet.attn_processors.items():
cross_attention_dim = None if name.endswith("attn1.processor") else self.unet.config.cross_attention_dim
if name.startswith("mid_block"):
hidden_size = self.unet.config.block_out_channels[-1]
elif name.startswith("up_blocks"):
block_id = int(name[len("up_blocks.")])
hidden_size = list(reversed(self.unet.config.block_out_channels))[block_id]
elif name.startswith("down_blocks"):
block_id = int(name[len("down_blocks.")])
hidden_size = self.unet.config.block_out_channels[block_id]
else:
raise NotImplementedError("name must start with up_blocks, mid_blocks, or down_blocks")
if isinstance(attn_processor, (AttnAddedKVProcessor, SlicedAttnAddedKVProcessor, AttnAddedKVProcessor2_0)):
lora_attn_processor_class = LoRAAttnAddedKVProcessor
else:
lora_attn_processor_class = (
LoRAAttnProcessor2_0
if hasattr(torch.nn.functional, "scaled_dot_product_attention")
else LoRAAttnProcessor
)
unet_lora_attn_procs[name] = lora_attn_processor_class(
hidden_size=hidden_size, cross_attention_dim=cross_attention_dim, rank=lora_rank
)
self.unet.set_attn_processor(unet_lora_attn_procs)
unet_lora_layers = AttnProcsLayers(self.unet.attn_processors)
params_to_optimize = unet_lora_layers.parameters()
optimizer = torch.optim.AdamW(
params_to_optimize,
lr=2e-4,
betas=(0.9, 0.999),
weight_decay=1e-2,
eps=1e-08,
)
lr_scheduler = get_scheduler(
"constant",
optimizer=optimizer,
num_warmup_steps=0,
num_training_steps=lora_step,
num_cycles=1,
power=1.0,
)
unet_lora_layers = accelerator.prepare_model(unet_lora_layers)
optimizer = accelerator.prepare_optimizer(optimizer)
lr_scheduler = accelerator.prepare_scheduler(lr_scheduler)
with torch.no_grad():
text_inputs = self._tokenize_prompt(prompt, tokenizer_max_length=None)
text_embedding = self._encode_prompt(
text_inputs.input_ids, text_inputs.attention_mask, text_encoder_use_attention_mask=False
)
image_transforms = transforms.Compose(
[
transforms.ToTensor(),
transforms.Normalize([0.5], [0.5]),
]
)
image = image_transforms(image).to(self.device, dtype=self.vae.dtype)
image = image.unsqueeze(dim=0)
latents_dist = self.vae.encode(image).latent_dist
for _ in tqdm(range(lora_step), desc="Train LoRA"):
self.unet.train()
model_input = latents_dist.sample() * self.vae.config.scaling_factor
# Sample noise that we'll add to the latents
noise = torch.randn(
model_input.size(),
dtype=model_input.dtype,
layout=model_input.layout,
device=model_input.device,
generator=generator,
)
bsz, channels, height, width = model_input.shape
# Sample a random timestep for each image
timesteps = torch.randint(
0, self.scheduler.config.num_train_timesteps, (bsz,), device=model_input.device, generator=generator
)
timesteps = timesteps.long()
# Add noise to the model input according to the noise magnitude at each timestep
# (this is the forward diffusion process)
noisy_model_input = self.scheduler.add_noise(model_input, noise, timesteps)
# Predict the noise residual
model_pred = self.unet(noisy_model_input, timesteps, text_embedding).sample
# Get the target for loss depending on the prediction type
if self.scheduler.config.prediction_type == "epsilon":
target = noise
elif self.scheduler.config.prediction_type == "v_prediction":
target = self.scheduler.get_velocity(model_input, noise, timesteps)
else:
raise ValueError(f"Unknown prediction type {self.scheduler.config.prediction_type}")
loss = torch.nn.functional.mse_loss(model_pred.float(), target.float(), reduction="mean")
accelerator.backward(loss)
optimizer.step()
lr_scheduler.step()
optimizer.zero_grad()
with tempfile.TemporaryDirectory() as save_lora_dir:
LoraLoaderMixin.save_lora_weights(
save_directory=save_lora_dir,
unet_lora_layers=unet_lora_layers,
text_encoder_lora_layers=None,
)
self.unet.load_attn_procs(save_lora_dir)
def _tokenize_prompt(self, prompt, tokenizer_max_length=None):
if tokenizer_max_length is not None:
max_length = tokenizer_max_length
else:
max_length = self.tokenizer.model_max_length
text_inputs = self.tokenizer(
prompt,
truncation=True,
padding="max_length",
max_length=max_length,
return_tensors="pt",
)
return text_inputs
def _encode_prompt(self, input_ids, attention_mask, text_encoder_use_attention_mask=False):
text_input_ids = input_ids.to(self.device)
if text_encoder_use_attention_mask:
attention_mask = attention_mask.to(self.device)
else:
attention_mask = None
prompt_embeds = self.text_encoder(
text_input_ids,
attention_mask=attention_mask,
)
prompt_embeds = prompt_embeds[0]
return prompt_embeds
@torch.no_grad()
def _get_text_embed(self, prompt):
text_input = self.tokenizer(
prompt,
padding="max_length",
max_length=self.tokenizer.model_max_length,
truncation=True,
return_tensors="pt",
)
text_embeddings = self.text_encoder(text_input.input_ids.to(self.device))[0]
return text_embeddings
def _copy_and_paste(
self, latent, source_new, target_new, adapt_radius, max_height, max_width, image_scale, noise_scale, generator
):
def adaption_r(source, target, adapt_radius, max_height, max_width):
r_x_lower = min(adapt_radius, source[0], target[0])
r_x_upper = min(adapt_radius, max_width - source[0], max_width - target[0])
r_y_lower = min(adapt_radius, source[1], target[1])
r_y_upper = min(adapt_radius, max_height - source[1], max_height - target[1])
return r_x_lower, r_x_upper, r_y_lower, r_y_upper
for source_, target_ in zip(source_new, target_new):
r_x_lower, r_x_upper, r_y_lower, r_y_upper = adaption_r(
source_, target_, adapt_radius, max_height, max_width
)
source_feature = latent[
:, :, source_[1] - r_y_lower : source_[1] + r_y_upper, source_[0] - r_x_lower : source_[0] + r_x_upper
].clone()
latent[
:, :, source_[1] - r_y_lower : source_[1] + r_y_upper, source_[0] - r_x_lower : source_[0] + r_x_upper
] = image_scale * source_feature + noise_scale * torch.randn(
latent.shape[0],
4,
r_y_lower + r_y_upper,
r_x_lower + r_x_upper,
device=self.device,
generator=generator,
)
latent[
:, :, target_[1] - r_y_lower : target_[1] + r_y_upper, target_[0] - r_x_lower : target_[0] + r_x_upper
] = source_feature * 1.1
return latent
@torch.no_grad()
def _get_img_latent(self, image, height=None, weight=None):
data = image.convert("RGB")
if height is not None:
data = data.resize((weight, height))
transform = transforms.ToTensor()
data = transform(data).unsqueeze(0)
data = (data * 2.0) - 1.0
data = data.to(self.device, dtype=self.vae.dtype)
latent = self.vae.encode(data).latent_dist.sample()
latent = 0.18215 * latent
return latent
@torch.no_grad()
def _get_eps(self, latent, timestep, guidance_scale, text_embeddings, lora_scale=None):
latent_model_input = torch.cat([latent] * 2) if guidance_scale > 1.0 else latent
text_embeddings = text_embeddings if guidance_scale > 1.0 else text_embeddings.chunk(2)[1]
cross_attention_kwargs = None if lora_scale is None else {"scale": lora_scale}
with torch.no_grad():
noise_pred = self.unet(
latent_model_input,
timestep,
encoder_hidden_states=text_embeddings,
cross_attention_kwargs=cross_attention_kwargs,
).sample
if guidance_scale > 1.0:
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
elif guidance_scale == 1.0:
noise_pred_text = noise_pred
noise_pred_uncond = 0.0
else:
raise NotImplementedError(guidance_scale)
noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
return noise_pred
def _forward_sde(
self, timestep, sample, guidance_scale, text_embeddings, steps, eta=1.0, lora_scale=None, generator=None
):
num_train_timesteps = len(self.scheduler)
alphas_cumprod = self.scheduler.alphas_cumprod
initial_alpha_cumprod = torch.tensor(1.0)
prev_timestep = timestep + num_train_timesteps // steps
alpha_prod_t = alphas_cumprod[timestep] if timestep >= 0 else initial_alpha_cumprod
alpha_prod_t_prev = alphas_cumprod[prev_timestep]
beta_prod_t_prev = 1 - alpha_prod_t_prev
x_prev = (alpha_prod_t_prev / alpha_prod_t) ** (0.5) * sample + (1 - alpha_prod_t_prev / alpha_prod_t) ** (
0.5
) * torch.randn(
sample.size(), dtype=sample.dtype, layout=sample.layout, device=self.device, generator=generator
)
eps = self._get_eps(x_prev, prev_timestep, guidance_scale, text_embeddings, lora_scale)
sigma_t_prev = (
eta
* (1 - alpha_prod_t) ** (0.5)
* (1 - alpha_prod_t_prev / (1 - alpha_prod_t_prev) * (1 - alpha_prod_t) / alpha_prod_t) ** (0.5)
)
pred_original_sample = (x_prev - beta_prod_t_prev ** (0.5) * eps) / alpha_prod_t_prev ** (0.5)
pred_sample_direction_coeff = (1 - alpha_prod_t - sigma_t_prev**2) ** (0.5)
noise = (
sample - alpha_prod_t ** (0.5) * pred_original_sample - pred_sample_direction_coeff * eps
) / sigma_t_prev
return x_prev, noise
def _sample(
self,
timestep,
sample,
guidance_scale,
text_embeddings,
steps,
sde=False,
noise=None,
eta=1.0,
lora_scale=None,
generator=None,
):
num_train_timesteps = len(self.scheduler)
alphas_cumprod = self.scheduler.alphas_cumprod
final_alpha_cumprod = torch.tensor(1.0)
eps = self._get_eps(sample, timestep, guidance_scale, text_embeddings, lora_scale)
prev_timestep = timestep - num_train_timesteps // steps
alpha_prod_t = alphas_cumprod[timestep]
alpha_prod_t_prev = alphas_cumprod[prev_timestep] if prev_timestep >= 0 else final_alpha_cumprod
beta_prod_t = 1 - alpha_prod_t
sigma_t = (
eta
* ((1 - alpha_prod_t_prev) / (1 - alpha_prod_t)) ** (0.5)
* (1 - alpha_prod_t / alpha_prod_t_prev) ** (0.5)
if sde
else 0
)
pred_original_sample = (sample - beta_prod_t ** (0.5) * eps) / alpha_prod_t ** (0.5)
pred_sample_direction_coeff = (1 - alpha_prod_t_prev - sigma_t**2) ** (0.5)
noise = (
torch.randn(
sample.size(), dtype=sample.dtype, layout=sample.layout, device=self.device, generator=generator
)
if noise is None
else noise
)
latent = (
alpha_prod_t_prev ** (0.5) * pred_original_sample + pred_sample_direction_coeff * eps + sigma_t * noise
)
return latent
def _forward(self, latent, steps, t0, lora_scale_min, text_embeddings, generator):
def scale_schedule(begin, end, n, length, type="linear"):
if type == "constant":
return end
elif type == "linear":
return begin + (end - begin) * n / length
elif type == "cos":
factor = (1 - math.cos(n * math.pi / length)) / 2
return (1 - factor) * begin + factor * end
else:
raise NotImplementedError(type)
noises = []
latents = []
lora_scales = []
cfg_scales = []
latents.append(latent)
t0 = int(t0 * steps)
t_begin = steps - t0
length = len(self.scheduler.timesteps[t_begin - 1 : -1]) - 1
index = 1
for t in self.scheduler.timesteps[t_begin:].flip(dims=[0]):
lora_scale = scale_schedule(1, lora_scale_min, index, length, type="cos")
cfg_scale = scale_schedule(1, 3.0, index, length, type="linear")
latent, noise = self._forward_sde(
t, latent, cfg_scale, text_embeddings, steps, lora_scale=lora_scale, generator=generator
)
noises.append(noise)
latents.append(latent)
lora_scales.append(lora_scale)
cfg_scales.append(cfg_scale)
index += 1
return latent, noises, latents, lora_scales, cfg_scales
def _backward(
self, latent, mask, steps, t0, noises, hook_latents, lora_scales, cfg_scales, text_embeddings, generator
):
t0 = int(t0 * steps)
t_begin = steps - t0
hook_latent = hook_latents.pop()
latent = torch.where(mask > 128, latent, hook_latent)
for t in self.scheduler.timesteps[t_begin - 1 : -1]:
latent = self._sample(
t,
latent,
cfg_scales.pop(),
text_embeddings,
steps,
sde=True,
noise=noises.pop(),
lora_scale=lora_scales.pop(),
generator=generator,
)
hook_latent = hook_latents.pop()
latent = torch.where(mask > 128, latent, hook_latent)
return latent

View File

@@ -1,6 +1,6 @@
## [Deprecated] Multi Token Textual Inversion
**IMPORTART: This research project is deprecated. Multi Token Textual Inversion is now supported natively in [the official textual inversion example](https://github.com/huggingface/diffusers/tree/main/examples/textual_inversion#running-locally-with-pytorch).**
**IMPORTART: This research project is deprecated. Multi Token Textual Inversion is now supported natively in [the officail textual inversion example](https://github.com/huggingface/diffusers/tree/main/examples/textual_inversion#running-locally-with-pytorch).**
The author of this project is [Isamu Isozaki](https://github.com/isamu-isozaki) - please make sure to tag the author for issue and PRs as well as @patrickvonplaten.

View File

@@ -204,7 +204,7 @@ class DepsTableUpdateCommand(Command):
extras = {}
extras["quality"] = deps_list("urllib3", "isort", "ruff", "hf-doc-builder")
extras["docs"] = deps_list("hf-doc-builder")
extras["training"] = deps_list("accelerate", "datasets", "protobuf", "tensorboard", "Jinja2", "peft")
extras["training"] = deps_list("accelerate", "datasets", "protobuf", "tensorboard", "Jinja2")
extras["test"] = deps_list(
"compel",
"GitPython",

View File

@@ -88,7 +88,7 @@ class VaeImageProcessor(ConfigMixin):
self.config.do_convert_rgb = False
@staticmethod
def numpy_to_pil(images: np.ndarray) -> List[PIL.Image.Image]:
def numpy_to_pil(images: np.ndarray) -> PIL.Image.Image:
"""
Convert a numpy image or a batch of images to a PIL image.
"""

View File

@@ -19,10 +19,10 @@ import numpy as np
import PIL.Image
import torch
import torch.nn.functional as F
from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
from ...image_processor import PipelineImageInput, VaeImageProcessor
from ...loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin
from ...models import AutoencoderKL, ControlNetModel, UNet2DConditionModel
from ...models.lora import adjust_lora_scale_text_encoder
from ...schedulers import KarrasDiffusionSchedulers
@@ -130,7 +130,7 @@ def prepare_image(image):
class StableDiffusionControlNetImg2ImgPipeline(
DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, IPAdapterMixin, FromSingleFileMixin
DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin
):
r"""
Pipeline for image-to-image generation using Stable Diffusion with ControlNet guidance.
@@ -140,7 +140,7 @@ class StableDiffusionControlNetImg2ImgPipeline(
The pipeline also inherits the following loading methods:
- [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
- [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
Args:
vae ([`AutoencoderKL`]):
Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
@@ -166,7 +166,7 @@ class StableDiffusionControlNetImg2ImgPipeline(
"""
model_cpu_offload_seq = "text_encoder->unet->vae"
_optional_components = ["safety_checker", "feature_extractor", "image_encoder"]
_optional_components = ["safety_checker", "feature_extractor"]
_exclude_from_cpu_offload = ["safety_checker"]
_callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
@@ -180,7 +180,6 @@ class StableDiffusionControlNetImg2ImgPipeline(
scheduler: KarrasDiffusionSchedulers,
safety_checker: StableDiffusionSafetyChecker,
feature_extractor: CLIPImageProcessor,
image_encoder: CLIPVisionModelWithProjection = None,
requires_safety_checker: bool = True,
):
super().__init__()
@@ -213,7 +212,6 @@ class StableDiffusionControlNetImg2ImgPipeline(
scheduler=scheduler,
safety_checker=safety_checker,
feature_extractor=feature_extractor,
image_encoder=image_encoder,
)
self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True)
@@ -470,31 +468,6 @@ class StableDiffusionControlNetImg2ImgPipeline(
return prompt_embeds, negative_prompt_embeds
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
dtype = next(self.image_encoder.parameters()).dtype
if not isinstance(image, torch.Tensor):
image = self.feature_extractor(image, return_tensors="pt").pixel_values
image = image.to(device=device, dtype=dtype)
if output_hidden_states:
image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
uncond_image_enc_hidden_states = self.image_encoder(
torch.zeros_like(image), output_hidden_states=True
).hidden_states[-2]
uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(
num_images_per_prompt, dim=0
)
return image_enc_hidden_states, uncond_image_enc_hidden_states
else:
image_embeds = self.image_encoder(image).image_embeds
image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
uncond_image_embeds = torch.zeros_like(image_embeds)
return image_embeds, uncond_image_embeds
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
def run_safety_checker(self, image, device, dtype):
if self.safety_checker is None:
@@ -888,7 +861,6 @@ class StableDiffusionControlNetImg2ImgPipeline(
latents: Optional[torch.FloatTensor] = None,
prompt_embeds: Optional[torch.FloatTensor] = None,
negative_prompt_embeds: Optional[torch.FloatTensor] = None,
ip_adapter_image: Optional[PipelineImageInput] = None,
output_type: Optional[str] = "pil",
return_dict: bool = True,
cross_attention_kwargs: Optional[Dict[str, Any]] = None,
@@ -950,7 +922,6 @@ class StableDiffusionControlNetImg2ImgPipeline(
negative_prompt_embeds (`torch.FloatTensor`, *optional*):
Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
output_type (`str`, *optional*, defaults to `"pil"`):
The output format of the generated image. Choose between `PIL.Image` or `np.array`.
return_dict (`bool`, *optional*, defaults to `True`):
@@ -1082,11 +1053,6 @@ class StableDiffusionControlNetImg2ImgPipeline(
if self.do_classifier_free_guidance:
prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
if ip_adapter_image is not None:
image_embeds, negative_image_embeds = self.encode_image(ip_adapter_image, device, num_images_per_prompt)
if self.do_classifier_free_guidance:
image_embeds = torch.cat([negative_image_embeds, image_embeds])
# 4. Prepare image
image = self.image_processor.preprocess(image, height=height, width=width).to(dtype=torch.float32)
@@ -1145,10 +1111,7 @@ class StableDiffusionControlNetImg2ImgPipeline(
# 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
# 7.1 Add image embeds for IP-Adapter
added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None
# 7.2 Create tensor stating which controlnets to keep
# 7.1 Create tensor stating which controlnets to keep
controlnet_keep = []
for i in range(len(timesteps)):
keeps = [
@@ -1208,7 +1171,6 @@ class StableDiffusionControlNetImg2ImgPipeline(
cross_attention_kwargs=self.cross_attention_kwargs,
down_block_additional_residuals=down_block_res_samples,
mid_block_additional_residual=mid_block_res_sample,
added_cond_kwargs=added_cond_kwargs,
return_dict=False,
)[0]

View File

@@ -20,11 +20,11 @@ from typing import Any, Callable, Dict, List, Optional, Union
import PIL.Image
import torch
from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
from ...image_processor import PipelineImageInput, VaeImageProcessor
from ...loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel
from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin
from ...models import AutoencoderKL, UNet2DConditionModel
from ...models.lora import adjust_lora_scale_text_encoder
from ...schedulers import LCMScheduler
from ...utils import (
@@ -129,7 +129,7 @@ EXAMPLE_DOC_STRING = """
class LatentConsistencyModelImg2ImgPipeline(
DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin, FromSingleFileMixin
DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin
):
r"""
Pipeline for image-to-image generation using a latent consistency model.
@@ -142,7 +142,6 @@ class LatentConsistencyModelImg2ImgPipeline(
- [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights
- [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights
- [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
- [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
Args:
vae ([`AutoencoderKL`]):
@@ -167,7 +166,7 @@ class LatentConsistencyModelImg2ImgPipeline(
"""
model_cpu_offload_seq = "text_encoder->unet->vae"
_optional_components = ["safety_checker", "feature_extractor", "image_encoder"]
_optional_components = ["safety_checker", "feature_extractor"]
_exclude_from_cpu_offload = ["safety_checker"]
_callback_tensor_inputs = ["latents", "denoised", "prompt_embeds", "w_embedding"]
@@ -180,7 +179,6 @@ class LatentConsistencyModelImg2ImgPipeline(
scheduler: LCMScheduler,
safety_checker: StableDiffusionSafetyChecker,
feature_extractor: CLIPImageProcessor,
image_encoder: Optional[CLIPVisionModelWithProjection] = None,
requires_safety_checker: bool = True,
):
super().__init__()
@@ -193,7 +191,6 @@ class LatentConsistencyModelImg2ImgPipeline(
scheduler=scheduler,
safety_checker=safety_checker,
feature_extractor=feature_extractor,
image_encoder=image_encoder,
)
if safety_checker is None and requires_safety_checker:
@@ -452,31 +449,6 @@ class LatentConsistencyModelImg2ImgPipeline(
return prompt_embeds, negative_prompt_embeds
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
dtype = next(self.image_encoder.parameters()).dtype
if not isinstance(image, torch.Tensor):
image = self.feature_extractor(image, return_tensors="pt").pixel_values
image = image.to(device=device, dtype=dtype)
if output_hidden_states:
image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
uncond_image_enc_hidden_states = self.image_encoder(
torch.zeros_like(image), output_hidden_states=True
).hidden_states[-2]
uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(
num_images_per_prompt, dim=0
)
return image_enc_hidden_states, uncond_image_enc_hidden_states
else:
image_embeds = self.image_encoder(image).image_embeds
image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
uncond_image_embeds = torch.zeros_like(image_embeds)
return image_embeds, uncond_image_embeds
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
def run_safety_checker(self, image, device, dtype):
if self.safety_checker is None:
@@ -675,7 +647,6 @@ class LatentConsistencyModelImg2ImgPipeline(
generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
latents: Optional[torch.FloatTensor] = None,
prompt_embeds: Optional[torch.FloatTensor] = None,
ip_adapter_image: Optional[PipelineImageInput] = None,
output_type: Optional[str] = "pil",
return_dict: bool = True,
cross_attention_kwargs: Optional[Dict[str, Any]] = None,
@@ -724,8 +695,6 @@ class LatentConsistencyModelImg2ImgPipeline(
prompt_embeds (`torch.FloatTensor`, *optional*):
Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
provided, text embeddings are generated from the `prompt` input argument.
ip_adapter_image: (`PipelineImageInput`, *optional*):
Optional image input to work with IP Adapters.
output_type (`str`, *optional*, defaults to `"pil"`):
The output format of the generated image. Choose between `PIL.Image` or `np.array`.
return_dict (`bool`, *optional*, defaults to `True`):
@@ -789,12 +758,6 @@ class LatentConsistencyModelImg2ImgPipeline(
device = self._execution_device
# do_classifier_free_guidance = guidance_scale > 1.0
if ip_adapter_image is not None:
output_hidden_state = False if isinstance(self.unet.encoder_hid_proj, ImageProjection) else True
image_embeds, negative_image_embeds = self.encode_image(
ip_adapter_image, device, num_images_per_prompt, output_hidden_state
)
# 3. Encode input prompt
lora_scale = (
self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
@@ -852,9 +815,6 @@ class LatentConsistencyModelImg2ImgPipeline(
# 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, None)
# 7.1 Add image embeds for IP-Adapter
added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None
# 8. LCM Multistep Sampling Loop
num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
self._num_timesteps = len(timesteps)
@@ -869,7 +829,6 @@ class LatentConsistencyModelImg2ImgPipeline(
timestep_cond=w_embedding,
encoder_hidden_states=prompt_embeds,
cross_attention_kwargs=self.cross_attention_kwargs,
added_cond_kwargs=added_cond_kwargs,
return_dict=False,
)[0]

View File

@@ -19,11 +19,11 @@ import inspect
from typing import Any, Callable, Dict, List, Optional, Union
import torch
from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
from ...image_processor import PipelineImageInput, VaeImageProcessor
from ...loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel
from ...image_processor import VaeImageProcessor
from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin
from ...models import AutoencoderKL, UNet2DConditionModel
from ...models.lora import adjust_lora_scale_text_encoder
from ...schedulers import LCMScheduler
from ...utils import (
@@ -107,7 +107,7 @@ def retrieve_timesteps(
class LatentConsistencyModelPipeline(
DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin, FromSingleFileMixin
DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin
):
r"""
Pipeline for text-to-image generation using a latent consistency model.
@@ -120,7 +120,6 @@ class LatentConsistencyModelPipeline(
- [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights
- [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights
- [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
- [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
Args:
vae ([`AutoencoderKL`]):
@@ -145,7 +144,7 @@ class LatentConsistencyModelPipeline(
"""
model_cpu_offload_seq = "text_encoder->unet->vae"
_optional_components = ["safety_checker", "feature_extractor", "image_encoder"]
_optional_components = ["safety_checker", "feature_extractor"]
_exclude_from_cpu_offload = ["safety_checker"]
_callback_tensor_inputs = ["latents", "denoised", "prompt_embeds", "w_embedding"]
@@ -158,7 +157,6 @@ class LatentConsistencyModelPipeline(
scheduler: LCMScheduler,
safety_checker: StableDiffusionSafetyChecker,
feature_extractor: CLIPImageProcessor,
image_encoder: Optional[CLIPVisionModelWithProjection] = None,
requires_safety_checker: bool = True,
):
super().__init__()
@@ -187,7 +185,6 @@ class LatentConsistencyModelPipeline(
scheduler=scheduler,
safety_checker=safety_checker,
feature_extractor=feature_extractor,
image_encoder=image_encoder,
)
self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
@@ -436,31 +433,6 @@ class LatentConsistencyModelPipeline(
return prompt_embeds, negative_prompt_embeds
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
dtype = next(self.image_encoder.parameters()).dtype
if not isinstance(image, torch.Tensor):
image = self.feature_extractor(image, return_tensors="pt").pixel_values
image = image.to(device=device, dtype=dtype)
if output_hidden_states:
image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
uncond_image_enc_hidden_states = self.image_encoder(
torch.zeros_like(image), output_hidden_states=True
).hidden_states[-2]
uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(
num_images_per_prompt, dim=0
)
return image_enc_hidden_states, uncond_image_enc_hidden_states
else:
image_embeds = self.image_encoder(image).image_embeds
image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
uncond_image_embeds = torch.zeros_like(image_embeds)
return image_embeds, uncond_image_embeds
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
def run_safety_checker(self, image, device, dtype):
if self.safety_checker is None:
@@ -609,7 +581,6 @@ class LatentConsistencyModelPipeline(
generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
latents: Optional[torch.FloatTensor] = None,
prompt_embeds: Optional[torch.FloatTensor] = None,
ip_adapter_image: Optional[PipelineImageInput] = None,
output_type: Optional[str] = "pil",
return_dict: bool = True,
cross_attention_kwargs: Optional[Dict[str, Any]] = None,
@@ -658,8 +629,6 @@ class LatentConsistencyModelPipeline(
prompt_embeds (`torch.FloatTensor`, *optional*):
Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
provided, text embeddings are generated from the `prompt` input argument.
ip_adapter_image: (`PipelineImageInput`, *optional*):
Optional image input to work with IP Adapters.
output_type (`str`, *optional*, defaults to `"pil"`):
The output format of the generated image. Choose between `PIL.Image` or `np.array`.
return_dict (`bool`, *optional*, defaults to `True`):
@@ -728,12 +697,6 @@ class LatentConsistencyModelPipeline(
device = self._execution_device
# do_classifier_free_guidance = guidance_scale > 1.0
if ip_adapter_image is not None:
output_hidden_state = False if isinstance(self.unet.encoder_hid_proj, ImageProjection) else True
image_embeds, negative_image_embeds = self.encode_image(
ip_adapter_image, device, num_images_per_prompt, output_hidden_state
)
# 3. Encode input prompt
lora_scale = (
self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
@@ -785,9 +748,6 @@ class LatentConsistencyModelPipeline(
# 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, None)
# 7.1 Add image embeds for IP-Adapter
added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None
# 8. LCM MultiStep Sampling Loop:
num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
self._num_timesteps = len(timesteps)
@@ -802,7 +762,6 @@ class LatentConsistencyModelPipeline(
timestep_cond=w_embedding,
encoder_hidden_states=prompt_embeds,
cross_attention_kwargs=self.cross_attention_kwargs,
added_cond_kwargs=added_cond_kwargs,
return_dict=False,
)[0]

View File

@@ -29,7 +29,7 @@ if is_onnx_available():
from ..onnx_utils import OnnxRuntimeModel
from ..pipeline_utils import AudioPipelineOutput, DiffusionPipeline
from .continuous_encoder import SpectrogramContEncoder
from .continous_encoder import SpectrogramContEncoder
from .notes_encoder import SpectrogramNotesEncoder

View File

@@ -18,11 +18,11 @@ from typing import Callable, Dict, List, Optional, Union
import numpy as np
import PIL.Image
import torch
from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
from ...image_processor import PipelineImageInput, VaeImageProcessor
from ...loaders import IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel
from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin
from ...models import AutoencoderKL, UNet2DConditionModel
from ...schedulers import KarrasDiffusionSchedulers
from ...utils import PIL_INTERPOLATION, deprecate, logging
from ...utils.torch_utils import randn_tensor
@@ -72,9 +72,7 @@ def retrieve_latents(
raise AttributeError("Could not access latents of provided encoder_output")
class StableDiffusionInstructPix2PixPipeline(
DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, IPAdapterMixin
):
class StableDiffusionInstructPix2PixPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin):
r"""
Pipeline for pixel-level image editing by following text instructions (based on Stable Diffusion).
@@ -85,7 +83,6 @@ class StableDiffusionInstructPix2PixPipeline(
- [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
- [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights
- [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights
- [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
Args:
vae ([`AutoencoderKL`]):
@@ -108,7 +105,7 @@ class StableDiffusionInstructPix2PixPipeline(
"""
model_cpu_offload_seq = "text_encoder->unet->vae"
_optional_components = ["safety_checker", "feature_extractor", "image_encoder"]
_optional_components = ["safety_checker", "feature_extractor"]
_exclude_from_cpu_offload = ["safety_checker"]
_callback_tensor_inputs = ["latents", "prompt_embeds", "image_latents"]
@@ -121,7 +118,6 @@ class StableDiffusionInstructPix2PixPipeline(
scheduler: KarrasDiffusionSchedulers,
safety_checker: StableDiffusionSafetyChecker,
feature_extractor: CLIPImageProcessor,
image_encoder: Optional[CLIPVisionModelWithProjection] = None,
requires_safety_checker: bool = True,
):
super().__init__()
@@ -150,7 +146,6 @@ class StableDiffusionInstructPix2PixPipeline(
scheduler=scheduler,
safety_checker=safety_checker,
feature_extractor=feature_extractor,
image_encoder=image_encoder,
)
self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
@@ -171,7 +166,6 @@ class StableDiffusionInstructPix2PixPipeline(
latents: Optional[torch.FloatTensor] = None,
prompt_embeds: Optional[torch.FloatTensor] = None,
negative_prompt_embeds: Optional[torch.FloatTensor] = None,
ip_adapter_image: Optional[PipelineImageInput] = None,
output_type: Optional[str] = "pil",
return_dict: bool = True,
callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
@@ -219,8 +213,6 @@ class StableDiffusionInstructPix2PixPipeline(
negative_prompt_embeds (`torch.FloatTensor`, *optional*):
Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
ip_adapter_image: (`PipelineImageInput`, *optional*):
Optional image input to work with IP Adapters.
output_type (`str`, *optional*, defaults to `"pil"`):
The output format of the generated image. Choose between `PIL.Image` or `np.array`.
return_dict (`bool`, *optional*, defaults to `True`):
@@ -301,16 +293,6 @@ class StableDiffusionInstructPix2PixPipeline(
self._guidance_scale = guidance_scale
self._image_guidance_scale = image_guidance_scale
device = self._execution_device
if ip_adapter_image is not None:
output_hidden_state = False if isinstance(self.unet.encoder_hid_proj, ImageProjection) else True
image_embeds, negative_image_embeds = self.encode_image(
ip_adapter_image, device, num_images_per_prompt, output_hidden_state
)
if self.do_classifier_free_guidance:
image_embeds = torch.cat([image_embeds, negative_image_embeds, negative_image_embeds])
if image is None:
raise ValueError("`image` input cannot be undefined.")
@@ -385,9 +367,6 @@ class StableDiffusionInstructPix2PixPipeline(
# 8. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
# 8.1 Add image embeds for IP-Adapter
added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None
# 9. Denoising loop
num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
self._num_timesteps = len(timesteps)
@@ -404,11 +383,7 @@ class StableDiffusionInstructPix2PixPipeline(
# predict the noise residual
noise_pred = self.unet(
scaled_latent_model_input,
t,
encoder_hidden_states=prompt_embeds,
added_cond_kwargs=added_cond_kwargs,
return_dict=False,
scaled_latent_model_input, t, encoder_hidden_states=prompt_embeds, return_dict=False
)[0]
# Hack:
@@ -628,31 +603,6 @@ class StableDiffusionInstructPix2PixPipeline(
return prompt_embeds
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
dtype = next(self.image_encoder.parameters()).dtype
if not isinstance(image, torch.Tensor):
image = self.feature_extractor(image, return_tensors="pt").pixel_values
image = image.to(device=device, dtype=dtype)
if output_hidden_states:
image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
uncond_image_enc_hidden_states = self.image_encoder(
torch.zeros_like(image), output_hidden_states=True
).hidden_states[-2]
uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(
num_images_per_prompt, dim=0
)
return image_enc_hidden_states, uncond_image_enc_hidden_states
else:
image_embeds = self.image_encoder(image).image_embeds
image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
uncond_image_embeds = torch.zeros_like(image_embeds)
return image_embeds, uncond_image_embeds
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
def run_safety_checker(self, image, device, dtype):
if self.safety_checker is None:

View File

@@ -19,11 +19,11 @@ from typing import Any, Callable, Dict, List, Optional, Union
import numpy as np
import PIL.Image
import torch
from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
from ...image_processor import PipelineImageInput, VaeImageProcessorLDM3D
from ...loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel
from ...image_processor import VaeImageProcessorLDM3D
from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin
from ...models import AutoencoderKL, UNet2DConditionModel
from ...models.lora import adjust_lora_scale_text_encoder
from ...schedulers import KarrasDiffusionSchedulers
from ...utils import (
@@ -82,7 +82,7 @@ class LDM3DPipelineOutput(BaseOutput):
class StableDiffusionLDM3DPipeline(
DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin, FromSingleFileMixin
DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin
):
r"""
Pipeline for text-to-image and 3D generation using LDM3D.
@@ -95,7 +95,6 @@ class StableDiffusionLDM3DPipeline(
- [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights
- [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights
- [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
- [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
Args:
vae ([`AutoencoderKL`]):
@@ -118,7 +117,7 @@ class StableDiffusionLDM3DPipeline(
"""
model_cpu_offload_seq = "text_encoder->unet->vae"
_optional_components = ["safety_checker", "feature_extractor", "image_encoder"]
_optional_components = ["safety_checker", "feature_extractor"]
_exclude_from_cpu_offload = ["safety_checker"]
def __init__(
@@ -130,7 +129,6 @@ class StableDiffusionLDM3DPipeline(
scheduler: KarrasDiffusionSchedulers,
safety_checker: StableDiffusionSafetyChecker,
feature_extractor: CLIPImageProcessor,
image_encoder: Optional[CLIPVisionModelWithProjection],
requires_safety_checker: bool = True,
):
super().__init__()
@@ -159,7 +157,6 @@ class StableDiffusionLDM3DPipeline(
scheduler=scheduler,
safety_checker=safety_checker,
feature_extractor=feature_extractor,
image_encoder=image_encoder,
)
self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
self.image_processor = VaeImageProcessorLDM3D(vae_scale_factor=self.vae_scale_factor)
@@ -413,31 +410,6 @@ class StableDiffusionLDM3DPipeline(
return prompt_embeds, negative_prompt_embeds
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
dtype = next(self.image_encoder.parameters()).dtype
if not isinstance(image, torch.Tensor):
image = self.feature_extractor(image, return_tensors="pt").pixel_values
image = image.to(device=device, dtype=dtype)
if output_hidden_states:
image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
uncond_image_enc_hidden_states = self.image_encoder(
torch.zeros_like(image), output_hidden_states=True
).hidden_states[-2]
uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(
num_images_per_prompt, dim=0
)
return image_enc_hidden_states, uncond_image_enc_hidden_states
else:
image_embeds = self.image_encoder(image).image_embeds
image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
uncond_image_embeds = torch.zeros_like(image_embeds)
return image_embeds, uncond_image_embeds
def run_safety_checker(self, image, device, dtype):
if self.safety_checker is None:
has_nsfw_concept = None
@@ -557,7 +529,6 @@ class StableDiffusionLDM3DPipeline(
latents: Optional[torch.FloatTensor] = None,
prompt_embeds: Optional[torch.FloatTensor] = None,
negative_prompt_embeds: Optional[torch.FloatTensor] = None,
ip_adapter_image: Optional[PipelineImageInput] = None,
output_type: Optional[str] = "pil",
return_dict: bool = True,
callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
@@ -602,8 +573,6 @@ class StableDiffusionLDM3DPipeline(
negative_prompt_embeds (`torch.FloatTensor`, *optional*):
Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
ip_adapter_image: (`PipelineImageInput`, *optional*):
Optional image input to work with IP Adapters.
output_type (`str`, *optional*, defaults to `"pil"`):
The output format of the generated image. Choose between `PIL.Image` or `np.array`.
return_dict (`bool`, *optional*, defaults to `True`):
@@ -653,14 +622,6 @@ class StableDiffusionLDM3DPipeline(
# corresponds to doing no classifier free guidance.
do_classifier_free_guidance = guidance_scale > 1.0
if ip_adapter_image is not None:
output_hidden_state = False if isinstance(self.unet.encoder_hid_proj, ImageProjection) else True
image_embeds, negative_image_embeds = self.encode_image(
ip_adapter_image, device, num_images_per_prompt, output_hidden_state
)
if do_classifier_free_guidance:
image_embeds = torch.cat([negative_image_embeds, image_embeds])
# 3. Encode input prompt
prompt_embeds, negative_prompt_embeds = self.encode_prompt(
prompt,
@@ -698,9 +659,6 @@ class StableDiffusionLDM3DPipeline(
# 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
# 6.1 Add image embeds for IP-Adapter
added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None
# 7. Denoising loop
num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
with self.progress_bar(total=num_inference_steps) as progress_bar:
@@ -715,7 +673,6 @@ class StableDiffusionLDM3DPipeline(
t,
encoder_hidden_states=prompt_embeds,
cross_attention_kwargs=cross_attention_kwargs,
added_cond_kwargs=added_cond_kwargs,
return_dict=False,
)[0]

View File

@@ -16,11 +16,11 @@ import inspect
from typing import Any, Callable, Dict, List, Optional, Union
import torch
from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
from ...image_processor import PipelineImageInput, VaeImageProcessor
from ...loaders import IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel
from ...image_processor import VaeImageProcessor
from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin
from ...models import AutoencoderKL, UNet2DConditionModel
from ...models.lora import adjust_lora_scale_text_encoder
from ...schedulers import DDIMScheduler
from ...utils import (
@@ -59,19 +59,13 @@ EXAMPLE_DOC_STRING = """
"""
class StableDiffusionPanoramaPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, IPAdapterMixin):
class StableDiffusionPanoramaPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin):
r"""
Pipeline for text-to-image generation using MultiDiffusion.
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
implemented for all pipelines (downloading, saving, running on a particular device, etc.).
The pipeline also inherits the following loading methods:
- [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
- [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights
- [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights
- [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
Args:
vae ([`AutoencoderKL`]):
Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
@@ -93,7 +87,7 @@ class StableDiffusionPanoramaPipeline(DiffusionPipeline, TextualInversionLoaderM
"""
model_cpu_offload_seq = "text_encoder->unet->vae"
_optional_components = ["safety_checker", "feature_extractor", "image_encoder"]
_optional_components = ["safety_checker", "feature_extractor"]
_exclude_from_cpu_offload = ["safety_checker"]
def __init__(
@@ -105,7 +99,6 @@ class StableDiffusionPanoramaPipeline(DiffusionPipeline, TextualInversionLoaderM
scheduler: DDIMScheduler,
safety_checker: StableDiffusionSafetyChecker,
feature_extractor: CLIPImageProcessor,
image_encoder: Optional[CLIPVisionModelWithProjection] = None,
requires_safety_checker: bool = True,
):
super().__init__()
@@ -134,7 +127,6 @@ class StableDiffusionPanoramaPipeline(DiffusionPipeline, TextualInversionLoaderM
scheduler=scheduler,
safety_checker=safety_checker,
feature_extractor=feature_extractor,
image_encoder=image_encoder,
)
self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
@@ -371,31 +363,6 @@ class StableDiffusionPanoramaPipeline(DiffusionPipeline, TextualInversionLoaderM
return prompt_embeds, negative_prompt_embeds
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
dtype = next(self.image_encoder.parameters()).dtype
if not isinstance(image, torch.Tensor):
image = self.feature_extractor(image, return_tensors="pt").pixel_values
image = image.to(device=device, dtype=dtype)
if output_hidden_states:
image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
uncond_image_enc_hidden_states = self.image_encoder(
torch.zeros_like(image), output_hidden_states=True
).hidden_states[-2]
uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(
num_images_per_prompt, dim=0
)
return image_enc_hidden_states, uncond_image_enc_hidden_states
else:
image_embeds = self.image_encoder(image).image_embeds
image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
uncond_image_embeds = torch.zeros_like(image_embeds)
return image_embeds, uncond_image_embeds
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
def run_safety_checker(self, image, device, dtype):
if self.safety_checker is None:
@@ -562,7 +529,6 @@ class StableDiffusionPanoramaPipeline(DiffusionPipeline, TextualInversionLoaderM
latents: Optional[torch.FloatTensor] = None,
prompt_embeds: Optional[torch.FloatTensor] = None,
negative_prompt_embeds: Optional[torch.FloatTensor] = None,
ip_adapter_image: Optional[PipelineImageInput] = None,
output_type: Optional[str] = "pil",
return_dict: bool = True,
callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
@@ -612,8 +578,6 @@ class StableDiffusionPanoramaPipeline(DiffusionPipeline, TextualInversionLoaderM
negative_prompt_embeds (`torch.FloatTensor`, *optional*):
Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
ip_adapter_image: (`PipelineImageInput`, *optional*):
Optional image input to work with IP Adapters.
output_type (`str`, *optional*, defaults to `"pil"`):
The output format of the generated image. Choose between `PIL.Image` or `np.array`.
return_dict (`bool`, *optional*, defaults to `True`):
@@ -668,14 +632,6 @@ class StableDiffusionPanoramaPipeline(DiffusionPipeline, TextualInversionLoaderM
# corresponds to doing no classifier free guidance.
do_classifier_free_guidance = guidance_scale > 1.0
if ip_adapter_image is not None:
output_hidden_state = False if isinstance(self.unet.encoder_hid_proj, ImageProjection) else True
image_embeds, negative_image_embeds = self.encode_image(
ip_adapter_image, device, num_images_per_prompt, output_hidden_state
)
if do_classifier_free_guidance:
image_embeds = torch.cat([negative_image_embeds, image_embeds])
# 3. Encode input prompt
text_encoder_lora_scale = (
cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None
@@ -725,9 +681,6 @@ class StableDiffusionPanoramaPipeline(DiffusionPipeline, TextualInversionLoaderM
# 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
# 7.1 Add image embeds for IP-Adapter
added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None
# 8. Denoising loop
# Each denoising step also includes refinement of the latents with respect to the
# views.
@@ -790,7 +743,6 @@ class StableDiffusionPanoramaPipeline(DiffusionPipeline, TextualInversionLoaderM
t,
encoder_hidden_states=prompt_embeds_input,
cross_attention_kwargs=cross_attention_kwargs,
added_cond_kwargs=added_cond_kwargs,
).sample
# perform guidance

View File

@@ -17,11 +17,11 @@ from typing import Any, Callable, Dict, List, Optional, Union
import torch
import torch.nn.functional as F
from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
from ...image_processor import PipelineImageInput, VaeImageProcessor
from ...loaders import IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel
from ...image_processor import VaeImageProcessor
from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin
from ...models import AutoencoderKL, UNet2DConditionModel
from ...models.lora import adjust_lora_scale_text_encoder
from ...schedulers import KarrasDiffusionSchedulers
from ...utils import (
@@ -98,17 +98,13 @@ class CrossAttnStoreProcessor:
# Modified to get self-attention guidance scale in this paper (https://arxiv.org/pdf/2210.00939.pdf) as an input
class StableDiffusionSAGPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin):
class StableDiffusionSAGPipeline(DiffusionPipeline, TextualInversionLoaderMixin):
r"""
Pipeline for text-to-image generation using Stable Diffusion.
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
implemented for all pipelines (downloading, saving, running on a particular device, etc.).
The pipeline also inherits the following loading methods:
- [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
- [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
Args:
vae ([`AutoencoderKL`]):
Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
@@ -130,7 +126,7 @@ class StableDiffusionSAGPipeline(DiffusionPipeline, TextualInversionLoaderMixin,
"""
model_cpu_offload_seq = "text_encoder->unet->vae"
_optional_components = ["safety_checker", "feature_extractor", "image_encoder"]
_optional_components = ["safety_checker", "feature_extractor"]
_exclude_from_cpu_offload = ["safety_checker"]
def __init__(
@@ -142,7 +138,6 @@ class StableDiffusionSAGPipeline(DiffusionPipeline, TextualInversionLoaderMixin,
scheduler: KarrasDiffusionSchedulers,
safety_checker: StableDiffusionSafetyChecker,
feature_extractor: CLIPImageProcessor,
image_encoder: Optional[CLIPVisionModelWithProjection] = None,
requires_safety_checker: bool = True,
):
super().__init__()
@@ -155,7 +150,6 @@ class StableDiffusionSAGPipeline(DiffusionPipeline, TextualInversionLoaderMixin,
scheduler=scheduler,
safety_checker=safety_checker,
feature_extractor=feature_extractor,
image_encoder=image_encoder,
)
self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
@@ -392,31 +386,6 @@ class StableDiffusionSAGPipeline(DiffusionPipeline, TextualInversionLoaderMixin,
return prompt_embeds, negative_prompt_embeds
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
dtype = next(self.image_encoder.parameters()).dtype
if not isinstance(image, torch.Tensor):
image = self.feature_extractor(image, return_tensors="pt").pixel_values
image = image.to(device=device, dtype=dtype)
if output_hidden_states:
image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
uncond_image_enc_hidden_states = self.image_encoder(
torch.zeros_like(image), output_hidden_states=True
).hidden_states[-2]
uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(
num_images_per_prompt, dim=0
)
return image_enc_hidden_states, uncond_image_enc_hidden_states
else:
image_embeds = self.image_encoder(image).image_embeds
image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
uncond_image_embeds = torch.zeros_like(image_embeds)
return image_embeds, uncond_image_embeds
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
def run_safety_checker(self, image, device, dtype):
if self.safety_checker is None:
@@ -550,7 +519,6 @@ class StableDiffusionSAGPipeline(DiffusionPipeline, TextualInversionLoaderMixin,
latents: Optional[torch.FloatTensor] = None,
prompt_embeds: Optional[torch.FloatTensor] = None,
negative_prompt_embeds: Optional[torch.FloatTensor] = None,
ip_adapter_image: Optional[PipelineImageInput] = None,
output_type: Optional[str] = "pil",
return_dict: bool = True,
callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
@@ -597,8 +565,6 @@ class StableDiffusionSAGPipeline(DiffusionPipeline, TextualInversionLoaderMixin,
negative_prompt_embeds (`torch.FloatTensor`, *optional*):
Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
ip_adapter_image: (`PipelineImageInput`, *optional*):
Optional image input to work with IP Adapters.
output_type (`str`, *optional*, defaults to `"pil"`):
The output format of the generated image. Choose between `PIL.Image` or `np.array`.
return_dict (`bool`, *optional*, defaults to `True`):
@@ -652,14 +618,6 @@ class StableDiffusionSAGPipeline(DiffusionPipeline, TextualInversionLoaderMixin,
# `sag_scale = 0` means no self-attention guidance
do_self_attention_guidance = sag_scale > 0.0
if ip_adapter_image is not None:
output_hidden_state = False if isinstance(self.unet.encoder_hid_proj, ImageProjection) else True
image_embeds, negative_image_embeds = self.encode_image(
ip_adapter_image, device, num_images_per_prompt, output_hidden_state
)
if do_classifier_free_guidance:
image_embeds = torch.cat([negative_image_embeds, image_embeds])
# 3. Encode input prompt
prompt_embeds, negative_prompt_embeds = self.encode_prompt(
prompt,
@@ -697,10 +655,6 @@ class StableDiffusionSAGPipeline(DiffusionPipeline, TextualInversionLoaderMixin,
# 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
# 6.1 Add image embeds for IP-Adapter
added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None
added_uncond_kwargs = {"image_embeds": negative_image_embeds} if ip_adapter_image is not None else None
# 7. Denoising loop
store_processor = CrossAttnStoreProcessor()
self.unet.mid_block.attentions[0].transformer_blocks[0].attn1.processor = store_processor
@@ -726,7 +680,6 @@ class StableDiffusionSAGPipeline(DiffusionPipeline, TextualInversionLoaderMixin,
t,
encoder_hidden_states=prompt_embeds,
cross_attention_kwargs=cross_attention_kwargs,
added_cond_kwargs=added_cond_kwargs,
).sample
# perform guidance
@@ -750,12 +703,7 @@ class StableDiffusionSAGPipeline(DiffusionPipeline, TextualInversionLoaderMixin,
)
uncond_emb, _ = prompt_embeds.chunk(2)
# forward and give guidance
degraded_pred = self.unet(
degraded_latents,
t,
encoder_hidden_states=uncond_emb,
added_cond_kwargs=added_uncond_kwargs,
).sample
degraded_pred = self.unet(degraded_latents, t, encoder_hidden_states=uncond_emb).sample
noise_pred += sag_scale * (noise_pred_uncond - degraded_pred)
else:
# DDIM-like prediction of x0
@@ -767,12 +715,7 @@ class StableDiffusionSAGPipeline(DiffusionPipeline, TextualInversionLoaderMixin,
pred_x0, cond_attn, map_size, t, self.pred_epsilon(latents, noise_pred, t)
)
# forward and give guidance
degraded_pred = self.unet(
degraded_latents,
t,
encoder_hidden_states=prompt_embeds,
added_cond_kwargs=added_cond_kwargs,
).sample
degraded_pred = self.unet(degraded_latents, t, encoder_hidden_states=prompt_embeds).sample
noise_pred += sag_scale * (noise_pred - degraded_pred)
# compute the previous noisy sample x_t -> x_t-1

View File

@@ -5,12 +5,10 @@ from typing import Callable, List, Optional, Union
import numpy as np
import torch
from packaging import version
from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
from ...configuration_utils import FrozenDict
from ...image_processor import PipelineImageInput
from ...loaders import IPAdapterMixin
from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel
from ...models import AutoencoderKL, UNet2DConditionModel
from ...schedulers import KarrasDiffusionSchedulers
from ...utils import deprecate, logging
from ...utils.torch_utils import randn_tensor
@@ -22,16 +20,13 @@ from .safety_checker import SafeStableDiffusionSafetyChecker
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
class StableDiffusionPipelineSafe(DiffusionPipeline, IPAdapterMixin):
class StableDiffusionPipelineSafe(DiffusionPipeline):
r"""
Pipeline based on the [`StableDiffusionPipeline`] for text-to-image generation using Safe Latent Diffusion.
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
implemented for all pipelines (downloading, saving, running on a particular device, etc.).
The pipeline also inherits the following loading methods:
- [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
Args:
vae ([`AutoencoderKL`]):
Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
@@ -53,7 +48,7 @@ class StableDiffusionPipelineSafe(DiffusionPipeline, IPAdapterMixin):
"""
model_cpu_offload_seq = "text_encoder->unet->vae"
_optional_components = ["safety_checker", "feature_extractor", "image_encoder"]
_optional_components = ["safety_checker", "feature_extractor"]
def __init__(
self,
@@ -64,7 +59,6 @@ class StableDiffusionPipelineSafe(DiffusionPipeline, IPAdapterMixin):
scheduler: KarrasDiffusionSchedulers,
safety_checker: SafeStableDiffusionSafetyChecker,
feature_extractor: CLIPImageProcessor,
image_encoder: Optional[CLIPVisionModelWithProjection] = None,
requires_safety_checker: bool = True,
):
super().__init__()
@@ -146,7 +140,6 @@ class StableDiffusionPipelineSafe(DiffusionPipeline, IPAdapterMixin):
scheduler=scheduler,
safety_checker=safety_checker,
feature_extractor=feature_extractor,
image_encoder=image_encoder,
)
self._safety_text_concept = safety_concept
self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
@@ -474,31 +467,6 @@ class StableDiffusionPipelineSafe(DiffusionPipeline, IPAdapterMixin):
noise_guidance = noise_guidance - noise_guidance_safety
return noise_guidance, safety_momentum
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
dtype = next(self.image_encoder.parameters()).dtype
if not isinstance(image, torch.Tensor):
image = self.feature_extractor(image, return_tensors="pt").pixel_values
image = image.to(device=device, dtype=dtype)
if output_hidden_states:
image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
uncond_image_enc_hidden_states = self.image_encoder(
torch.zeros_like(image), output_hidden_states=True
).hidden_states[-2]
uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(
num_images_per_prompt, dim=0
)
return image_enc_hidden_states, uncond_image_enc_hidden_states
else:
image_embeds = self.image_encoder(image).image_embeds
image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
uncond_image_embeds = torch.zeros_like(image_embeds)
return image_embeds, uncond_image_embeds
@torch.no_grad()
def __call__(
self,
@@ -512,7 +480,6 @@ class StableDiffusionPipelineSafe(DiffusionPipeline, IPAdapterMixin):
eta: float = 0.0,
generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
latents: Optional[torch.FloatTensor] = None,
ip_adapter_image: Optional[PipelineImageInput] = None,
output_type: Optional[str] = "pil",
return_dict: bool = True,
callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
@@ -554,8 +521,6 @@ class StableDiffusionPipelineSafe(DiffusionPipeline, IPAdapterMixin):
Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
tensor is generated by sampling using the supplied random `generator`.
ip_adapter_image: (`PipelineImageInput`, *optional*):
Optional image input to work with IP Adapters.
output_type (`str`, *optional*, defaults to `"pil"`):
The output format of the generated image. Choose between `PIL.Image` or `np.array`.
return_dict (`bool`, *optional*, defaults to `True`):
@@ -623,17 +588,6 @@ class StableDiffusionPipelineSafe(DiffusionPipeline, IPAdapterMixin):
if not enable_safety_guidance:
warnings.warn("Safety checker disabled!")
if ip_adapter_image is not None:
output_hidden_state = False if isinstance(self.unet.encoder_hid_proj, ImageProjection) else True
image_embeds, negative_image_embeds = self.encode_image(
ip_adapter_image, device, num_images_per_prompt, output_hidden_state
)
if do_classifier_free_guidance:
if enable_safety_guidance:
image_embeds = torch.cat([negative_image_embeds, image_embeds, image_embeds])
else:
image_embeds = torch.cat([negative_image_embeds, image_embeds])
# 3. Encode input prompt
prompt_embeds = self._encode_prompt(
prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt, enable_safety_guidance
@@ -659,9 +613,6 @@ class StableDiffusionPipelineSafe(DiffusionPipeline, IPAdapterMixin):
# 6. Prepare extra step kwargs.
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
# 6.1 Add image embeds for IP-Adapter
added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None
safety_momentum = None
num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
@@ -676,9 +627,7 @@ class StableDiffusionPipelineSafe(DiffusionPipeline, IPAdapterMixin):
latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
# predict the noise residual
noise_pred = self.unet(
latent_model_input, t, encoder_hidden_states=prompt_embeds, added_cond_kwargs=added_cond_kwargs
).sample
noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=prompt_embeds).sample
# perform guidance
if do_classifier_free_guidance:

View File

@@ -820,7 +820,9 @@ def _is_torch_fp16_available(device):
try:
x = torch.zeros((2, 2), dtype=torch.float16).to(device)
_ = x @ x
_ = torch.mul(x, x)
return True
except Exception as e:
if device.type == "cuda":
raise ValueError(
@@ -838,7 +840,9 @@ def _is_torch_fp64_available(device):
try:
x = torch.zeros((2, 2), dtype=torch.float64).to(device)
_ = x @ x
_ = torch.mul(x, x)
return True
except Exception as e:
if device.type == "cuda":
raise ValueError(

View File

@@ -134,7 +134,6 @@ class ControlNetImg2ImgPipelineFastTests(
"tokenizer": tokenizer,
"safety_checker": None,
"feature_extractor": None,
"image_encoder": None,
}
return components
@@ -274,7 +273,6 @@ class StableDiffusionMultiControlNetPipelineFastTests(
"tokenizer": tokenizer,
"safety_checker": None,
"feature_extractor": None,
"image_encoder": None,
}
return components

View File

@@ -87,7 +87,6 @@ class LatentConsistencyModelPipelineFastTests(PipelineLatentTesterMixin, Pipelin
"tokenizer": tokenizer,
"safety_checker": None,
"feature_extractor": None,
"image_encoder": None,
"requires_safety_checker": False,
}
return components

View File

@@ -97,7 +97,6 @@ class LatentConsistencyModelImg2ImgPipelineFastTests(
"tokenizer": tokenizer,
"safety_checker": None,
"feature_extractor": None,
"image_encoder": None,
"requires_safety_checker": False,
}
return components

View File

@@ -108,7 +108,6 @@ class StableDiffusionInstructPix2PixPipelineFastTests(
"tokenizer": tokenizer,
"safety_checker": None,
"feature_extractor": None,
"image_encoder": None,
}
return components

View File

@@ -93,7 +93,6 @@ class StableDiffusionLDM3DPipelineFastTests(unittest.TestCase):
"tokenizer": tokenizer,
"safety_checker": None,
"feature_extractor": None,
"image_encoder": None,
}
return components

View File

@@ -91,7 +91,6 @@ class StableDiffusionPanoramaPipelineFastTests(PipelineLatentTesterMixin, Pipeli
"tokenizer": tokenizer,
"safety_checker": None,
"feature_extractor": None,
"image_encoder": None,
}
return components

View File

@@ -93,7 +93,6 @@ class StableDiffusionSAGPipelineFastTests(PipelineLatentTesterMixin, PipelineTes
"tokenizer": tokenizer,
"safety_checker": None,
"feature_extractor": None,
"image_encoder": None,
}
return components