Mirror of https://github.com/huggingface/diffusers.git (synced 2025-12-06 12:34:13 +08:00)

Compare commits: 178 commits

controlnet ... fix-model-
| Author | SHA1 | Date |
|---|---|---|
| | a75bc61b71 | |
| | 0d08370263 | |
| | b8ccb46259 | |
| | 8de27500da | |
| | 725ead2f5e | |
| | 5575303be0 | |
| | 235ff8808f | |
| | 26a7851e1e | |
| | 3fd31eef51 | |
| | b02e2113ff | |
| | 21f023ec1a | |
| | 31d9f9ea77 | |
| | f53352f750 | |
| | 83ae24ce2d | |
| | 8af793b2d4 | |
| | eb96ff0d59 | |
| | a38dd79512 | |
| | b1c5817a89 | |
| | 235d34cf56 | |
| | 5029673987 | |
| | 56bd7e67c2 | |
| | 9d16daaf64 | |
| | 8e4ca1b6b2 | |
| | 0d2d424fbe | |
| | e24e54fdfa | |
| | ebc99a77aa | |
| | fa750a15bd | |
| | 181688012a | |
| | 142f353e1c | |
| | b833d0fc80 | |
| | e963621649 | |
| | 39215aa30e | |
| | 9ef43f38d4 | |
| | 88018fcf20 | |
| | 7404f1e9dc | |
| | 5a69227863 | |
| | fc9fecc217 | |
| | 065f251766 | |
| | 21c747fa0f | |
| | 09129842e7 | |
| | 33b363edfa | |
| | a9dd86029e | |
| | 9100652494 | |
| | d1e3f489e9 | |
| | ae05050db9 | |
| | db969cc16d | |
| | 3cfe187dc7 | |
| | 90250d9e48 | |
| | e5674015f3 | |
| | b5c8b555d7 | |
| | e23c27e905 | |
| | 7635d3d37f | |
| | 9132ce7c58 | |
| | 30c977d1f5 | |
| | f0fa17dd8e | |
| | c726d02beb | |
| | a68503f221 | |
| | 9d50f7eec1 | |
| | fda1531d8a | |
| | cf6e0407e0 | |
| | 1c000d46e1 | |
| | 08bf754507 | |
| | 2f23437618 | |
| | 2523390c26 | |
| | 279de3c3ff | |
| | 8e14535708 | |
| | 0bee4d336b | |
| | 42f25d601a | |
| | 33c5d125cb | |
| | aa1f00fd01 | |
| | d95b993427 | |
| | 1d480298c1 | |
| | b2323aa2b7 | |
| | 37e9d695af | |
| | a402431de0 | |
| | b99b1617cf | |
| | 3e4a6bd2d4 | |
| | c827e94da0 | |
| | 44f6b859bf | |
| | ac7ff7d4a3 | |
| | a0cf607667 | |
| | a341b536a8 | |
| | 8e46d97cd8 | |
| | 7e808e768a | |
| | 7e39516627 | |
| | 56a76082ed | |
| | 6133d98ff7 | |
| | 1c60e094de | |
| | 71f49a5d2a | |
| | 35db2fdea9 | |
| | ad55ce6100 | |
| | a9a5b14f35 | |
| | aa19025989 | |
| | 19ab04ff56 | |
| | 4a34307702 | |
| | 8e963d1c2a | |
| | 2b04ec2ff7 | |
| | 000fa82a1e | |
| | 5d83f50c23 | |
| | 5d21d4a204 | |
| | 73ba81090e | |
| | 7956c36aaa | |
| | 5266ab7935 | |
| | 7f724a930e | |
| | 9bef9f4be7 | |
| | 7aa4514260 | |
| | c2e87869be | |
| | ca61287daa | |
| | f0c81562a4 | |
| | 9d20ed37a2 | |
| | bda1d4faf8 | |
| | 77103d71ca | |
| | 0302446819 | |
| | 4d39b7483d | |
| | fac761694a | |
| | 34c90dbb31 | |
| | e49c04d5d6 | |
| | f238cb0736 | |
| | d78acdedc1 | |
| | 6df103deba | |
| | 73f28708be | |
| | 0cbc78f04c | |
| | 0cc5630945 | |
| | 0b8e29289d | |
| | ab38ddf64f | |
| | ead82fedea | |
| | 45b42d1203 | |
| | 5199ee4f7b | |
| | 544710ef0f | |
| | 443aa14e41 | |
| | 288632adf6 | |
| | 5ce79cbded | |
| | d52f3e30f8 | |
| | 699dfb084c | |
| | 484c8ef399 | |
| | 0dd0528851 | |
| | 1cd4732e7f | |
| | a51b6cc86a | |
| | 3bce0f3da1 | |
| | 9a34953823 | |
| | e29f16cfaa | |
| | f7dfcfd971 | |
| | 3c67864c5a | |
| | 363699044e | |
| | 9613576191 | |
| | e4356d6488 | |
| | 82441460ef | |
| | 3e1097cb63 | |
| | 78990dd960 | |
| | 405a1facd2 | |
| | 3028089e5e | |
| | b536f39818 | |
| | e25e525fde | |
| | de9adb907c | |
| | bf861e65dc | |
| | 4da810b943 | |
| | 161c6e14b6 | |
| | a6c9015c4e | |
| | e6a5f99e5c | |
| | 80ff4ba63e | |
| | b09a2aa308 | |
| | 63b6846849 | |
| | 139f707e6e | |
| | e4546fd5bb | |
| | d44e31aec2 | |
| | ce9825b56b | |
| | 85f9d92883 | |
| | 916d9812a8 | |
| | e6a8492242 | |
| | ad0308b3f1 | |
| | e97a633b63 | |
| | 01ac37b331 | |
| | 6a05b274cc | |
| | 98d46a3f08 | |
| | 4330a747d4 | |
| | 76de6a09fb | |
| | 25caf24ef9 | |
| | 8db3c9bc9f | |
1 .github/workflows/benchmark.yml (vendored)

@@ -31,7 +31,6 @@ jobs:
nvidia-smi
- name: Install dependencies
run: |
apt-get update && apt-get install libsndfile1-dev libgl1 -y
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install -e [quality,test]
python -m uv pip install pandas peft
8 .github/workflows/build_docker_images.yml (vendored)

@@ -20,7 +20,7 @@ env:

jobs:
test-build-docker-images:
runs-on: ubuntu-latest
runs-on: [ self-hosted, intel-cpu, 8-cpu, ci ]
if: github.event_name == 'pull_request'
steps:
- name: Set up Docker Buildx
@@ -50,7 +50,7 @@ jobs:
if: steps.file_changes.outputs.all != ''

build-and-push-docker-images:
runs-on: ubuntu-latest
runs-on: [ self-hosted, intel-cpu, 8-cpu, ci ]
if: github.event_name != 'pull_request'

permissions:
@@ -73,13 +73,13 @@ jobs:
steps:
- name: Checkout repository
uses: actions/checkout@v3

- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v1
- name: Login to Docker Hub
uses: docker/login-action@v2
with:
username: ${{ env.REGISTRY }}
password: ${{ secrets.DOCKERHUB_TOKEN }}

- name: Build and push
uses: docker/build-push-action@v3
with:
378 .github/workflows/nightly_tests.yml (vendored)

@@ -1,6 +1,7 @@
name: Nightly tests on main
name: Nightly and release tests on main/release branch

on:
workflow_dispatch:
schedule:
- cron: "0 0 * * *" # every day at midnight

@@ -12,110 +13,95 @@ env:
PYTEST_TIMEOUT: 600
RUN_SLOW: yes
RUN_NIGHTLY: yes
PIPELINE_USAGE_CUTOFF: 5000
SLACK_API_TOKEN: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}

jobs:
run_nightly_tests:
strategy:
fail-fast: false
matrix:
config:
- name: Nightly PyTorch CUDA tests on Ubuntu
framework: pytorch
runner: docker-gpu
image: diffusers/diffusers-pytorch-cuda
report: torch_cuda
- name: Nightly Flax TPU tests on Ubuntu
framework: flax
runner: docker-tpu
image: diffusers/diffusers-flax-tpu
report: flax_tpu
- name: Nightly ONNXRuntime CUDA tests on Ubuntu
framework: onnxruntime
runner: docker-gpu
image: diffusers/diffusers-onnxruntime-cuda
report: onnx_cuda

name: ${{ matrix.config.name }}

runs-on: ${{ matrix.config.runner }}

container:
image: ${{ matrix.config.image }}
options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ ${{ matrix.config.runner == 'docker-tpu' && '--privileged' || '--gpus 0'}}

defaults:
run:
shell: bash

setup_torch_cuda_pipeline_matrix:
name: Setup Torch Pipelines Matrix
runs-on: ubuntu-latest
outputs:
pipeline_test_matrix: ${{ steps.fetch_pipeline_matrix.outputs.pipeline_test_matrix }}
steps:
- name: Checkout diffusers
uses: actions/checkout@v3
with:
fetch-depth: 2

- name: NVIDIA-SMI
if: ${{ matrix.config.runner == 'docker-gpu' }}
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: "3.8"
- name: Install dependencies
run: |
nvidia-smi
pip install -e .
pip install huggingface_hub
- name: Fetch Pipeline Matrix
id: fetch_pipeline_matrix
run: |
matrix=$(python utils/fetch_torch_cuda_pipeline_test_matrix.py)
echo $matrix
echo "pipeline_test_matrix=$matrix" >> $GITHUB_OUTPUT

- name: Pipeline Tests Artifacts
if: ${{ always() }}
uses: actions/upload-artifact@v2
with:
name: test-pipelines.json
path: reports

run_nightly_tests_for_torch_pipelines:
name: Torch Pipelines CUDA Nightly Tests
needs: setup_torch_cuda_pipeline_matrix
strategy:
fail-fast: false
matrix:
module: ${{ fromJson(needs.setup_torch_cuda_pipeline_matrix.outputs.pipeline_test_matrix) }}
runs-on: [single-gpu, nvidia-gpu, t4, ci]
container:
image: diffusers/diffusers-pytorch-cuda
options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0
steps:
- name: Checkout diffusers
uses: actions/checkout@v3
with:
fetch-depth: 2
- name: NVIDIA-SMI
run: nvidia-smi

- name: Install dependencies
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install -e [quality,test]
python -m uv pip install -U transformers@git+https://github.com/huggingface/transformers
python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate
python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
python -m uv pip install pytest-reportlog

- name: Environment
run: |
python utils/print_env.py

- name: Run nightly PyTorch CUDA tests
if: ${{ matrix.config.framework == 'pytorch' }}

- name: Nightly PyTorch CUDA checkpoint (pipelines) tests
env:
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
# https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
CUBLAS_WORKSPACE_CONFIG: :16:8
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-s -v -k "not Flax and not Onnx" \
--make-reports=tests_${{ matrix.config.report }} \
--report-log=${{ matrix.config.report }}.log \
tests/

- name: Run nightly Flax TPU tests
if: ${{ matrix.config.framework == 'flax' }}
env:
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m pytest -n 0 \
-s -v -k "Flax" \
--make-reports=tests_${{ matrix.config.report }} \
--report-log=${{ matrix.config.report }}.log \
tests/

- name: Run nightly ONNXRuntime CUDA tests
if: ${{ matrix.config.framework == 'onnxruntime' }}
env:
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-s -v -k "Onnx" \
--make-reports=tests_${{ matrix.config.report }} \
--report-log=${{ matrix.config.report }}.log \
tests/

--make-reports=tests_pipeline_${{ matrix.module }}_cuda \
--report-log=tests_pipeline_${{ matrix.module }}_cuda.log \
tests/pipelines/${{ matrix.module }}

- name: Failure short reports
if: ${{ failure() }}
run: cat reports/tests_${{ matrix.config.report }}_failures_short.txt
run: |
cat reports/tests_pipeline_${{ matrix.module }}_cuda_stats.txt
cat reports/tests_pipeline_${{ matrix.module }}_cuda_failures_short.txt

- name: Test suite reports artifacts
if: ${{ always() }}
uses: actions/upload-artifact@v2
with:
name: ${{ matrix.config.report }}_test_reports
name: pipeline_${{ matrix.module }}_test_reports
path: reports

- name: Generate Report and Notify Channel
@@ -124,9 +110,251 @@ jobs:
pip install slack_sdk tabulate
python scripts/log_reports.py >> $GITHUB_STEP_SUMMARY

run_nightly_tests_for_other_torch_modules:
name: Torch Non-Pipelines CUDA Nightly Tests
runs-on: docker-gpu
container:
image: diffusers/diffusers-pytorch-cuda
options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0
defaults:
run:
shell: bash
strategy:
matrix:
module: [models, schedulers, others, examples]
steps:
- name: Checkout diffusers
uses: actions/checkout@v3
with:
fetch-depth: 2

- name: Install dependencies
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install -e [quality,test]
python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
python -m uv pip install pytest-reportlog

- name: Environment
run: python utils/print_env.py

- name: Run nightly PyTorch CUDA tests for non-pipeline modules
if: ${{ matrix.module != 'examples'}}
env:
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
# https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
CUBLAS_WORKSPACE_CONFIG: :16:8
run: |
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-s -v -k "not Flax and not Onnx" \
--make-reports=tests_torch_${{ matrix.module }}_cuda \
--report-log=tests_torch_${{ matrix.module }}_cuda.log \
tests/${{ matrix.module }}

- name: Run nightly example tests with Torch
if: ${{ matrix.module == 'examples' }}
env:
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
# https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
CUBLAS_WORKSPACE_CONFIG: :16:8
run: |
python -m uv pip install peft@git+https://github.com/huggingface/peft.git
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-s -v --make-reports=examples_torch_cuda \
--report-log=examples_torch_cuda.log \
examples/

- name: Failure short reports
if: ${{ failure() }}
run: |
cat reports/tests_torch_${{ matrix.module }}_cuda_stats.txt
cat reports/tests_torch_${{ matrix.module }}_cuda_failures_short.txt

- name: Test suite reports artifacts
if: ${{ always() }}
uses: actions/upload-artifact@v2
with:
name: torch_${{ matrix.module }}_cuda_test_reports
path: reports

- name: Generate Report and Notify Channel
if: always()
run: |
pip install slack_sdk tabulate
python scripts/log_reports.py >> $GITHUB_STEP_SUMMARY

run_lora_nightly_tests:
name: Nightly LoRA Tests with PEFT and TORCH
runs-on: docker-gpu
container:
image: diffusers/diffusers-pytorch-cuda
options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0
defaults:
run:
shell: bash
steps:
- name: Checkout diffusers
uses: actions/checkout@v3
with:
fetch-depth: 2

- name: Install dependencies
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install -e [quality,test]
python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
python -m uv pip install peft@git+https://github.com/huggingface/peft.git
python -m uv pip install pytest-reportlog

- name: Environment
run: python utils/print_env.py

- name: Run nightly LoRA tests with PEFT and Torch
env:
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
# https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
CUBLAS_WORKSPACE_CONFIG: :16:8
run: |
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-s -v -k "not Flax and not Onnx" \
--make-reports=tests_torch_lora_cuda \
--report-log=tests_torch_lora_cuda.log \
tests/lora

- name: Failure short reports
if: ${{ failure() }}
run: |
cat reports/tests_torch_lora_cuda_stats.txt
cat reports/tests_torch_lora_cuda_failures_short.txt

- name: Test suite reports artifacts
if: ${{ always() }}
uses: actions/upload-artifact@v2
with:
name: torch_lora_cuda_test_reports
path: reports

- name: Generate Report and Notify Channel
if: always()
run: |
pip install slack_sdk tabulate
python scripts/log_reports.py >> $GITHUB_STEP_SUMMARY

run_flax_tpu_tests:
name: Nightly Flax TPU Tests
runs-on: docker-tpu
if: github.event_name == 'schedule'

container:
image: diffusers/diffusers-flax-tpu
options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --privileged
defaults:
run:
shell: bash
steps:
- name: Checkout diffusers
uses: actions/checkout@v3
with:
fetch-depth: 2

- name: Install dependencies
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install -e [quality,test]
python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
python -m uv pip install pytest-reportlog

- name: Environment
run: python utils/print_env.py

- name: Run nightly Flax TPU tests
env:
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
run: |
python -m pytest -n 0 \
-s -v -k "Flax" \
--make-reports=tests_flax_tpu \
--report-log=tests_flax_tpu.log \
tests/

- name: Failure short reports
if: ${{ failure() }}
run: |
cat reports/tests_flax_tpu_stats.txt
cat reports/tests_flax_tpu_failures_short.txt

- name: Test suite reports artifacts
if: ${{ always() }}
uses: actions/upload-artifact@v2
with:
name: flax_tpu_test_reports
path: reports

- name: Generate Report and Notify Channel
if: always()
run: |
pip install slack_sdk tabulate
python scripts/log_reports.py >> $GITHUB_STEP_SUMMARY

run_nightly_onnx_tests:
name: Nightly ONNXRuntime CUDA tests on Ubuntu
runs-on: docker-gpu
container:
image: diffusers/diffusers-onnxruntime-cuda
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/

steps:
- name: Checkout diffusers
uses: actions/checkout@v3
with:
fetch-depth: 2

- name: NVIDIA-SMI
run: nvidia-smi

- name: Install dependencies
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install -e [quality,test]
python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
python -m uv pip install pytest-reportlog

- name: Environment
run: python utils/print_env.py

- name: Run nightly ONNXRuntime CUDA tests
env:
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
run: |
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-s -v -k "Onnx" \
--make-reports=tests_onnx_cuda \
--report-log=tests_onnx_cuda.log \
tests/

- name: Failure short reports
if: ${{ failure() }}
run: |
cat reports/tests_onnx_cuda_stats.txt
cat reports/tests_onnx_cuda_failures_short.txt

- name: Test suite reports artifacts
if: ${{ always() }}
uses: actions/upload-artifact@v2
with:
name: ${{ matrix.config.report }}_test_reports
path: reports

- name: Generate Report and Notify Channel
if: always()
run: |
pip install slack_sdk tabulate
python scripts/log_reports.py >> $GITHUB_STEP_SUMMARY

run_nightly_tests_apple_m1:
name: Nightly PyTorch MPS tests on MacOS
runs-on: [ self-hosted, apple-m1 ]
if: github.event_name == 'schedule'

steps:
- name: Checkout diffusers
3 .github/workflows/pr_test_fetcher.yml (vendored)

@@ -32,7 +32,6 @@ jobs:
fetch-depth: 0
- name: Install dependencies
run: |
apt-get update && apt-get install libsndfile1-dev libgl1 -y
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install -e [quality,test]
- name: Environment
@@ -89,7 +88,6 @@ jobs:

- name: Install dependencies
run: |
apt-get update && apt-get install libsndfile1-dev libgl1 -y
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m pip install -e [quality,test]
python -m pip install accelerate
@@ -147,7 +145,6 @@ jobs:

- name: Install dependencies
run: |
apt-get update && apt-get install libsndfile1-dev libgl1 -y
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m pip install -e [quality,test]
21 .github/workflows/pr_test_peft_backend.yml (vendored)

@@ -32,9 +32,11 @@ jobs:
python -m pip install --upgrade pip
pip install .[quality]
- name: Check quality
run: make quality
- name: Check if failure
if: ${{ failure() }}
run: |
ruff check examples tests src utils scripts
ruff format examples tests src utils scripts --check
echo "Quality check failed. Please ensure the right dependency versions are installed with 'pip install -e .[quality]' and run 'make style && make quality'" >> $GITHUB_STEP_SUMMARY

check_repository_consistency:
needs: check_code_quality
@@ -49,11 +51,15 @@ jobs:
run: |
python -m pip install --upgrade pip
pip install .[quality]
- name: Check quality
- name: Check repo consistency
run: |
python utils/check_copies.py
python utils/check_dummies.py
make deps_table_check_updated
- name: Check if failure
if: ${{ failure() }}
run: |
echo "Repo consistency check failed. Please ensure the right dependency versions are installed with 'pip install -e .[quality]' and run 'make fix-copies'" >> $GITHUB_STEP_SUMMARY

run_fast_tests:
needs: [check_code_quality, check_repository_consistency]
@@ -65,7 +71,7 @@ jobs:

name: LoRA - ${{ matrix.lib-versions }}

runs-on: docker-cpu
runs-on: [ self-hosted, intel-cpu, 8-cpu, ci ]

container:
image: diffusers/diffusers-pytorch-cpu
@@ -83,11 +89,10 @@ jobs:

- name: Install dependencies
run: |
apt-get update && apt-get install libsndfile1-dev libgl1 -y
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install -e [quality,test]
if [ "${{ matrix.lib-versions }}" == "main" ]; then
python -m uv pip install -U peft@git+https://github.com/huggingface/peft.git
python -m pip install -U peft@git+https://github.com/huggingface/peft.git
python -m uv pip install -U transformers@git+https://github.com/huggingface/transformers.git
python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
else
@@ -102,7 +107,7 @@ jobs:
- name: Run fast PyTorch LoRA CPU tests with PEFT backend
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
python -m pytest -n 4 --max-worker-restart=0 --dist=loadfile \
-s -v \
--make-reports=tests_${{ matrix.config.report }} \
tests/lora/test_lora_layers_peft.py
tests/lora/
32 .github/workflows/pr_tests.yml (vendored)

@@ -40,9 +40,11 @@ jobs:
python -m pip install --upgrade pip
pip install .[quality]
- name: Check quality
run: make quality
- name: Check if failure
if: ${{ failure() }}
run: |
ruff check examples tests src utils scripts
ruff format examples tests src utils scripts --check
echo "Quality check failed. Please ensure the right dependency versions are installed with 'pip install -e .[quality]' and run 'make style && make quality'" >> $GITHUB_STEP_SUMMARY

check_repository_consistency:
needs: check_code_quality
@@ -57,11 +59,15 @@ jobs:
run: |
python -m pip install --upgrade pip
pip install .[quality]
- name: Check quality
- name: Check repo consistency
run: |
python utils/check_copies.py
python utils/check_dummies.py
make deps_table_check_updated
- name: Check if failure
if: ${{ failure() }}
run: |
echo "Repo consistency check failed. Please ensure the right dependency versions are installed with 'pip install -e .[quality]' and run 'make fix-copies'" >> $GITHUB_STEP_SUMMARY

run_fast_tests:
needs: [check_code_quality, check_repository_consistency]
@@ -71,22 +77,22 @@ jobs:
config:
- name: Fast PyTorch Pipeline CPU tests
framework: pytorch_pipelines
runner: docker-cpu
runner: [ self-hosted, intel-cpu, 32-cpu, 256-ram, ci ]
image: diffusers/diffusers-pytorch-cpu
report: torch_cpu_pipelines
- name: Fast PyTorch Models & Schedulers CPU tests
framework: pytorch_models
runner: docker-cpu
runner: [ self-hosted, intel-cpu, 8-cpu, ci ]
image: diffusers/diffusers-pytorch-cpu
report: torch_cpu_models_schedulers
- name: Fast Flax CPU tests
framework: flax
runner: docker-cpu
runner: [ self-hosted, intel-cpu, 8-cpu, ci ]
image: diffusers/diffusers-flax-cpu
report: flax_cpu
- name: PyTorch Example CPU tests
framework: pytorch_examples
runner: docker-cpu
runner: [ self-hosted, intel-cpu, 8-cpu, ci ]
image: diffusers/diffusers-pytorch-cpu
report: torch_example_cpu

@@ -110,7 +116,6 @@ jobs:

- name: Install dependencies
run: |
apt-get update && apt-get install libsndfile1-dev libgl1 -y
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install -e [quality,test]
python -m uv pip install accelerate
@@ -124,7 +129,7 @@ jobs:
if: ${{ matrix.config.framework == 'pytorch_pipelines' }}
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \
python -m pytest -n 8 --max-worker-restart=0 --dist=loadfile \
-s -v -k "not Flax and not Onnx" \
--make-reports=tests_${{ matrix.config.report }} \
tests/pipelines
@@ -133,7 +138,7 @@ jobs:
if: ${{ matrix.config.framework == 'pytorch_models' }}
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \
python -m pytest -n 4 --max-worker-restart=0 --dist=loadfile \
-s -v -k "not Flax and not Onnx and not Dependency" \
--make-reports=tests_${{ matrix.config.report }} \
tests/models tests/schedulers tests/others
@@ -142,7 +147,7 @@ jobs:
if: ${{ matrix.config.framework == 'flax' }}
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \
python -m pytest -n 4 --max-worker-restart=0 --dist=loadfile \
-s -v -k "Flax" \
--make-reports=tests_${{ matrix.config.report }} \
tests
@@ -152,7 +157,7 @@ jobs:
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install peft
python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \
python -m pytest -n 4 --max-worker-restart=0 --dist=loadfile \
--make-reports=tests_${{ matrix.config.report }} \
examples

@@ -175,7 +180,7 @@ jobs:
config:
- name: Hub tests for models, schedulers, and pipelines
framework: hub_tests_pytorch
runner: docker-cpu
runner: [ self-hosted, intel-cpu, 8-cpu, ci ]
image: diffusers/diffusers-pytorch-cpu
report: torch_hub

@@ -199,7 +204,6 @@ jobs:

- name: Install dependencies
run: |
apt-get update && apt-get install libsndfile1-dev libgl1 -y
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install -e [quality,test]
44 .github/workflows/push_tests.yml (vendored)

@@ -21,10 +21,7 @@ env:
jobs:
setup_torch_cuda_pipeline_matrix:
name: Setup Torch Pipelines CUDA Slow Tests Matrix
runs-on: [single-gpu, nvidia-gpu, t4, ci]
container:
image: diffusers/diffusers-pytorch-cpu # this is a CPU image, but we need it to fetch the matrix
options: --shm-size "16gb" --ipc host
runs-on: ubuntu-latest
outputs:
pipeline_test_matrix: ${{ steps.fetch_pipeline_matrix.outputs.pipeline_test_matrix }}
steps:
@@ -32,24 +29,20 @@ jobs:
uses: actions/checkout@v3
with:
fetch-depth: 2
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: "3.8"
- name: Install dependencies
run: |
apt-get update && apt-get install libsndfile1-dev libgl1 -y
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install -e [quality,test]
python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git

- name: Environment
run: |
python utils/print_env.py

pip install -e .
pip install huggingface_hub
- name: Fetch Pipeline Matrix
id: fetch_pipeline_matrix
run: |
matrix=$(python utils/fetch_torch_cuda_pipeline_test_matrix.py)
echo $matrix
echo "pipeline_test_matrix=$matrix" >> $GITHUB_OUTPUT

- name: Pipeline Tests Artifacts
if: ${{ always() }}
uses: actions/upload-artifact@v2
@@ -67,7 +60,7 @@ jobs:
runs-on: [single-gpu, nvidia-gpu, t4, ci]
container:
image: diffusers/diffusers-pytorch-cuda
options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0
options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0 --privileged
steps:
- name: Checkout diffusers
uses: actions/checkout@v3
@@ -76,9 +69,14 @@ jobs:
- name: NVIDIA-SMI
run: |
nvidia-smi
- name: Tailscale
uses: huggingface/tailscale-action@v1
with:
authkey: ${{ secrets.TAILSCALE_SSH_AUTHKEY }}
slackChannel: ${{ secrets.SLACK_CIFEEDBACK_CHANNEL }}
slackToken: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
- name: Install dependencies
run: |
apt-get update && apt-get install libsndfile1-dev libgl1 -y
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install -e [quality,test]
python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
@@ -95,6 +93,12 @@ jobs:
-s -v -k "not Flax and not Onnx" \
--make-reports=tests_pipeline_${{ matrix.module }}_cuda \
tests/pipelines/${{ matrix.module }}
- name: Tailscale Wait
if: ${{ failure() || runner.debug == '1' }}
uses: huggingface/tailscale-action@v1
with:
waitForSSH: true
authkey: ${{ secrets.TAILSCALE_SSH_AUTHKEY }}
- name: Failure short reports
if: ${{ failure() }}
run: |
@@ -128,7 +132,6 @@ jobs:

- name: Install dependencies
run: |
apt-get update && apt-get install libsndfile1-dev libgl1 -y
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install -e [quality,test]
python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
@@ -178,11 +181,10 @@ jobs:

- name: Install dependencies
run: |
apt-get update && apt-get install libsndfile1-dev libgl1 -y
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install -e [quality,test]
python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
python -m uv pip install peft@git+https://github.com/huggingface/peft.git
python -m pip install -U peft@git+https://github.com/huggingface/peft.git

- name: Environment
run: |
@@ -229,7 +231,6 @@ jobs:

- name: Install dependencies
run: |
apt-get update && apt-get install libsndfile1-dev libgl1 -y
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install -e [quality,test]
python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
@@ -277,7 +278,6 @@ jobs:

- name: Install dependencies
run: |
apt-get update && apt-get install libsndfile1-dev libgl1 -y
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install -e [quality,test]
python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
@@ -437,4 +437,4 @@ jobs:
uses: actions/upload-artifact@v2
with:
name: examples_test_reports
path: reports
path: reports
17 .github/workflows/push_tests_fast.yml (vendored)

@@ -29,22 +29,22 @@ jobs:
config:
- name: Fast PyTorch CPU tests on Ubuntu
framework: pytorch
runner: docker-cpu
runner: [ self-hosted, intel-cpu, 8-cpu, ci ]
image: diffusers/diffusers-pytorch-cpu
report: torch_cpu
- name: Fast Flax CPU tests on Ubuntu
framework: flax
runner: docker-cpu
runner: [ self-hosted, intel-cpu, 8-cpu, ci ]
image: diffusers/diffusers-flax-cpu
report: flax_cpu
- name: Fast ONNXRuntime CPU tests on Ubuntu
framework: onnxruntime
runner: docker-cpu
runner: [ self-hosted, intel-cpu, 8-cpu, ci ]
image: diffusers/diffusers-onnxruntime-cpu
report: onnx_cpu
- name: PyTorch Example CPU tests on Ubuntu
framework: pytorch_examples
runner: docker-cpu
runner: [ self-hosted, intel-cpu, 8-cpu, ci ]
image: diffusers/diffusers-pytorch-cpu
report: torch_example_cpu

@@ -68,7 +68,6 @@ jobs:

- name: Install dependencies
run: |
apt-get update && apt-get install libsndfile1-dev libgl1 -y
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install -e [quality,test]

@@ -81,7 +80,7 @@ jobs:
if: ${{ matrix.config.framework == 'pytorch' }}
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \
python -m pytest -n 4 --max-worker-restart=0 --dist=loadfile \
-s -v -k "not Flax and not Onnx" \
--make-reports=tests_${{ matrix.config.report }} \
tests/
@@ -90,7 +89,7 @@ jobs:
if: ${{ matrix.config.framework == 'flax' }}
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \
python -m pytest -n 4 --max-worker-restart=0 --dist=loadfile \
-s -v -k "Flax" \
--make-reports=tests_${{ matrix.config.report }} \
tests/
@@ -99,7 +98,7 @@ jobs:
if: ${{ matrix.config.framework == 'onnxruntime' }}
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \
python -m pytest -n 4 --max-worker-restart=0 --dist=loadfile \
-s -v -k "Onnx" \
--make-reports=tests_${{ matrix.config.report }} \
tests/
@@ -109,7 +108,7 @@ jobs:
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install peft
python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \
python -m pytest -n 4 --max-worker-restart=0 --dist=loadfile \
--make-reports=tests_${{ matrix.config.report }} \
examples
4 .github/workflows/pypi_publish.yaml (vendored)

@@ -52,7 +52,9 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -U setuptools wheel twine torch
pip install -U setuptools wheel twine
pip install -U torch --index-url https://download.pytorch.org/whl/cpu
pip install -U transformers

- name: Build the dist files
run: python setup.py bdist_wheel && python setup.py sdist
46 .github/workflows/ssh-runner.yml (vendored, new file)

@@ -0,0 +1,46 @@
name: SSH into runners

on:
workflow_dispatch:
inputs:
runner_type:
description: 'Type of runner to test (a10 or t4)'
required: true
docker_image:
description: 'Name of the Docker image'
required: true

env:
IS_GITHUB_CI: "1"
HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
HF_HOME: /mnt/cache
DIFFUSERS_IS_CI: yes
OMP_NUM_THREADS: 8
MKL_NUM_THREADS: 8
RUN_SLOW: yes

jobs:
ssh_runner:
name: "SSH"
runs-on: [single-gpu, nvidia-gpu, "${{ github.event.inputs.runner_type }}", ci]
container:
image: ${{ github.event.inputs.docker_image }}
options: --gpus all --privileged --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/

steps:
- name: Checkout diffusers
uses: actions/checkout@v3
with:
fetch-depth: 2

- name: NVIDIA-SMI
run: |
nvidia-smi

- name: Tailscale # In order to be able to SSH when a test fails
uses: huggingface/tailscale-action@v1
with:
authkey: ${{ secrets.TAILSCALE_SSH_AUTHKEY }}
slackChannel: ${{ secrets.SLACK_CIFEEDBACK_CHANNEL }}
slackToken: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
waitForSSH: true
30 .github/workflows/update_metadata.yml (vendored, new file)

@@ -0,0 +1,30 @@
name: Update Diffusers metadata

on:
workflow_dispatch:
push:
branches:
- main
- update_diffusers_metadata*

jobs:
update_metadata:
runs-on: ubuntu-22.04
defaults:
run:
shell: bash -l {0}

steps:
- uses: actions/checkout@v3

- name: Setup environment
run: |
pip install --upgrade pip
pip install datasets pandas
pip install .[torch]

- name: Update metadata
env:
HUGGING_FACE_HUB_TOKEN: ${{ secrets.SAYAK_HF_TOKEN }}
run: |
python utils/update_metadata.py --commit_sha ${{ github.sha }}
10 CITATION.cff

@@ -19,6 +19,16 @@ authors:
family-names: Rasul
- given-names: Mishig
family-names: Davaadorj
- given-names: Dhruv
family-names: Nair
- given-names: Sayak
family-names: Paul
- given-names: Steven
family-names: Liu
- given-names: William
family-names: Berman
- given-names: Yiyi
family-names: Xu
- given-names: Thomas
family-names: Wolf
repository-code: 'https://github.com/huggingface/diffusers'
2 Makefile

@@ -42,6 +42,7 @@ repo-consistency:
quality:
ruff check $(check_dirs) setup.py
ruff format --check $(check_dirs) setup.py
doc-builder style src/diffusers docs/source --max_len 119 --check_only
python utils/check_doc_toc.py

# Format source code automatically and check is there are any problems left that need manual fixing
@@ -55,6 +56,7 @@ extra_style_checks:
style:
ruff check $(check_dirs) setup.py --fix
ruff format $(check_dirs) setup.py
doc-builder style src/diffusers docs/source --max_len 119
${MAKE} autogenerate_code
${MAKE} extra_style_checks
@@ -238,7 +238,7 @@ We also want to thank @heejkoo for the very helpful overview of papers, code and

```bibtex
@misc{von-platen-etal-2022-diffusers,
author = {Patrick von Platen and Suraj Patil and Anton Lozhkov and Pedro Cuenca and Nathan Lambert and Kashif Rasul and Mishig Davaadorj and Thomas Wolf},
author = {Patrick von Platen and Suraj Patil and Anton Lozhkov and Pedro Cuenca and Nathan Lambert and Kashif Rasul and Mishig Davaadorj and Dhruv Nair and Sayak Paul and William Berman and Yiyi Xu and Steven Liu and Thomas Wolf},
title = {Diffusers: State-of-the-art diffusion models},
year = {2022},
publisher = {GitHub},
@@ -12,6 +12,7 @@ RUN apt update && \
curl \
ca-certificates \
libsndfile1-dev \
libgl1 \
python3.8 \
python3-pip \
python3.8-venv && \

@@ -12,6 +12,7 @@ RUN apt update && \
curl \
ca-certificates \
libsndfile1-dev \
libgl1 \
python3.8 \
python3-pip \
python3.8-venv && \

@@ -12,6 +12,7 @@ RUN apt update && \
curl \
ca-certificates \
libsndfile1-dev \
libgl1 \
python3.8 \
python3-pip \
python3.8-venv && \

@@ -12,6 +12,7 @@ RUN apt update && \
curl \
ca-certificates \
libsndfile1-dev \
libgl1 \
python3.8 \
python3-pip \
python3.8-venv && \
@@ -23,154 +23,138 @@
title: Accelerate inference of text-to-image diffusion models
title: Tutorials
- sections:
- sections:
- local: using-diffusers/loading_overview
title: Overview
- local: using-diffusers/loading
title: Load pipelines, models, and schedulers
- local: using-diffusers/schedulers
title: Load and compare different schedulers
- local: using-diffusers/custom_pipeline_overview
title: Load community pipelines and components
- local: using-diffusers/using_safetensors
title: Load safetensors
- local: using-diffusers/other-formats
title: Load different Stable Diffusion formats
- local: using-diffusers/loading_adapters
title: Load adapters
- local: using-diffusers/push_to_hub
title: Push files to the Hub
title: Loading & Hub
- sections:
- local: using-diffusers/pipeline_overview
title: Overview
- local: using-diffusers/unconditional_image_generation
title: Unconditional image generation
- local: using-diffusers/conditional_image_generation
title: Text-to-image
- local: using-diffusers/img2img
title: Image-to-image
- local: using-diffusers/inpaint
title: Inpainting
- local: using-diffusers/text-img2vid
title: Text or image-to-video
- local: using-diffusers/depth2img
title: Depth-to-image
title: Tasks
- sections:
- local: using-diffusers/textual_inversion_inference
title: Textual inversion
- local: using-diffusers/ip_adapter
title: IP-Adapter
- local: using-diffusers/merge_loras
title: Merge LoRAs
- local: training/distributed_inference
title: Distributed inference with multiple GPUs
- local: using-diffusers/reusing_seeds
title: Improve image quality with deterministic generation
- local: using-diffusers/control_brightness
title: Control image brightness
- local: using-diffusers/weighted_prompts
title: Prompt weighting
- local: using-diffusers/freeu
title: Improve generation quality with FreeU
title: Techniques
- sections:
- local: using-diffusers/pipeline_overview
title: Overview
- local: using-diffusers/sdxl
title: Stable Diffusion XL
- local: using-diffusers/sdxl_turbo
title: SDXL Turbo
- local: using-diffusers/kandinsky
title: Kandinsky
- local: using-diffusers/controlnet
title: ControlNet
- local: using-diffusers/shap-e
title: Shap-E
- local: using-diffusers/diffedit
title: DiffEdit
- local: using-diffusers/distilled_sd
title: Distilled Stable Diffusion inference
- local: using-diffusers/callback
title: Pipeline callbacks
- local: using-diffusers/reproducibility
title: Create reproducible pipelines
- local: using-diffusers/custom_pipeline_examples
title: Community pipelines
- local: using-diffusers/contribute_pipeline
title: Contribute a community pipeline
- local: using-diffusers/inference_with_lcm_lora
title: Latent Consistency Model-LoRA
- local: using-diffusers/inference_with_lcm
title: Latent Consistency Model
- local: using-diffusers/inference_with_tcd_lora
title: Trajectory Consistency Distillation-LoRA
- local: using-diffusers/svd
title: Stable Video Diffusion
title: Specific pipeline examples
- sections:
- local: training/overview
title: Overview
- local: training/create_dataset
title: Create a dataset for training
- local: training/adapt_a_model
title: Adapt a model to a new task
- sections:
- local: training/unconditional_training
title: Unconditional image generation
- local: training/text2image
title: Text-to-image
- local: training/sdxl
title: Stable Diffusion XL
- local: training/kandinsky
title: Kandinsky 2.2
- local: training/wuerstchen
title: Wuerstchen
- local: training/controlnet
title: ControlNet
- local: training/t2i_adapters
title: T2I-Adapters
- local: training/instructpix2pix
title: InstructPix2Pix
title: Models
- sections:
- local: training/text_inversion
title: Textual Inversion
- local: training/dreambooth
title: DreamBooth
- local: training/lora
title: LoRA
- local: training/custom_diffusion
title: Custom Diffusion
- local: training/lcm_distill
title: Latent Consistency Distillation
- local: training/ddpo
title: Reinforcement learning training with DDPO
title: Methods
title: Training
- sections:
- local: using-diffusers/other-modalities
title: Other Modalities
title: Taking Diffusers Beyond Images
title: Using Diffusers
- local: using-diffusers/loading
title: Load pipelines
- local: using-diffusers/custom_pipeline_overview
title: Load community pipelines and components
- local: using-diffusers/schedulers
title: Load schedulers and models
- local: using-diffusers/using_safetensors
title: Load safetensors
- local: using-diffusers/other-formats
title: Load different Stable Diffusion formats
- local: using-diffusers/loading_adapters
title: Load adapters
- local: using-diffusers/push_to_hub
title: Push files to the Hub
title: Load pipelines and adapters
- sections:
- local: optimization/opt_overview
- local: using-diffusers/unconditional_image_generation
title: Unconditional image generation
- local: using-diffusers/conditional_image_generation
title: Text-to-image
- local: using-diffusers/img2img
title: Image-to-image
- local: using-diffusers/inpaint
title: Inpainting
- local: using-diffusers/text-img2vid
title: Text or image-to-video
- local: using-diffusers/depth2img
title: Depth-to-image
title: Generative tasks
- sections:
- local: using-diffusers/overview_techniques
title: Overview
- local: training/distributed_inference
title: Distributed inference with multiple GPUs
- local: using-diffusers/merge_loras
title: Merge LoRAs
- local: using-diffusers/callback
title: Pipeline callbacks
- local: using-diffusers/reusing_seeds
title: Reproducible pipelines
- local: using-diffusers/image_quality
title: Controlling image quality
- local: using-diffusers/weighted_prompts
title: Prompt techniques
title: Inference techniques
- sections:
- local: using-diffusers/sdxl
title: Stable Diffusion XL
- local: using-diffusers/sdxl_turbo
title: SDXL Turbo
- local: using-diffusers/kandinsky
title: Kandinsky
- local: using-diffusers/ip_adapter
title: IP-Adapter
- local: using-diffusers/controlnet
title: ControlNet
- local: using-diffusers/t2i_adapter
title: T2I-Adapter
- local: using-diffusers/textual_inversion_inference
title: Textual inversion
- local: using-diffusers/shap-e
title: Shap-E
- local: using-diffusers/diffedit
title: DiffEdit
- local: using-diffusers/inference_with_lcm_lora
title: Latent Consistency Model-LoRA
- local: using-diffusers/inference_with_lcm
title: Latent Consistency Model
- local: using-diffusers/inference_with_tcd_lora
title: Trajectory Consistency Distillation-LoRA
- local: using-diffusers/svd
title: Stable Video Diffusion
title: Specific pipeline examples
- sections:
- local: training/overview
title: Overview
- local: training/create_dataset
title: Create a dataset for training
- local: training/adapt_a_model
title: Adapt a model to a new task
- sections:
- local: optimization/fp16
title: Speed up inference
- local: optimization/memory
title: Reduce memory usage
- local: optimization/torch2.0
title: PyTorch 2.0
- local: optimization/xformers
title: xFormers
- local: optimization/tome
title: Token merging
- local: optimization/deepcache
title: DeepCache
title: General optimizations
- local: training/unconditional_training
title: Unconditional image generation
- local: training/text2image
title: Text-to-image
- local: training/sdxl
title: Stable Diffusion XL
- local: training/kandinsky
title: Kandinsky 2.2
- local: training/wuerstchen
title: Wuerstchen
- local: training/controlnet
title: ControlNet
- local: training/t2i_adapters
title: T2I-Adapters
- local: training/instructpix2pix
title: InstructPix2Pix
title: Models
isExpanded: false
- sections:
- local: training/text_inversion
title: Textual Inversion
- local: training/dreambooth
title: DreamBooth
- local: training/lora
title: LoRA
- local: training/custom_diffusion
title: Custom Diffusion
- local: training/lcm_distill
title: Latent Consistency Distillation
- local: training/ddpo
title: Reinforcement learning training with DDPO
title: Methods
isExpanded: false
title: Training
- sections:
- local: optimization/fp16
title: Speed up inference
- local: using-diffusers/distilled_sd
title: Distilled Stable Diffusion inference
- local: optimization/memory
title: Reduce memory usage
- local: optimization/torch2.0
title: PyTorch 2.0
- local: optimization/xformers
title: xFormers
- local: optimization/tome
title: Token merging
- local: optimization/deepcache
title: DeepCache
- local: optimization/tgate
title: TGATE
- sections:
- local: using-diffusers/stable_diffusion_jax_how_to
title: JAX/Flax
@@ -180,14 +164,14 @@
title: OpenVINO
- local: optimization/coreml
title: Core ML
title: Optimized model types
title: Optimized model formats
- sections:
- local: optimization/mps
title: Metal Performance Shaders (MPS)
- local: optimization/habana
title: Habana Gaudi
title: Optimized hardware
title: Optimization
title: Accelerate inference and reduce memory
- sections:
- local: conceptual/philosophy
title: Philosophy
@@ -209,6 +193,7 @@
- local: api/outputs
title: Outputs
title: Main Classes
isExpanded: false
- sections:
- local: api/loaders/ip_adapter
title: IP-Adapter
@@ -223,6 +208,7 @@
- local: api/loaders/peft
title: PEFT
title: Loaders
isExpanded: false
- sections:
- local: api/models/overview
title: Overview
@@ -257,6 +243,7 @@
- local: api/models/controlnet
title: ControlNet
title: Models
isExpanded: false
- sections:
- local: api/pipelines/overview
title: Overview
@@ -280,6 +267,10 @@
title: ControlNet
- local: api/pipelines/controlnet_sdxl
title: ControlNet with Stable Diffusion XL
- local: api/pipelines/controlnetxs
title: ControlNet-XS
- local: api/pipelines/controlnetxs_sdxl
title: ControlNet-XS with Stable Diffusion XL
- local: api/pipelines/dance_diffusion
title: Dance Diffusion
- local: api/pipelines/ddim
@@ -358,7 +349,7 @@
- local: api/pipelines/stable_diffusion/ldm3d_diffusion
title: LDM3D Text-to-(RGB, Depth), Text-to-(RGB-pano, Depth-pano), LDM3D Upscaler
- local: api/pipelines/stable_diffusion/adapter
title: Stable Diffusion T2I-Adapter
title: T2I-Adapter
- local: api/pipelines/stable_diffusion/gligen
title: GLIGEN (Grounded Language-to-Image Generation)
title: Stable Diffusion
@@ -377,6 +368,7 @@
- local: api/pipelines/wuerstchen
title: Wuerstchen
title: Pipelines
isExpanded: false
- sections:
- local: api/schedulers/overview
title: Overview
@@ -400,14 +392,14 @@
title: DPMSolverSDEScheduler
- local: api/schedulers/singlestep_dpm_solver
title: DPMSolverSinglestepScheduler
- local: api/schedulers/edm_multistep_dpm_solver
title: EDMDPMSolverMultistepScheduler
- local: api/schedulers/edm_euler
title: EDMEulerScheduler
- local: api/schedulers/euler_ancestral
title: EulerAncestralDiscreteScheduler
- local: api/schedulers/euler
title: EulerDiscreteScheduler
- local: api/schedulers/edm_euler
title: EDMEulerScheduler
- local: api/schedulers/edm_multistep_dpm_solver
title: EDMDPMSolverMultistepScheduler
- local: api/schedulers/heun
title: HeunDiscreteScheduler
- local: api/schedulers/ipndm
@@ -437,6 +429,7 @@
- local: api/schedulers/vq_diffusion
title: VQDiffusionScheduler
title: Schedulers
isExpanded: false
- sections:
- local: api/internal_classes_overview
title: Overview
@@ -451,4 +444,5 @@
- local: api/image_processor
title: VAE Image Processor
title: Internal classes
isExpanded: false
title: API
@@ -408,6 +408,29 @@ Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers)
|
||||
|
||||
</Tip>
|
||||
|
||||
<table>
|
||||
<tr>
|
||||
<th align=center>Without FreeInit enabled</th>
|
||||
<th align=center>With FreeInit enabled</th>
|
||||
</tr>
|
||||
<tr>
|
||||
<td align=center>
|
||||
panda playing a guitar
|
||||
<br />
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-no-freeinit.gif"
|
||||
alt="panda playing a guitar"
|
||||
style="width: 300px;" />
|
||||
</td>
|
||||
<td align=center>
|
||||
panda playing a guitar
|
||||
<br/>
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-freeinit.gif"
|
||||
alt="panda playing a guitar"
|
||||
style="width: 300px;" />
|
||||
</td>
|
||||
</tr>
|
||||
</table>
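
If you're following along locally, FreeInit is toggled directly on the pipeline. The snippet below is a minimal sketch: it assumes an [`AnimateDiffPipeline`] instance named `pipe` as in the rest of this guide, and the filter settings shown are common choices rather than required values.

```py
# enable FreeInit; the butterworth filter and fast sampling keep the extra cost modest
pipe.enable_free_init(method="butterworth", use_fast_sampling=True)

frames = pipe(prompt="panda playing a guitar", num_frames=16).frames[0]

# disable it again when you no longer need it
pipe.disable_free_init()
```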
|
||||
|
||||
## Using AnimateLCM
|
||||
|
||||
[AnimateLCM](https://animatelcm.github.io/) is a motion module checkpoint and an [LCM LoRA](https://huggingface.co/docs/diffusers/using-diffusers/inference_with_lcm_lora) that have been created using a consistency learning strategy that decouples the distillation of the image generation priors and the motion generation priors.
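
A minimal sketch of wiring the two pieces together with [`AnimateDiffPipeline`] is shown below. The repository ids are the ones published by the AnimateLCM authors and the base model is only an example, so treat them as assumptions and swap in the weights you actually want to use.

```py
import torch
from diffusers import AnimateDiffPipeline, LCMScheduler, MotionAdapter
from diffusers.utils import export_to_gif

# 1. the AnimateLCM motion module checkpoint
adapter = MotionAdapter.from_pretrained("wangfuyun/AnimateLCM", torch_dtype=torch.float16)

# 2. a Stable Diffusion v1.5 base model plus the AnimateLCM LCM LoRA
pipe = AnimateDiffPipeline.from_pretrained(
    "emilianJR/epiCRealism", motion_adapter=adapter, torch_dtype=torch.float16
)
pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config, beta_schedule="linear")
pipe.load_lora_weights(
    "wangfuyun/AnimateLCM", weight_name="AnimateLCM_sd15_t2v_lora.safetensors", adapter_name="lcm-lora"
)
pipe.set_adapters(["lcm-lora"], [0.8])

pipe.enable_vae_slicing()
pipe.enable_model_cpu_offload()

# LCM-style sampling only needs a few steps and a low guidance scale
output = pipe(
    prompt="A space rocket launching into orbit, cinematic, high quality",
    num_frames=16,
    guidance_scale=2.0,
    num_inference_steps=6,
    generator=torch.Generator("cpu").manual_seed(0),
)
export_to_gif(output.frames[0], "animatelcm.gif")
```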
|
||||
|
||||
@@ -20,7 +20,8 @@ The abstract of the paper is the following:
|
||||
|
||||
*Although audio generation shares commonalities across different types of audio, such as speech, music, and sound effects, designing models for each type requires careful consideration of specific objectives and biases that can significantly differ from those of other types. To bring us closer to a unified perspective of audio generation, this paper proposes a framework that utilizes the same learning method for speech, music, and sound effect generation. Our framework introduces a general representation of audio, called "language of audio" (LOA). Any audio can be translated into LOA based on AudioMAE, a self-supervised pre-trained representation learning model. In the generation process, we translate any modalities into LOA by using a GPT-2 model, and we perform self-supervised audio generation learning with a latent diffusion model conditioned on LOA. The proposed framework naturally brings advantages such as in-context learning abilities and reusable self-supervised pretrained AudioMAE and latent diffusion models. Experiments on the major benchmarks of text-to-audio, text-to-music, and text-to-speech demonstrate state-of-the-art or competitive performance against previous approaches. Our code, pretrained model, and demo are available at [this https URL](https://audioldm.github.io/audioldm2).*
|
||||
|
||||
This pipeline was contributed by [sanchit-gandhi](https://huggingface.co/sanchit-gandhi). The original codebase can be found at [haoheliu/audioldm2](https://github.com/haoheliu/audioldm2).
|
||||
This pipeline was contributed by [sanchit-gandhi](https://huggingface.co/sanchit-gandhi) and [Nguyễn Công Tú Anh](https://github.com/tuanh123789). The original codebase can be
|
||||
found at [haoheliu/audioldm2](https://github.com/haoheliu/audioldm2).
|
||||
|
||||
## Tips
|
||||
|
||||
@@ -36,6 +37,8 @@ See table below for details on the three checkpoints:
|
||||
| [audioldm2](https://huggingface.co/cvssp/audioldm2) | Text-to-audio | 350M | 1.1B | 1150k |
|
||||
| [audioldm2-large](https://huggingface.co/cvssp/audioldm2-large) | Text-to-audio | 750M | 1.5B | 1150k |
|
||||
| [audioldm2-music](https://huggingface.co/cvssp/audioldm2-music) | Text-to-music | 350M | 1.1B | 665k |
|
||||
| [audioldm2-gigaspeech](https://huggingface.co/anhnct/audioldm2_gigaspeech) | Text-to-speech | 350M | 1.1B | 10k |
|
||||
| [audioldm2-ljspeech](https://huggingface.co/anhnct/audioldm2_ljspeech) | Text-to-speech | 350M | 1.1B | |
|
||||
|
||||
### Constructing a prompt
|
||||
|
||||
@@ -53,7 +56,7 @@ See table below for details on the three checkpoints:
|
||||
* The quality of the generated waveforms can vary significantly based on the seed. Try generating with different seeds until you find a satisfactory generation.
|
||||
* Multiple waveforms can be generated in one go: set `num_waveforms_per_prompt` to a value greater than 1. Automatic scoring will be performed between the generated waveforms and prompt text, and the audios ranked from best to worst accordingly.
|
||||
|
||||
The following example demonstrates how to construct good music generation using the aforementioned tips: [example](https://huggingface.co/docs/diffusers/main/en/api/pipelines/audioldm2#diffusers.AudioLDM2Pipeline.__call__.example).
|
||||
The following example demonstrates how to construct good music and speech generation using the aforementioned tips: [example](https://huggingface.co/docs/diffusers/main/en/api/pipelines/audioldm2#diffusers.AudioLDM2Pipeline.__call__.example).
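
As a compact sketch of the tips above (the checkpoint comes from the table earlier on this page, while the prompt and seed are arbitrary), you could generate several candidate waveforms and keep the highest-scoring one:

```py
import scipy
import torch
from diffusers import AudioLDM2Pipeline

pipe = AudioLDM2Pipeline.from_pretrained("cvssp/audioldm2", torch_dtype=torch.float16).to("cuda")

prompt = "The sound of a hammer hitting a wooden surface"
negative_prompt = "Low quality"

# generate 3 candidate waveforms; they are returned ranked best-first by the automatic scoring
audio = pipe(
    prompt,
    negative_prompt=negative_prompt,
    num_inference_steps=200,
    audio_length_in_s=10.0,
    num_waveforms_per_prompt=3,
    generator=torch.Generator("cuda").manual_seed(0),
).audios[0]

# AudioLDM2 generates audio at a 16 kHz sampling rate
scipy.io.wavfile.write("hammer.wav", rate=16000, data=audio)
```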
|
||||
|
||||
<Tip>
|
||||
|
||||
|
||||
@@ -12,42 +12,10 @@ specific language governing permissions and limitations under the License.
|
||||
|
||||
# AutoPipeline
|
||||
|
||||
`AutoPipeline` is designed to:
|
||||
|
||||
1. make it easy for you to load a checkpoint for a task without knowing the specific pipeline class to use
|
||||
2. use multiple pipelines in your workflow
|
||||
|
||||
Based on the task, the `AutoPipeline` class automatically retrieves the relevant pipeline given the name or path to the pretrained weights with the `from_pretrained()` method.
|
||||
|
||||
To seamlessly switch between tasks with the same checkpoint without reallocating additional memory, use the `from_pipe()` method to transfer the components from the original pipeline to the new one.
|
||||
|
||||
```py
|
||||
from diffusers import AutoPipelineForText2Image
|
||||
import torch
|
||||
|
||||
pipeline = AutoPipelineForText2Image.from_pretrained(
|
||||
"runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True
|
||||
).to("cuda")
|
||||
prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
|
||||
|
||||
image = pipeline(prompt, num_inference_steps=25).images[0]
|
||||
```
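
Continuing from the snippet above, a rough sketch of switching the same checkpoint to image-to-image without allocating additional memory looks like this:

```py
from diffusers import AutoPipelineForImage2Image

# reuse the already-loaded components; nothing new is downloaded or allocated
pipeline_img2img = AutoPipelineForImage2Image.from_pipe(pipeline)

image = pipeline_img2img(prompt, image=image, strength=0.6).images[0]
```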
|
||||
|
||||
<Tip>
|
||||
|
||||
Check out the [AutoPipeline](../../tutorials/autopipeline) tutorial to learn how to use this API!
|
||||
|
||||
</Tip>
|
||||
|
||||
`AutoPipeline` supports text-to-image, image-to-image, and inpainting for the following diffusion models:
|
||||
|
||||
- [Stable Diffusion](./stable_diffusion/overview)
|
||||
- [ControlNet](./controlnet)
|
||||
- [Stable Diffusion XL (SDXL)](./stable_diffusion/stable_diffusion_xl)
|
||||
- [DeepFloyd IF](./deepfloyd_if)
|
||||
- [Kandinsky 2.1](./kandinsky)
|
||||
- [Kandinsky 2.2](./kandinsky_v22)
|
||||
The `AutoPipeline` is designed to make it easy to load a checkpoint for a task without needing to know the specific pipeline class. Based on the task, the `AutoPipeline` automatically retrieves the correct pipeline class from the checkpoint `model_index.json` file.
|
||||
|
||||
> [!TIP]
|
||||
> Check out the [AutoPipeline](../../tutorials/autopipeline) tutorial to learn how to use this API!
|
||||
|
||||
## AutoPipelineForText2Image
|
||||
|
||||
|
||||
@@ -1,3 +1,15 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# ControlNet-XS
|
||||
|
||||
ControlNet-XS was introduced in [ControlNet-XS](https://vislearn.github.io/ControlNet-XS/) by Denis Zavadski and Carsten Rother. It is based on the observation that the control model in the [original ControlNet](https://huggingface.co/papers/2302.05543) can be made much smaller and still produce good results.
|
||||
@@ -12,5 +24,16 @@ Here's the overview from the [project page](https://vislearn.github.io/ControlNe
|
||||
|
||||
This model was contributed by [UmerHA](https://twitter.com/UmerHAdil). ❤️
|
||||
|
||||
<Tip>
|
||||
|
||||
> 🧠 Make sure to check out the Schedulers [guide](https://huggingface.co/docs/diffusers/main/en/using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](https://huggingface.co/docs/diffusers/main/en/using-diffusers/loading#reuse-components-across-pipelines) section to learn how to efficiently load the same components into multiple pipelines.
|
||||
Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-components-across-pipelines) section to learn how to efficiently load the same components into multiple pipelines.
|
||||
|
||||
</Tip>
|
||||
|
||||
## StableDiffusionControlNetXSPipeline
|
||||
[[autodoc]] StableDiffusionControlNetXSPipeline
|
||||
- all
|
||||
- __call__
|
||||
|
||||
## StableDiffusionPipelineOutput
|
||||
[[autodoc]] pipelines.stable_diffusion.StableDiffusionPipelineOutput
|
||||
@@ -1,3 +1,15 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# ControlNet-XS with Stable Diffusion XL
|
||||
|
||||
ControlNet-XS was introduced in [ControlNet-XS](https://vislearn.github.io/ControlNet-XS/) by Denis Zavadski and Carsten Rother. It is based on the observation that the control model in the [original ControlNet](https://huggingface.co/papers/2302.05543) can be made much smaller and still produce good results.
|
||||
@@ -12,4 +24,22 @@ Here's the overview from the [project page](https://vislearn.github.io/ControlNe
|
||||
|
||||
This model was contributed by [UmerHA](https://twitter.com/UmerHAdil). ❤️
|
||||
|
||||
> 🧠 Make sure to check out the Schedulers [guide](https://huggingface.co/docs/diffusers/main/en/using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](https://huggingface.co/docs/diffusers/main/en/using-diffusers/loading#reuse-components-across-pipelines) section to learn how to efficiently load the same components into multiple pipelines.
|
||||
<Tip warning={true}>
|
||||
|
||||
🧪 Many of the SDXL ControlNet checkpoints are experimental, and there is a lot of room for improvement. Feel free to open an [Issue](https://github.com/huggingface/diffusers/issues/new/choose) and leave us feedback on how we can improve!
|
||||
|
||||
</Tip>
|
||||
|
||||
<Tip>
|
||||
|
||||
Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-components-across-pipelines) section to learn how to efficiently load the same components into multiple pipelines.
|
||||
|
||||
</Tip>
|
||||
|
||||
## StableDiffusionXLControlNetXSPipeline
|
||||
[[autodoc]] StableDiffusionXLControlNetXSPipeline
|
||||
- all
|
||||
- __call__
|
||||
|
||||
## StableDiffusionPipelineOutput
|
||||
[[autodoc]] pipelines.stable_diffusion.StableDiffusionPipelineOutput
|
||||
@@ -97,6 +97,11 @@ The table below lists all the pipelines currently available in 🤗 Diffusers an
|
||||
- to
|
||||
- components
|
||||
|
||||
|
||||
[[autodoc]] pipelines.StableDiffusionMixin.enable_freeu
|
||||
|
||||
[[autodoc]] pipelines.StableDiffusionMixin.disable_freeu
|
||||
|
||||
## FlaxDiffusionPipeline
|
||||
|
||||
[[autodoc]] pipelines.pipeline_flax_utils.FlaxDiffusionPipeline
|
||||
|
||||
@@ -10,9 +10,7 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Text-to-Image Generation with Adapter Conditioning
|
||||
|
||||
## Overview
|
||||
# T2I-Adapter
|
||||
|
||||
[T2I-Adapter: Learning Adapters to Dig out More Controllable Ability for Text-to-Image Diffusion Models](https://arxiv.org/abs/2302.08453) by Chong Mou, Xintao Wang, Liangbin Xie, Jian Zhang, Zhongang Qi, Ying Shan, Xiaohu Qie.
|
||||
|
||||
@@ -24,236 +22,26 @@ The abstract of the paper is the following:
|
||||
|
||||
This model was contributed by the community contributor [HimariO](https://github.com/HimariO) ❤️ .
|
||||
|
||||
## Available Pipelines:
|
||||
|
||||
| Pipeline | Tasks | Demo
|
||||
|---|---|:---:|
|
||||
| [StableDiffusionAdapterPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py) | *Text-to-Image Generation with T2I-Adapter Conditioning* | -
|
||||
| [StableDiffusionXLAdapterPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py) | *Text-to-Image Generation with T2I-Adapter Conditioning on StableDiffusion-XL* | -
|
||||
|
||||
## Usage example with the base model of StableDiffusion-1.4/1.5
|
||||
|
||||
In the following we give a simple example of how to use a *T2I-Adapter* checkpoint with Diffusers for inference based on StableDiffusion-1.4/1.5.
|
||||
All adapters use the same pipeline.
|
||||
|
||||
1. Images are first converted into the appropriate *control image* format.
|
||||
2. The *control image* and *prompt* are passed to the [`StableDiffusionAdapterPipeline`].
|
||||
|
||||
Let's have a look at a simple example using the [Color Adapter](https://huggingface.co/TencentARC/t2iadapter_color_sd14v1).
|
||||
|
||||
```python
|
||||
from diffusers.utils import load_image, make_image_grid
|
||||
|
||||
image = load_image("https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/color_ref.png")
|
||||
```
|
||||
|
||||

|
||||
|
||||
|
||||
Then we can create our color palette by simply resizing it to 8 by 8 pixels and then scaling it back to original size.
|
||||
|
||||
```python
|
||||
from PIL import Image
|
||||
|
||||
color_palette = image.resize((8, 8))
|
||||
color_palette = color_palette.resize((512, 512), resample=Image.Resampling.NEAREST)
|
||||
```
|
||||
|
||||
Let's take a look at the processed image.
|
||||
|
||||

|
||||
|
||||
|
||||
Next, create the adapter pipeline
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import StableDiffusionAdapterPipeline, T2IAdapter
|
||||
|
||||
adapter = T2IAdapter.from_pretrained("TencentARC/t2iadapter_color_sd14v1", torch_dtype=torch.float16)
|
||||
pipe = StableDiffusionAdapterPipeline.from_pretrained(
|
||||
"CompVis/stable-diffusion-v1-4",
|
||||
adapter=adapter,
|
||||
torch_dtype=torch.float16,
|
||||
)
|
||||
pipe.to("cuda")
|
||||
```
|
||||
|
||||
Finally, pass the prompt and control image to the pipeline
|
||||
|
||||
```py
|
||||
# fix the random seed, so you will get the same result as the example
|
||||
generator = torch.Generator("cuda").manual_seed(7)
|
||||
|
||||
out_image = pipe(
|
||||
"At night, glowing cubes in front of the beach",
|
||||
image=color_palette,
|
||||
generator=generator,
|
||||
).images[0]
|
||||
make_image_grid([image, color_palette, out_image], rows=1, cols=3)
|
||||
```
|
||||
|
||||

|
||||
|
||||
## Usage example with the base model of StableDiffusion-XL
|
||||
|
||||
In the following we give a simple example of how to use a *T2I-Adapter* checkpoint with Diffusers for inference based on StableDiffusion-XL.
|
||||
All adapters use the same pipeline.
|
||||
|
||||
1. Images are first converted into the appropriate *control image* format.
|
||||
2. The *control image* and *prompt* are passed to the [`StableDiffusionXLAdapterPipeline`].
|
||||
|
||||
Let's have a look at a simple example using the [Sketch Adapter](https://huggingface.co/Adapter/t2iadapter/tree/main/sketch_sdxl_1.0).
|
||||
|
||||
```python
|
||||
from diffusers.utils import load_image, make_image_grid
|
||||
|
||||
sketch_image = load_image("https://huggingface.co/Adapter/t2iadapter/resolve/main/sketch.png").convert("L")
|
||||
```
|
||||
|
||||

|
||||
|
||||
Then, create the adapter pipeline
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import (
|
||||
T2IAdapter,
|
||||
StableDiffusionXLAdapterPipeline,
|
||||
DDPMScheduler
|
||||
)
|
||||
|
||||
model_id = "stabilityai/stable-diffusion-xl-base-1.0"
|
||||
adapter = T2IAdapter.from_pretrained("Adapter/t2iadapter", subfolder="sketch_sdxl_1.0", torch_dtype=torch.float16, adapter_type="full_adapter_xl")
|
||||
scheduler = DDPMScheduler.from_pretrained(model_id, subfolder="scheduler")
|
||||
|
||||
pipe = StableDiffusionXLAdapterPipeline.from_pretrained(
|
||||
model_id, adapter=adapter, safety_checker=None, torch_dtype=torch.float16, variant="fp16", scheduler=scheduler
|
||||
)
|
||||
|
||||
pipe.to("cuda")
|
||||
```
|
||||
|
||||
Finally, pass the prompt and control image to the pipeline
|
||||
|
||||
```py
|
||||
# fix the random seed, so you will get the same result as the example
|
||||
generator = torch.Generator().manual_seed(42)
|
||||
|
||||
sketch_image_out = pipe(
|
||||
prompt="a photo of a dog in real world, high quality",
|
||||
negative_prompt="extra digit, fewer digits, cropped, worst quality, low quality",
|
||||
image=sketch_image,
|
||||
generator=generator,
|
||||
guidance_scale=7.5
|
||||
).images[0]
|
||||
make_image_grid([sketch_image, sketch_image_out], rows=1, cols=2)
|
||||
```
|
||||
|
||||

|
||||
|
||||
## Available checkpoints
|
||||
|
||||
Non-diffusers checkpoints can be found under [TencentARC/T2I-Adapter](https://huggingface.co/TencentARC/T2I-Adapter/tree/main/models).
|
||||
|
||||
### T2I-Adapter with Stable Diffusion 1.4
|
||||
|
||||
| Model Name | Control Image Overview| Control Image Example | Generated Image Example |
|
||||
|---|---|---|---|
|
||||
|[TencentARC/t2iadapter_color_sd14v1](https://huggingface.co/TencentARC/t2iadapter_color_sd14v1)<br/> *Trained with spatial color palette* | An image with 8x8 color palette.|<a href="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/color_sample_input.png"><img width="64" style="margin:0;padding:0;" src="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/color_sample_input.png"/></a>|<a href="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/color_sample_output.png"><img width="64" src="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/color_sample_output.png"/></a>|
|
||||
|[TencentARC/t2iadapter_canny_sd14v1](https://huggingface.co/TencentARC/t2iadapter_canny_sd14v1)<br/> *Trained with canny edge detection* | A monochrome image with white edges on a black background.|<a href="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/canny_sample_input.png"><img width="64" style="margin:0;padding:0;" src="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/canny_sample_input.png"/></a>|<a href="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/canny_sample_output.png"><img width="64" src="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/canny_sample_output.png"/></a>|
|
||||
|[TencentARC/t2iadapter_sketch_sd14v1](https://huggingface.co/TencentARC/t2iadapter_sketch_sd14v1)<br/> *Trained with [PidiNet](https://github.com/zhuoinoulu/pidinet) edge detection* | A hand-drawn monochrome image with white outlines on a black background.|<a href="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/sketch_sample_input.png"><img width="64" style="margin:0;padding:0;" src="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/sketch_sample_input.png"/></a>|<a href="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/sketch_sample_output.png"><img width="64" src="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/sketch_sample_output.png"/></a>|
|
||||
|[TencentARC/t2iadapter_depth_sd14v1](https://huggingface.co/TencentARC/t2iadapter_depth_sd14v1)<br/> *Trained with Midas depth estimation* | A grayscale image with black representing deep areas and white representing shallow areas.|<a href="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/depth_sample_input.png"><img width="64" src="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/depth_sample_input.png"/></a>|<a href="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/depth_sample_output.png"><img width="64" src="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/depth_sample_output.png"/></a>|
|
||||
|[TencentARC/t2iadapter_openpose_sd14v1](https://huggingface.co/TencentARC/t2iadapter_openpose_sd14v1)<br/> *Trained with OpenPose bone image* | A [OpenPose bone](https://github.com/CMU-Perceptual-Computing-Lab/openpose) image.|<a href="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/openpose_sample_input.png"><img width="64" src="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/openpose_sample_input.png"/></a>|<a href="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/openpose_sample_output.png"><img width="64" src="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/openpose_sample_output.png"/></a>|
|
||||
|[TencentARC/t2iadapter_keypose_sd14v1](https://huggingface.co/TencentARC/t2iadapter_keypose_sd14v1)<br/> *Trained with mmpose skeleton image* | A [mmpose skeleton](https://github.com/open-mmlab/mmpose) image.|<a href="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/keypose_sample_input.png"><img width="64" src="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/keypose_sample_input.png"/></a>|<a href="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/keypose_sample_output.png"><img width="64" src="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/keypose_sample_output.png"/></a>|
|
||||
|[TencentARC/t2iadapter_seg_sd14v1](https://huggingface.co/TencentARC/t2iadapter_seg_sd14v1)<br/>*Trained with semantic segmentation* | A [custom](https://github.com/TencentARC/T2I-Adapter/discussions/25) segmentation protocol image.|<a href="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/seg_sample_input.png"><img width="64" src="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/seg_sample_input.png"/></a>|<a href="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/seg_sample_output.png"><img width="64" src="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/seg_sample_output.png"/></a> |
|
||||
|[TencentARC/t2iadapter_canny_sd15v2](https://huggingface.co/TencentARC/t2iadapter_canny_sd15v2)||
|
||||
|[TencentARC/t2iadapter_depth_sd15v2](https://huggingface.co/TencentARC/t2iadapter_depth_sd15v2)||
|
||||
|[TencentARC/t2iadapter_sketch_sd15v2](https://huggingface.co/TencentARC/t2iadapter_sketch_sd15v2)||
|
||||
|[TencentARC/t2iadapter_zoedepth_sd15v1](https://huggingface.co/TencentARC/t2iadapter_zoedepth_sd15v1)||
|
||||
|[Adapter/t2iadapter, subfolder='sketch_sdxl_1.0'](https://huggingface.co/Adapter/t2iadapter/tree/main/sketch_sdxl_1.0)||
|
||||
|[Adapter/t2iadapter, subfolder='canny_sdxl_1.0'](https://huggingface.co/Adapter/t2iadapter/tree/main/canny_sdxl_1.0)||
|
||||
|[Adapter/t2iadapter, subfolder='openpose_sdxl_1.0'](https://huggingface.co/Adapter/t2iadapter/tree/main/openpose_sdxl_1.0)||
|
||||
|
||||
## Combining multiple adapters
|
||||
|
||||
[`MultiAdapter`] can be used for applying multiple conditionings at once.
|
||||
|
||||
Here we use the keypose adapter for the character posture and the depth adapter for creating the scene.
|
||||
|
||||
```py
|
||||
from diffusers.utils import load_image, make_image_grid
|
||||
|
||||
cond_keypose = load_image(
|
||||
"https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/keypose_sample_input.png"
|
||||
)
|
||||
cond_depth = load_image(
|
||||
"https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/depth_sample_input.png"
|
||||
)
|
||||
cond = [cond_keypose, cond_depth]
|
||||
|
||||
prompt = ["A man walking in an office room with a nice view"]
|
||||
```
|
||||
|
||||
The two control images look as such:
|
||||
|
||||

|
||||

|
||||
|
||||
|
||||
`MultiAdapter` combines keypose and depth adapters.
|
||||
|
||||
`adapter_conditioning_scale` balances the relative influence of the different adapters.
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import StableDiffusionAdapterPipeline, MultiAdapter, T2IAdapter
|
||||
|
||||
adapters = MultiAdapter(
|
||||
[
|
||||
T2IAdapter.from_pretrained("TencentARC/t2iadapter_keypose_sd14v1"),
|
||||
T2IAdapter.from_pretrained("TencentARC/t2iadapter_depth_sd14v1"),
|
||||
]
|
||||
)
|
||||
adapters = adapters.to(torch.float16)
|
||||
|
||||
pipe = StableDiffusionAdapterPipeline.from_pretrained(
|
||||
"CompVis/stable-diffusion-v1-4",
|
||||
torch_dtype=torch.float16,
|
||||
adapter=adapters,
|
||||
).to("cuda")
|
||||
|
||||
image = pipe(prompt, cond, adapter_conditioning_scale=[0.8, 0.8]).images[0]
|
||||
make_image_grid([cond_keypose, cond_depth, image], rows=1, cols=3)
|
||||
```
|
||||
|
||||

|
||||
|
||||
|
||||
## T2I-Adapter vs ControlNet
|
||||
|
||||
T2I-Adapter is similar to [ControlNet](https://huggingface.co/docs/diffusers/main/en/api/pipelines/controlnet).
|
||||
T2I-Adapter uses a smaller auxiliary network which is only run once for the entire diffusion process.
|
||||
However, T2I-Adapter performs slightly worse than ControlNet.
|
||||
|
||||
## StableDiffusionAdapterPipeline
|
||||
|
||||
[[autodoc]] StableDiffusionAdapterPipeline
|
||||
- all
|
||||
- __call__
|
||||
- enable_attention_slicing
|
||||
- disable_attention_slicing
|
||||
- enable_vae_slicing
|
||||
- disable_vae_slicing
|
||||
- enable_xformers_memory_efficient_attention
|
||||
- disable_xformers_memory_efficient_attention
|
||||
- all
|
||||
- __call__
|
||||
- enable_attention_slicing
|
||||
- disable_attention_slicing
|
||||
- enable_vae_slicing
|
||||
- disable_vae_slicing
|
||||
- enable_xformers_memory_efficient_attention
|
||||
- disable_xformers_memory_efficient_attention
|
||||
|
||||
## StableDiffusionXLAdapterPipeline
|
||||
|
||||
[[autodoc]] StableDiffusionXLAdapterPipeline
|
||||
- all
|
||||
- __call__
|
||||
- enable_attention_slicing
|
||||
- disable_attention_slicing
|
||||
- enable_vae_slicing
|
||||
- disable_vae_slicing
|
||||
- enable_xformers_memory_efficient_attention
|
||||
- disable_xformers_memory_efficient_attention
|
||||
- all
|
||||
- __call__
|
||||
- enable_attention_slicing
|
||||
- disable_attention_slicing
|
||||
- enable_vae_slicing
|
||||
- disable_vae_slicing
|
||||
- enable_xformers_memory_efficient_attention
|
||||
- disable_xformers_memory_efficient_attention
|
||||
|
||||
@@ -172,3 +172,41 @@ inpaint = StableDiffusionInpaintPipeline(**text2img.components)
|
||||
|
||||
# now you can use text2img(...), img2img(...), inpaint(...) just like the call methods of each respective pipeline
|
||||
```
|
||||
|
||||
### Create web demos using `gradio`
|
||||
|
||||
The Stable Diffusion pipelines are automatically supported in [Gradio](https://github.com/gradio-app/gradio/), a library that makes creating beautiful and user-friendly machine learning apps on the web a breeze. First, make sure you have Gradio installed:
|
||||
|
||||
```
|
||||
pip install -U gradio
|
||||
```
|
||||
|
||||
Then, create a web demo around any Stable Diffusion-based pipeline. For example, you can create an image generation pipeline in a single line of code with Gradio's [`Interface.from_pipeline`](https://www.gradio.app/docs/interface#interface-from-pipeline) function:
|
||||
|
||||
```py
|
||||
from diffusers import StableDiffusionPipeline
|
||||
import gradio as gr
|
||||
|
||||
pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
|
||||
|
||||
gr.Interface.from_pipeline(pipe).launch()
|
||||
```
|
||||
|
||||
which opens an intuitive drag-and-drop interface in your browser:
|
||||
|
||||

|
||||
|
||||
Similarly, you could create a demo for an image-to-image pipeline with:
|
||||
|
||||
```py
|
||||
from diffusers import StableDiffusionImg2ImgPipeline
|
||||
import gradio as gr
|
||||
|
||||
|
||||
pipe = StableDiffusionImg2ImgPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
|
||||
|
||||
gr.Interface.from_pipeline(pipe).launch()
|
||||
```
|
||||
|
||||
By default, the web demo runs on a local server. If you'd like to share it with others, you can generate a temporary public
|
||||
link by setting `share=True` in `launch()`. Or, you can host your demo on [Hugging Face Spaces](https://huggingface.co/spaces) for a permanent link.
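
For example, a minimal sketch of sharing the image-to-image demo above through a temporary public link:

```py
import gradio as gr
from diffusers import StableDiffusionImg2ImgPipeline

pipe = StableDiffusionImg2ImgPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")

# share=True creates a temporary public *.gradio.live URL for the demo
gr.Interface.from_pipeline(pipe).launch(share=True)
```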
|
||||
@@ -37,3 +37,7 @@ Utility and helper functions for working with 🤗 Diffusers.
|
||||
## make_image_grid
|
||||
|
||||
[[autodoc]] utils.make_image_grid
|
||||
|
||||
## randn_tensor
|
||||
|
||||
[[autodoc]] utils.torch_utils.randn_tensor
|
||||
|
||||
@@ -198,38 +198,81 @@ Anything displayed on [the official Diffusers doc page](https://huggingface.co/d
|
||||
|
||||
Please have a look at [this page](https://github.com/huggingface/diffusers/tree/main/docs) on how to verify changes made to the documentation locally.
|
||||
|
||||
|
||||
### 6. Contribute a community pipeline
|
||||
|
||||
[Pipelines](https://huggingface.co/docs/diffusers/api/pipelines/overview) are usually the first point of contact between the Diffusers library and the user.
|
||||
Pipelines are examples of how to use Diffusers [models](https://huggingface.co/docs/diffusers/api/models/overview) and [schedulers](https://huggingface.co/docs/diffusers/api/schedulers/overview).
|
||||
We support two types of pipelines:
|
||||
> [!TIP]
|
||||
> Read the [Community pipelines](../using-diffusers/custom_pipeline_overview#community-pipelines) guide to learn more about the difference between a GitHub and Hugging Face Hub community pipeline. If you're interested in why we have community pipelines, take a look at GitHub Issue [#841](https://github.com/huggingface/diffusers/issues/841) (basically, we can't maintain all the possible ways diffusion models can be used for inference but we also don't want to prevent the community from building them).
|
||||
|
||||
- Official Pipelines
|
||||
- Community Pipelines
|
||||
Contributing a community pipeline is a great way to share your creativity and work with the community. It lets you build on top of the [`DiffusionPipeline`] so that anyone can load and use it by setting the `custom_pipeline` parameter. This section will walk you through how to create a simple pipeline where the UNet only does a single forward pass and calls the scheduler once (a "one-step" pipeline).
|
||||
|
||||
Both official and community pipelines follow the same design and consist of the same type of components.
|
||||
1. Create a one_step_unet.py file for your community pipeline. This file can use any packages you want as long as they're installed by the user. Make sure you only have one pipeline class that inherits from [`DiffusionPipeline`] to load model weights and the scheduler configuration from the Hub. Add a UNet and scheduler to the `__init__` function.
|
||||
|
||||
Official pipelines are tested and maintained by the core maintainers of Diffusers. Their code
|
||||
resides in [src/diffusers/pipelines](https://github.com/huggingface/diffusers/tree/main/src/diffusers/pipelines).
|
||||
In contrast, community pipelines are contributed and maintained purely by the **community** and are **not** tested.
|
||||
They reside in [examples/community](https://github.com/huggingface/diffusers/tree/main/examples/community) and while they can be accessed via the [PyPI diffusers package](https://pypi.org/project/diffusers/), their code is not part of the PyPI distribution.
|
||||
You should also add the `register_modules` function to ensure your pipeline and its components can be saved with [`~DiffusionPipeline.save_pretrained`].
|
||||
|
||||
The reason for the distinction is that the core maintainers of the Diffusers library cannot maintain and test all
|
||||
possible ways diffusion models can be used for inference, but some of them may be of interest to the community.
|
||||
Officially released diffusion pipelines,
|
||||
such as Stable Diffusion are added to the core src/diffusers/pipelines package which ensures
|
||||
high quality of maintenance, no backward-breaking code changes, and testing.
|
||||
More bleeding edge pipelines should be added as community pipelines. If usage for a community pipeline is high, the pipeline can be moved to the official pipelines upon request from the community. This is one of the ways we strive to be a community-driven library.
|
||||
```py
|
||||
from diffusers import DiffusionPipeline
|
||||
import torch
|
||||
|
||||
To add a community pipeline, one should add a <name-of-the-community>.py file to [examples/community](https://github.com/huggingface/diffusers/tree/main/examples/community) and adapt the [examples/community/README.md](https://github.com/huggingface/diffusers/tree/main/examples/community/README.md) to include an example of the new pipeline.
|
||||
class UnetSchedulerOneForwardPipeline(DiffusionPipeline):
|
||||
def __init__(self, unet, scheduler):
|
||||
super().__init__()
|
||||
|
||||
An example can be seen [here](https://github.com/huggingface/diffusers/pull/2400).
|
||||
self.register_modules(unet=unet, scheduler=scheduler)
|
||||
```
|
||||
|
||||
Community pipeline PRs are only checked at a superficial level and ideally they should be maintained by their original authors.
|
||||
1. In the forward pass (which we recommend defining as `__call__`), you can add any feature you'd like. For the "one-step" pipeline, create a random image and call the UNet and scheduler once by setting `timestep=1`.
|
||||
|
||||
Contributing a community pipeline is a great way to understand how Diffusers models and schedulers work. Having contributed a community pipeline is usually the first stepping stone to contributing an official pipeline to the
|
||||
core package.
|
||||
```py
|
||||
from diffusers import DiffusionPipeline
|
||||
import torch
|
||||
|
||||
class UnetSchedulerOneForwardPipeline(DiffusionPipeline):
|
||||
def __init__(self, unet, scheduler):
|
||||
super().__init__()
|
||||
|
||||
self.register_modules(unet=unet, scheduler=scheduler)
|
||||
|
||||
def __call__(self):
|
||||
image = torch.randn(
|
||||
(1, self.unet.config.in_channels, self.unet.config.sample_size, self.unet.config.sample_size),
|
||||
)
|
||||
timestep = 1
|
||||
|
||||
model_output = self.unet(image, timestep).sample
|
||||
scheduler_output = self.scheduler.step(model_output, timestep, image).prev_sample
|
||||
|
||||
return scheduler_output
|
||||
```
|
||||
|
||||
Now you can run the pipeline by passing a UNet and scheduler to it or load pretrained weights if the pipeline structure is identical.
|
||||
|
||||
```py
|
||||
from diffusers import DDPMScheduler, UNet2DModel
|
||||
|
||||
scheduler = DDPMScheduler()
|
||||
unet = UNet2DModel()
|
||||
|
||||
pipeline = UnetSchedulerOneForwardPipeline(unet=unet, scheduler=scheduler)
|
||||
output = pipeline()
|
||||
# load pretrained weights
|
||||
pipeline = UnetSchedulerOneForwardPipeline.from_pretrained("google/ddpm-cifar10-32", use_safetensors=True)
|
||||
output = pipeline()
|
||||
```
|
||||
|
||||
You can either share your pipeline as a GitHub community pipeline or Hub community pipeline.
|
||||
|
||||
<hfoptions id="pipeline type">
|
||||
<hfoption id="GitHub pipeline">
|
||||
|
||||
Share your GitHub pipeline by opening a pull request on the Diffusers [repository](https://github.com/huggingface/diffusers) and add the one_step_unet.py file to the [examples/community](https://github.com/huggingface/diffusers/tree/main/examples/community) subfolder.
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="Hub pipeline">
|
||||
|
||||
Share your Hub pipeline by creating a model repository on the Hub and uploading the one_step_unet.py file to it.
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
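
Either way, users can then load your pipeline through the `custom_pipeline` argument. The sketch below assumes the GitHub variant, where the file name in examples/community doubles as the identifier; for a Hub pipeline you would pass the repository id instead.

```py
from diffusers import DiffusionPipeline

# GitHub community pipeline: referenced by its file name in examples/community
pipe = DiffusionPipeline.from_pretrained(
    "google/ddpm-cifar10-32", custom_pipeline="one_step_unet", use_safetensors=True
)

# Hub community pipeline: referenced by its repository id, for example
# pipe = DiffusionPipeline.from_pretrained("google/ddpm-cifar10-32", custom_pipeline="<your-username>/one_step_unet")

output = pipe()
```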
|
||||
|
||||
### 7. Contribute to training examples
|
||||
|
||||
|
||||
@@ -1,17 +0,0 @@
|
||||
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Overview
|
||||
|
||||
Generating high-quality outputs is computationally intensive, especially during each iterative step where you go from a noisy output to a less noisy output. One of 🤗 Diffusers' goals is to make this technology widely accessible to everyone, which includes enabling fast inference on consumer and specialized hardware.

This section will cover tips and tricks - like half-precision weights and sliced attention - for optimizing inference speed and reducing memory consumption. You'll also learn how to speed up your PyTorch code with [`torch.compile`](https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html) or [ONNX Runtime](https://onnxruntime.ai/docs/), and enable memory-efficient attention with [xFormers](https://facebookresearch.github.io/xformers/). There are also guides for running inference on specific hardware like Apple Silicon, and Intel or Habana processors.
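
As a quick taste of what these guides cover, here is a minimal sketch that combines a few of the techniques (half-precision weights, sliced attention, and `torch.compile`); the checkpoint is only an example.

```py
import torch
from diffusers import DiffusionPipeline

# half-precision weights roughly halve memory use on CUDA GPUs
pipeline = DiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True
).to("cuda")

# trade a little speed for a lower peak memory footprint
pipeline.enable_attention_slicing()

# optional on PyTorch 2.0+: compile the UNet for faster repeated inference
pipeline.unet = torch.compile(pipeline.unet, mode="reduce-overhead", fullgraph=True)

image = pipeline("a photo of an astronaut riding a horse on mars").images[0]
```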
|
||||
docs/source/en/optimization/tgate.md
@@ -0,0 +1,182 @@
|
||||
# T-GATE
|
||||
|
||||
[T-GATE](https://github.com/HaozheLiu-ST/T-GATE/tree/main) accelerates inference for [Stable Diffusion](../api/pipelines/stable_diffusion/overview), [PixArt](../api/pipelines/pixart), and [Latent Consistency Model](../api/pipelines/latent_consistency_models) pipelines by skipping the cross-attention calculation once it converges. This method doesn't require any additional training and it can speed up inference by 10-50%. T-GATE is also compatible with other optimization methods like [DeepCache](./deepcache).
|
||||
|
||||
Before you begin, make sure you install T-GATE.
|
||||
|
||||
```bash
|
||||
pip install tgate
|
||||
pip install -U torch diffusers transformers accelerate DeepCache
|
||||
```
|
||||
|
||||
|
||||
To use T-GATE with a pipeline, you need to use its corresponding loader.
|
||||
|
||||
| Pipeline | T-GATE Loader |
|
||||
|---|---|
|
||||
| PixArt | TgatePixArtLoader |
|
||||
| Stable Diffusion XL | TgateSDXLLoader |
|
||||
| Stable Diffusion XL + DeepCache | TgateSDXLDeepCacheLoader |
|
||||
| Stable Diffusion | TgateSDLoader |
|
||||
| Stable Diffusion + DeepCache | TgateSDDeepCacheLoader |
|
||||
|
||||
Next, create the T-GATE loader for your pipeline from the table above, passing it the pipeline, the gate step (the time step at which to stop calculating the cross-attention), and the number of inference steps. Then call the `tgate` method on the pipeline with a prompt, gate step, and the number of inference steps.
|
||||
|
||||
Let's see how to enable this for several different pipelines.
|
||||
|
||||
<hfoptions id="pipelines">
|
||||
<hfoption id="PixArt">
|
||||
|
||||
Accelerate `PixArtAlphaPipeline` with T-GATE:
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import PixArtAlphaPipeline
|
||||
from tgate import TgatePixArtLoader
|
||||
|
||||
pipe = PixArtAlphaPipeline.from_pretrained("PixArt-alpha/PixArt-XL-2-1024-MS", torch_dtype=torch.float16)
|
||||
|
||||
gate_step = 8
|
||||
inference_step = 25
|
||||
pipe = TgatePixArtLoader(
|
||||
pipe,
|
||||
gate_step=gate_step,
|
||||
num_inference_steps=inference_step,
|
||||
).to("cuda")
|
||||
|
||||
image = pipe.tgate(
|
||||
"An alpaca made of colorful building blocks, cyberpunk.",
|
||||
gate_step=gate_step,
|
||||
num_inference_steps=inference_step,
|
||||
).images[0]
|
||||
```
|
||||
</hfoption>
|
||||
<hfoption id="Stable Diffusion XL">
|
||||
|
||||
Accelerate `StableDiffusionXLPipeline` with T-GATE:
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import StableDiffusionXLPipeline
|
||||
from diffusers import DPMSolverMultistepScheduler
|
||||
from tgate import TgateSDXLLoader
|
||||
|
||||
pipe = StableDiffusionXLPipeline.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0",
|
||||
torch_dtype=torch.float16,
|
||||
variant="fp16",
|
||||
use_safetensors=True,
|
||||
)
|
||||
pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
|
||||
|
||||
gate_step = 10
|
||||
inference_step = 25
|
||||
pipe = TgateSDXLLoader(
|
||||
pipe,
|
||||
gate_step=gate_step,
|
||||
num_inference_steps=inference_step,
|
||||
).to("cuda")
|
||||
|
||||
image = pipe.tgate(
|
||||
"Astronaut in a jungle, cold color palette, muted colors, detailed, 8k.",
|
||||
gate_step=gate_step,
|
||||
num_inference_steps=inference_step
|
||||
).images[0]
|
||||
```
|
||||
</hfoption>
|
||||
<hfoption id="StableDiffusionXL with DeepCache">
|
||||
|
||||
Accelerate `StableDiffusionXLPipeline` with [DeepCache](https://github.com/horseee/DeepCache) and T-GATE:
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import StableDiffusionXLPipeline
|
||||
from diffusers import DPMSolverMultistepScheduler
|
||||
from tgate import TgateSDXLDeepCacheLoader
|
||||
|
||||
pipe = StableDiffusionXLPipeline.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0",
|
||||
torch_dtype=torch.float16,
|
||||
variant="fp16",
|
||||
use_safetensors=True,
|
||||
)
|
||||
pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
|
||||
|
||||
gate_step = 10
|
||||
inference_step = 25
|
||||
pipe = TgateSDXLDeepCacheLoader(
|
||||
pipe,
|
||||
cache_interval=3,
|
||||
cache_branch_id=0,
|
||||
).to("cuda")
|
||||
|
||||
image = pipe.tgate(
|
||||
"Astronaut in a jungle, cold color palette, muted colors, detailed, 8k.",
|
||||
gate_step=gate_step,
|
||||
num_inference_steps=inference_step
|
||||
).images[0]
|
||||
```
|
||||
</hfoption>
|
||||
<hfoption id="Latent Consistency Model">
|
||||
|
||||
Accelerate `latent-consistency/lcm-sdxl` with T-GATE:
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import StableDiffusionXLPipeline
|
||||
from diffusers import UNet2DConditionModel, LCMScheduler
|
||||
from tgate import TgateSDXLLoader
|
||||
|
||||
unet = UNet2DConditionModel.from_pretrained(
|
||||
"latent-consistency/lcm-sdxl",
|
||||
torch_dtype=torch.float16,
|
||||
variant="fp16",
|
||||
)
|
||||
pipe = StableDiffusionXLPipeline.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0",
|
||||
unet=unet,
|
||||
torch_dtype=torch.float16,
|
||||
variant="fp16",
|
||||
)
|
||||
pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)
|
||||
|
||||
gate_step = 1
|
||||
inference_step = 4
|
||||
pipe = TgateSDXLLoader(
|
||||
pipe,
|
||||
gate_step=gate_step,
|
||||
num_inference_steps=inference_step,
|
||||
lcm=True
|
||||
).to("cuda")
|
||||
|
||||
image = pipe.tgate(
|
||||
"Astronaut in a jungle, cold color palette, muted colors, detailed, 8k.",
|
||||
gate_step=gate_step,
|
||||
num_inference_steps=inference_step
|
||||
).images[0]
|
||||
```
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
T-GATE also supports [`StableDiffusionPipeline`] and [PixArt-alpha/PixArt-LCM-XL-2-1024-MS](https://hf.co/PixArt-alpha/PixArt-LCM-XL-2-1024-MS).
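
As a rough sketch for [`StableDiffusionPipeline`], assuming `TgateSDLoader` mirrors the signature of the SDXL loader shown above (check the T-GATE repository for the exact arguments):

```py
import torch
from diffusers import StableDiffusionPipeline
from tgate import TgateSDLoader  # assumed to mirror TgateSDXLLoader

pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
)

gate_step = 10
inference_step = 25
pipe = TgateSDLoader(
    pipe,
    gate_step=gate_step,
    num_inference_steps=inference_step,
).to("cuda")

image = pipe.tgate(
    "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k.",
    gate_step=gate_step,
    num_inference_steps=inference_step,
).images[0]
```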
|
||||
|
||||
## Benchmarks
|
||||
| Model | MACs | Param | Latency | Zero-shot 10K-FID on MS-COCO |
|
||||
|-----------------------|----------|-----------|---------|---------------------------|
|
||||
| SD-1.5 | 16.938T | 859.520M | 7.032s | 23.927 |
|
||||
| SD-1.5 w/ T-GATE | 9.875T | 815.557M | 4.313s | 20.789 |
|
||||
| SD-2.1 | 38.041T | 865.785M | 16.121s | 22.609 |
|
||||
| SD-2.1 w/ T-GATE | 22.208T | 815.433 M | 9.878s | 19.940 |
|
||||
| SD-XL | 149.438T | 2.570B | 53.187s | 24.628 |
|
||||
| SD-XL w/ T-GATE | 84.438T | 2.024B | 27.932s | 22.738 |
|
||||
| Pixart-Alpha | 107.031T | 611.350M | 61.502s | 38.669 |
|
||||
| Pixart-Alpha w/ T-GATE | 65.318T | 462.585M | 37.867s | 35.825 |
|
||||
| DeepCache (SD-XL) | 57.888T | - | 19.931s | 23.755 |
|
||||
| DeepCache w/ T-GATE | 43.868T | - | 14.666s | 23.999 |
|
||||
| LCM (SD-XL) | 11.955T | 2.570B | 3.805s | 25.044 |
|
||||
| LCM w/ T-GATE | 11.171T | 2.024B | 3.533s | 25.028 |
|
||||
| LCM (Pixart-Alpha) | 8.563T | 611.350M | 4.733s | 36.086 |
|
||||
| LCM w/ T-GATE | 7.623T | 462.585M | 4.543s | 37.048 |
|
||||
|
||||
Latency was tested on an NVIDIA GTX 1080 Ti; MACs and parameters are calculated with [calflops](https://github.com/MrYxJ/calculate-flops.pytorch), and the FID is calculated with [PytorchFID](https://github.com/mseitzer/pytorch-fid).
|
||||
@@ -49,7 +49,7 @@ One of the simplest ways to speed up inference is to place the pipeline on a GPU
|
||||
pipeline = pipeline.to("cuda")
|
||||
```
|
||||
|
||||
To make sure you can use the same image and improve on it, use a [`Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) and set a seed for [reproducibility](./using-diffusers/reproducibility):
|
||||
To make sure you can use the same image and improve on it, use a [`Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) and set a seed for [reproducibility](./using-diffusers/reusing_seeds):
|
||||
|
||||
```python
|
||||
import torch
|
||||
|
||||
@@ -88,7 +88,7 @@ accelerate config default
|
||||
|
||||
Or if your environment doesn't support an interactive shell, like a notebook, you can use:
|
||||
|
||||
```bash
|
||||
```py
|
||||
from accelerate.utils import write_basic_config
|
||||
|
||||
write_basic_config()
|
||||
|
||||
@@ -54,7 +54,7 @@ accelerate config default
|
||||
|
||||
Or if your environment doesn't support an interactive shell, like a notebook, you can use:
|
||||
|
||||
```bash
|
||||
```py
|
||||
from accelerate.utils import write_basic_config
|
||||
|
||||
write_basic_config()
|
||||
@@ -84,7 +84,7 @@ Many of the basic parameters are described in the [DreamBooth](dreambooth#script
|
||||
- `--freeze_model`: freezes the key and value parameters in the cross-attention layer; the default is `crossattn_kv`, but you can set it to `crossattn` to train all the parameters in the cross-attention layer
|
||||
- `--concepts_list`: to learn multiple concepts, provide a path to a JSON file containing the concepts
|
||||
- `--modifier_token`: a special word used to represent the learned concept
|
||||
- `--initializer_token`:
|
||||
- `--initializer_token`: a special word used to initialize the embeddings of the `modifier_token`
|
||||
|
||||
### Prior preservation loss
|
||||
|
||||
|
||||
@@ -52,6 +52,76 @@ To learn more, take a look at the [Distributed Inference with 🤗 Accelerate](h
|
||||
|
||||
</Tip>
|
||||
|
||||
### Device placement
|
||||
|
||||
> [!WARNING]
|
||||
> This feature is experimental and its APIs might change in the future.
|
||||
|
||||
With Accelerate, you can use the `device_map` to determine how to distribute the models of a pipeline across multiple devices. This is useful in situations where you have more than one GPU.
|
||||
|
||||
For example, if you have two 8GB GPUs, then using [`~DiffusionPipeline.enable_model_cpu_offload`] may not work so well because:
|
||||
|
||||
* it only works on a single GPU
|
||||
* a single model might not fit on a single GPU ([`~DiffusionPipeline.enable_sequential_cpu_offload`] might work but it will be extremely slow and it is also limited to a single GPU)
|
||||
|
||||
To make use of both GPUs, you can use the "balanced" device placement strategy which splits the models across all available GPUs.
|
||||
|
||||
> [!WARNING]
|
||||
> Only the "balanced" strategy is supported at the moment, and we plan to support additional mapping strategies in the future.
|
||||
|
||||
```diff
|
||||
from diffusers import DiffusionPipeline
|
||||
import torch
|
||||
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
- "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True,
|
||||
+ "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True, device_map="balanced"
|
||||
)
|
||||
image = pipeline("a dog").images[0]
|
||||
image
|
||||
```
|
||||
|
||||
You can also pass a dictionary to enforce the maximum GPU memory that can be used on each device:
|
||||
|
||||
```diff
|
||||
from diffusers import DiffusionPipeline
|
||||
import torch
|
||||
|
||||
max_memory = {0:"1GB", 1:"1GB"}
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
"runwayml/stable-diffusion-v1-5",
|
||||
torch_dtype=torch.float16,
|
||||
use_safetensors=True,
|
||||
device_map="balanced",
|
||||
+ max_memory=max_memory
|
||||
)
|
||||
image = pipeline("a dog").images[0]
|
||||
image
|
||||
```
|
||||
|
||||
If a device is not present in `max_memory`, then it will be completely ignored and will not participate in the device placement.
|
||||
|
||||
By default, Diffusers uses the maximum memory of all devices. If the models don't fit on the GPUs, they are offloaded to the CPU. If the CPU doesn't have enough memory, then you might see an error. In that case, you could defer to using [`~DiffusionPipeline.enable_sequential_cpu_offload`] and [`~DiffusionPipeline.enable_model_cpu_offload`].
|
||||
|
||||
Call [`~DiffusionPipeline.reset_device_map`] to reset the `device_map` of a pipeline. This is also necessary if you want to use methods like `to()`, [`~DiffusionPipeline.enable_sequential_cpu_offload`], and [`~DiffusionPipeline.enable_model_cpu_offload`] on a pipeline that was device-mapped.
|
||||
|
||||
```py
|
||||
pipeline.reset_device_map()
|
||||
```
|
||||
|
||||
Once a pipeline has been device-mapped, you can also access its device map via `hf_device_map`:
|
||||
|
||||
```py
|
||||
print(pipeline.hf_device_map)
|
||||
```
|
||||
|
||||
An example device map would look like so:
|
||||
|
||||
|
||||
```bash
|
||||
{'unet': 1, 'vae': 1, 'safety_checker': 0, 'text_encoder': 0}
|
||||
```
|
||||
|
||||
## PyTorch Distributed
|
||||
|
||||
PyTorch supports [`DistributedDataParallel`](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html) which enables data parallelism.
|
||||
|
||||
@@ -67,7 +67,7 @@ accelerate config default
|
||||
|
||||
Or if your environment doesn't support an interactive shell, like a notebook, you can use:
|
||||
|
||||
```bash
|
||||
```py
|
||||
from accelerate.utils import write_basic_config
|
||||
|
||||
write_basic_config()
|
||||
@@ -180,7 +180,7 @@ elif args.pretrained_model_name_or_path:
|
||||
revision=args.revision,
|
||||
use_fast=False,
|
||||
)
|
||||
|
||||
|
||||
# Load scheduler and models
|
||||
noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
|
||||
text_encoder = text_encoder_cls.from_pretrained(
|
||||
|
||||
@@ -51,7 +51,7 @@ accelerate config default
|
||||
|
||||
Or if your environment doesn't support an interactive shell, like a notebook, you can use:
|
||||
|
||||
```bash
|
||||
```py
|
||||
from accelerate.utils import write_basic_config
|
||||
|
||||
write_basic_config()
|
||||
@@ -89,7 +89,7 @@ The dataset preprocessing code and training loop are found in the [`main()`](htt
|
||||
|
||||
As with the script parameters, a walkthrough of the training script is provided in the [Text-to-image](text2image#training-script) training guide. Instead, this guide takes a look at the InstructPix2Pix relevant parts of the script.
|
||||
|
||||
The script begins by modifing the [number of input channels](https://github.com/huggingface/diffusers/blob/64603389da01082055a901f2883c4810d1144edb/examples/instruct_pix2pix/train_instruct_pix2pix.py#L445) in the first convolutional layer of the UNet to account for InstructPix2Pix's additional conditioning image:
|
||||
The script begins by modifying the [number of input channels](https://github.com/huggingface/diffusers/blob/64603389da01082055a901f2883c4810d1144edb/examples/instruct_pix2pix/train_instruct_pix2pix.py#L445) in the first convolutional layer of the UNet to account for InstructPix2Pix's additional conditioning image:
|
||||
|
||||
```py
|
||||
in_channels = 8
|
||||
|
||||
@@ -59,7 +59,7 @@ accelerate config default
|
||||
|
||||
Or if your environment doesn't support an interactive shell, like a notebook, you can use:
|
||||
|
||||
```bash
|
||||
```py
|
||||
from accelerate.utils import write_basic_config
|
||||
|
||||
write_basic_config()
|
||||
@@ -235,7 +235,7 @@ accelerate launch --mixed_precision="fp16" train_text_to_image_prior.py \
|
||||
--validation_prompts="A robot pokemon, 4k photo" \
|
||||
--report_to="wandb" \
|
||||
--push_to_hub \
|
||||
--output_dir="kandi2-prior-pokemon-model"
|
||||
--output_dir="kandi2-prior-pokemon-model"
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
@@ -259,7 +259,7 @@ accelerate launch --mixed_precision="fp16" train_text_to_image_decoder.py \
|
||||
--validation_prompts="A robot pokemon, 4k photo" \
|
||||
--report_to="wandb" \
|
||||
--push_to_hub \
|
||||
--output_dir="kandi2-decoder-pokemon-model"
|
||||
--output_dir="kandi2-decoder-pokemon-model"
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
|
||||
@@ -53,7 +53,7 @@ accelerate config default
|
||||
|
||||
Or if your environment doesn't support an interactive shell, like a notebook, you can use:
|
||||
|
||||
```bash
|
||||
```py
|
||||
from accelerate.utils import write_basic_config
|
||||
|
||||
write_basic_config()
|
||||
@@ -252,4 +252,4 @@ The SDXL training script is discussed in more detail in the [SDXL training](sdxl
|
||||
Congratulations on distilling a LCM model! To learn more about LCM, the following may be helpful:
|
||||
|
||||
- Learn how to use [LCMs for inference](../using-diffusers/lcm) for text-to-image, image-to-image, and with LoRA checkpoints.
|
||||
- Read the [SDXL in 4 steps with Latent Consistency LoRAs](https://huggingface.co/blog/lcm_lora) blog post to learn more about SDXL LCM-LoRA's for super fast inference, quality comparisons, benchmarks, and more.
|
||||
- Read the [SDXL in 4 steps with Latent Consistency LoRAs](https://huggingface.co/blog/lcm_lora) blog post to learn more about SDXL LCM-LoRA's for super fast inference, quality comparisons, benchmarks, and more.
|
||||
|
||||
@@ -59,7 +59,7 @@ accelerate config default

Or if your environment doesn't support an interactive shell, like a notebook, you can use:

```bash
```py
from accelerate.utils import write_basic_config

write_basic_config()

@@ -53,7 +53,7 @@ accelerate config default

Or if your environment doesn't support an interactive shell, like a notebook, you can use:

```bash
```py
from accelerate.utils import write_basic_config

write_basic_config()

@@ -69,7 +69,7 @@ accelerate config default

Or if your environment doesn't support an interactive shell, like a notebook, you can use:

```bash
```py
from accelerate.utils import write_basic_config

write_basic_config()

@@ -67,7 +67,7 @@ accelerate config default

Or if your environment doesn't support an interactive shell, like a notebook, you can use:

```bash
```py
from accelerate.utils import write_basic_config

write_basic_config()

@@ -51,7 +51,7 @@ accelerate config default

Or if your environment doesn't support an interactive shell like a notebook, you can use:

```bash
```py
from accelerate.utils import write_basic_config

write_basic_config()

@@ -53,7 +53,7 @@ accelerate config default

Or if your environment doesn't support an interactive shell, like a notebook, you can use:

```bash
```py
from accelerate.utils import write_basic_config

write_basic_config()
@@ -173,7 +173,7 @@ pipeline = AutoPipelineForText2Image.from_pretrained("path/to/saved/model", torc

caption = "A cute bird pokemon holding a shield"
images = pipeline(
    caption,
    width=1024,
    height=1536,
    prior_timesteps=DEFAULT_STAGE_C_TIMESTEPS,
@@ -12,75 +12,74 @@ specific language governing permissions and limitations under the License.

# AutoPipeline

🤗 Diffusers is able to complete many different tasks, and you can often reuse the same pretrained weights for multiple tasks such as text-to-image, image-to-image, and inpainting. If you're new to the library and diffusion models though, it may be difficult to know which pipeline to use for a task. For example, if you're using the [runwayml/stable-diffusion-v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5) checkpoint for text-to-image, you might not know that you could also use it for image-to-image and inpainting by loading the checkpoint with the [`StableDiffusionImg2ImgPipeline`] and [`StableDiffusionInpaintPipeline`] classes respectively.
Diffusers provides many pipelines for basic tasks like generating images, videos, audio, and inpainting. On top of these, there are specialized pipelines for adapters and features like upscaling, super-resolution, and more. Different pipeline classes can even use the same checkpoint because they share the same pretrained model! With so many different pipelines, it can be overwhelming to know which pipeline class to use.

The `AutoPipeline` class is designed to simplify the variety of pipelines in 🤗 Diffusers. It is a generic, *task-first* pipeline that lets you focus on the task. The `AutoPipeline` automatically detects the correct pipeline class to use, which makes it easier to load a checkpoint for a task without knowing the specific pipeline class name.
The [AutoPipeline](../api/pipelines/auto_pipeline) class is designed to simplify the variety of pipelines in Diffusers. It is a generic *task-first* pipeline that lets you focus on a task ([`AutoPipelineForText2Image`], [`AutoPipelineForImage2Image`], and [`AutoPipelineForInpainting`]) without needing to know the specific pipeline class. The [AutoPipeline](../api/pipelines/auto_pipeline) automatically detects the correct pipeline class to use.

<Tip>
For example, let's use the [dreamlike-art/dreamlike-photoreal-2.0](https://hf.co/dreamlike-art/dreamlike-photoreal-2.0) checkpoint.

Take a look at the [AutoPipeline](../api/pipelines/auto_pipeline) reference to see which tasks are supported. Currently, it supports text-to-image, image-to-image, and inpainting.
Under the hood, [AutoPipeline](../api/pipelines/auto_pipeline):

</Tip>
1. Detects a `"stable-diffusion"` class from the [model_index.json](https://hf.co/dreamlike-art/dreamlike-photoreal-2.0/blob/main/model_index.json) file.
2. Depending on the task you're interested in, it loads the [`StableDiffusionPipeline`], [`StableDiffusionImg2ImgPipeline`], or [`StableDiffusionInpaintPipeline`]. Any parameter (`strength`, `num_inference_steps`, etc.) you would pass to these specific pipelines can also be passed to the [AutoPipeline](../api/pipelines/auto_pipeline).

This tutorial shows you how to use an `AutoPipeline` to automatically infer the pipeline class to load for a specific task, given the pretrained weights.

## Choose an AutoPipeline for your task
Start by picking a checkpoint. For example, if you're interested in text-to-image with the [runwayml/stable-diffusion-v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5) checkpoint, use [`AutoPipelineForText2Image`]:
<hfoptions id="autopipeline">
<hfoption id="text-to-image">

```py
from diffusers import AutoPipelineForText2Image
import torch

pipeline = AutoPipelineForText2Image.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True
pipe_txt2img = AutoPipelineForText2Image.from_pretrained(
    "dreamlike-art/dreamlike-photoreal-2.0", torch_dtype=torch.float16, use_safetensors=True
).to("cuda")
prompt = "peasant and dragon combat, wood cutting style, viking era, bevel with rune"

image = pipeline(prompt, num_inference_steps=25).images[0]
prompt = "cinematic photo of Godzilla eating sushi with a cat in an izakaya, 35mm photograph, film, professional, 4k, highly detailed"
generator = torch.Generator(device="cpu").manual_seed(37)
image = pipe_txt2img(prompt, generator=generator).images[0]
image
```

<div class="flex justify-center">
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/autopipeline-text2img.png" alt="generated image of peasant fighting dragon in wood cutting style"/>
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/autopipeline-text2img.png"/>
</div>
Under the hood, [`AutoPipelineForText2Image`]:

1. automatically detects a `"stable-diffusion"` class from the [`model_index.json`](https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/model_index.json) file
2. loads the corresponding text-to-image [`StableDiffusionPipeline`] based on the `"stable-diffusion"` class name

Likewise, for image-to-image, [`AutoPipelineForImage2Image`] detects a `"stable-diffusion"` checkpoint from the `model_index.json` file and it'll load the corresponding [`StableDiffusionImg2ImgPipeline`] behind the scenes. You can also pass any additional arguments specific to the pipeline class such as `strength`, which determines the amount of noise or variation added to an input image:
</hfoption>
<hfoption id="image-to-image">

```py
from diffusers import AutoPipelineForImage2Image
from diffusers.utils import load_image
import torch
import requests
from PIL import Image
from io import BytesIO

pipeline = AutoPipelineForImage2Image.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    torch_dtype=torch.float16,
    use_safetensors=True,
pipe_img2img = AutoPipelineForImage2Image.from_pretrained(
    "dreamlike-art/dreamlike-photoreal-2.0", torch_dtype=torch.float16, use_safetensors=True
).to("cuda")
prompt = "a portrait of a dog wearing a pearl earring"

url = "https://upload.wikimedia.org/wikipedia/commons/thumb/0/0f/1665_Girl_with_a_Pearl_Earring.jpg/800px-1665_Girl_with_a_Pearl_Earring.jpg"
init_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/autopipeline-text2img.png")

response = requests.get(url)
image = Image.open(BytesIO(response.content)).convert("RGB")
image.thumbnail((768, 768))

image = pipeline(prompt, image, num_inference_steps=200, strength=0.75, guidance_scale=10.5).images[0]
prompt = "cinematic photo of Godzilla eating burgers with a cat in a fast food restaurant, 35mm photograph, film, professional, 4k, highly detailed"
generator = torch.Generator(device="cpu").manual_seed(53)
image = pipe_img2img(prompt, image=init_image, generator=generator).images[0]
image
```
Notice how the [dreamlike-art/dreamlike-photoreal-2.0](https://hf.co/dreamlike-art/dreamlike-photoreal-2.0) checkpoint is used for both text-to-image and image-to-image tasks? To save memory and avoid loading the checkpoint twice, use the [`~DiffusionPipeline.from_pipe`] method.

```py
pipe_img2img = AutoPipelineForImage2Image.from_pipe(pipe_txt2img).to("cuda")
image = pipe_img2img(prompt, image=init_image, generator=generator).images[0]
image
```

You can learn more about the [`~DiffusionPipeline.from_pipe`] method in the [Reuse a pipeline](../using-diffusers/loading#reuse-a-pipeline) guide.

<div class="flex justify-center">
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/autopipeline-img2img.png" alt="generated image of a vermeer portrait of a dog wearing a pearl earring"/>
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/autopipeline-img2img.png"/>
</div>

And if you want to do inpainting, then [`AutoPipelineForInpainting`] loads the underlying [`StableDiffusionInpaintPipeline`] class in the same way:
</hfoption>
<hfoption id="inpainting">
```py
from diffusers import AutoPipelineForInpainting
@@ -91,22 +90,27 @@ pipeline = AutoPipelineForInpainting.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16, use_safetensors=True
).to("cuda")

img_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png"
mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png"
init_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/autopipeline-img2img.png")
mask_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/autopipeline-mask.png")

init_image = load_image(img_url).convert("RGB")
mask_image = load_image(mask_url).convert("RGB")

prompt = "A majestic tiger sitting on a bench"
image = pipeline(prompt, image=init_image, mask_image=mask_image, num_inference_steps=50, strength=0.80).images[0]
prompt = "cinematic photo of an owl, 35mm photograph, film, professional, 4k, highly detailed"
generator = torch.Generator(device="cpu").manual_seed(38)
image = pipeline(prompt, image=init_image, mask_image=mask_image, generator=generator, strength=0.4).images[0]
image
```

<div class="flex justify-center">
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/autopipeline-inpaint.png" alt="generated image of a tiger sitting on a bench"/>
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/autopipeline-inpaint.png"/>
</div>

If you try to load an unsupported checkpoint, it'll throw an error:
</hfoption>
</hfoptions>
## Unsupported checkpoints

The [AutoPipeline](../api/pipelines/auto_pipeline) supports [Stable Diffusion](../api/pipelines/stable_diffusion/overview), [Stable Diffusion XL](../api/pipelines/stable_diffusion/stable_diffusion_xl), [ControlNet](../api/pipelines/controlnet), [Kandinsky 2.1](../api/pipelines/kandinsky), [Kandinsky 2.2](../api/pipelines/kandinsky_v22), and [DeepFloyd IF](../api/pipelines/deepfloyd_if) checkpoints.

If you try to load an unsupported checkpoint, you'll get an error.

```py
from diffusers import AutoPipelineForImage2Image
@@ -117,54 +121,3 @@ pipeline = AutoPipelineForImage2Image.from_pretrained(
)
"ValueError: AutoPipeline can't find a pipeline linked to ShapEImg2ImgPipeline for None"
```
## Use multiple pipelines

For some workflows or if you're loading many pipelines, it is more memory-efficient to reuse the same components from a checkpoint instead of reloading them, which would unnecessarily consume additional memory. For example, if you're using a checkpoint for text-to-image and you want to use it again for image-to-image, use the [`~AutoPipelineForImage2Image.from_pipe`] method. This method creates a new pipeline from the components of a previously loaded pipeline at no additional memory cost.

The [`~AutoPipelineForImage2Image.from_pipe`] method detects the original pipeline class and maps it to the new pipeline class corresponding to the task you want to do. For example, if you load a `"stable-diffusion"` class pipeline for text-to-image:

```py
from diffusers import AutoPipelineForText2Image, AutoPipelineForImage2Image
import torch

pipeline_text2img = AutoPipelineForText2Image.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True
)
print(type(pipeline_text2img))
"<class 'diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline'>"
```

Then [`~AutoPipelineForImage2Image.from_pipe`] maps the original `"stable-diffusion"` pipeline class to [`StableDiffusionImg2ImgPipeline`]:

```py
pipeline_img2img = AutoPipelineForImage2Image.from_pipe(pipeline_text2img)
print(type(pipeline_img2img))
"<class 'diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline'>"
```

If you passed an optional argument - like disabling the safety checker - to the original pipeline, this argument is also passed on to the new pipeline:

```py
from diffusers import AutoPipelineForText2Image, AutoPipelineForImage2Image
import torch

pipeline_text2img = AutoPipelineForText2Image.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    torch_dtype=torch.float16,
    use_safetensors=True,
    requires_safety_checker=False,
).to("cuda")

pipeline_img2img = AutoPipelineForImage2Image.from_pipe(pipeline_text2img)
print(pipeline_img2img.config.requires_safety_checker)
"False"
```

You can overwrite any of the arguments and even configuration from the original pipeline if you want to change the behavior of the new pipeline. For example, to turn the safety checker back on and add the `strength` argument:

```py
pipeline_img2img = AutoPipelineForImage2Image.from_pipe(pipeline_text2img, requires_safety_checker=True, strength=0.3)
print(pipeline_img2img.config.requires_safety_checker)
"True"
```
@@ -45,7 +45,7 @@ Make sure to include the token `toy_face` in the prompt and then you can perform
```python
prompt = "toy_face of a hacker with a hoodie"

lora_scale= 0.9
lora_scale = 0.9
image = pipe(
    prompt, num_inference_steps=30, cross_attention_kwargs={"scale": lora_scale}, generator=torch.manual_seed(0)
).images[0]

@@ -114,7 +114,7 @@ To return to only using one adapter, use the [`~diffusers.loaders.UNet2DConditio
pipe.set_adapters("toy")

prompt = "toy_face of a hacker with a hoodie"
lora_scale= 0.9
lora_scale = 0.9
image = pipe(
    prompt, num_inference_steps=30, cross_attention_kwargs={"scale": lora_scale}, generator=torch.manual_seed(0)
).images[0]
@@ -127,11 +127,68 @@ Or to disable all adapters entirely, use the [`~diffusers.loaders.UNet2DConditio
pipe.disable_lora()

prompt = "toy_face of a hacker with a hoodie"
lora_scale= 0.9
image = pipe(prompt, num_inference_steps=30, generator=torch.manual_seed(0)).images[0]
image
```


### Customize adapters strength
For even more customization, you can control how strongly the adapter affects each part of the pipeline. For this, pass a dictionary with the control strengths (called "scales") to [`~diffusers.loaders.UNet2DConditionLoadersMixin.set_adapters`].

For example, here's how you can turn on the adapter for the `down` parts, but turn it off for the `mid` and `up` parts:
```python
pipe.enable_lora()  # enable lora again, after we disabled it above
prompt = "toy_face of a hacker with a hoodie, pixel art"
adapter_weight_scales = { "unet": { "down": 1, "mid": 0, "up": 0} }
pipe.set_adapters("pixel", adapter_weight_scales)
image = pipe(prompt, num_inference_steps=30, generator=torch.manual_seed(0)).images[0]
image
```



Let's see how turning off the `down` part and turning on the `mid` and `up` part respectively changes the image.
```python
adapter_weight_scales = { "unet": { "down": 0, "mid": 1, "up": 0} }
pipe.set_adapters("pixel", adapter_weight_scales)
image = pipe(prompt, num_inference_steps=30, generator=torch.manual_seed(0)).images[0]
image
```



```python
adapter_weight_scales = { "unet": { "down": 0, "mid": 0, "up": 1} }
pipe.set_adapters("pixel", adapter_weight_scales)
image = pipe(prompt, num_inference_steps=30, generator=torch.manual_seed(0)).images[0]
image
```



Looks cool!

This is a really powerful feature. You can use it to control the adapter strengths down to the per-transformer level. And you can even use it for multiple adapters.
```python
adapter_weight_scales_toy = 0.5
adapter_weight_scales_pixel = {
    "unet": {
        "down": 0.9,  # all transformers in the down-part will use scale 0.9
        # "mid"       # because "mid" is not given in this example, all transformers in the mid part will use the default scale 1.0
        "up": {
            "block_0": 0.6,  # all 3 transformers in the 0th block in the up-part will use scale 0.6
            "block_1": [0.4, 0.8, 1.0],  # the 3 transformers in the 1st block in the up-part will use scales 0.4, 0.8 and 1.0 respectively
        }
    }
}
pipe.set_adapters(["toy", "pixel"], [adapter_weight_scales_toy, adapter_weight_scales_pixel])
image = pipe(prompt, num_inference_steps=30, generator=torch.manual_seed(0)).images[0]
image
```


## Manage active adapters

You have attached multiple adapters in this tutorial, and if you're feeling a bit lost on what adapters have been attached to the pipeline's components, use the [`~diffusers.loaders.LoraLoaderMixin.get_active_adapters`] method to check the list of active adapters:
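The hunk ends here, but calling the method is a one-liner. A minimal sketch follows; the adapter names match the `"toy"` and `"pixel"` adapters used earlier, and the printed list is only indicative:

```python
active_adapters = pipe.get_active_adapters()
print(active_adapters)
# e.g. ["toy", "pixel"]
```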
@@ -148,9 +148,9 @@ pipeline = AutoPipelineForText2Image.from_pretrained(
    use_safetensors=True
).to("cuda")

image = pipe(
prompt = "A croissant shaped like a cute bear."
negative_prompt = "Deformed, ugly, bad anatomy"
image = pipeline(
    prompt="A croissant shaped like a cute bear.",
    negative_prompt="Deformed, ugly, bad anatomy",
    callback_on_step_end=decode_tensors,
    callback_on_step_end_tensor_inputs=["latents"],
).images[0]
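The `decode_tensors` callback itself is not shown in this hunk. As an assumption about its shape (only the name comes from the snippet above), a `callback_on_step_end` function receives the pipeline, the step index, the timestep, and a `callback_kwargs` dict with the requested tensors, and must return that dict:

```py
def decode_tensors(pipe, step, timestep, callback_kwargs):
    # "latents" is available because it was requested via callback_on_step_end_tensor_inputs
    latents = callback_kwargs["latents"]
    print(f"step {step}: latents shape {tuple(latents.shape)}")
    return callback_kwargs
```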
@@ -1,184 +0,0 @@
<!--Copyright 2024 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->
# Contribute a community pipeline

<Tip>

💡 Take a look at GitHub Issue [#841](https://github.com/huggingface/diffusers/issues/841) for more context about why we're adding community pipelines to help everyone easily share their work without being slowed down.

</Tip>

Community pipelines allow you to add any additional features you'd like on top of the [`DiffusionPipeline`]. The main benefit of building on top of the `DiffusionPipeline` is anyone can load and use your pipeline by only adding one more argument, making it super easy for the community to access.

This guide will show you how to create a community pipeline and explain how they work. To keep things simple, you'll create a "one-step" pipeline where the `UNet` does a single forward pass and calls the scheduler once.

## Initialize the pipeline

You should start by creating a `one_step_unet.py` file for your community pipeline. In this file, create a pipeline class that inherits from the [`DiffusionPipeline`] to be able to load model weights and the scheduler configuration from the Hub. The one-step pipeline needs a `UNet` and a scheduler, so you'll need to add these as arguments to the `__init__` function:

```python
from diffusers import DiffusionPipeline
import torch


class UnetSchedulerOneForwardPipeline(DiffusionPipeline):
    def __init__(self, unet, scheduler):
        super().__init__()
```
To ensure your pipeline and its components (`unet` and `scheduler`) can be saved with [`~DiffusionPipeline.save_pretrained`], add them to the `register_modules` function:

```diff
  from diffusers import DiffusionPipeline
  import torch

  class UnetSchedulerOneForwardPipeline(DiffusionPipeline):
      def __init__(self, unet, scheduler):
          super().__init__()

+         self.register_modules(unet=unet, scheduler=scheduler)
```

Cool, the `__init__` step is done and you can move to the forward pass now! 🔥

## Define the forward pass

In the forward pass, which we recommend defining as `__call__`, you have complete creative freedom to add whatever feature you'd like. For our amazing one-step pipeline, create a random image and only call the `unet` and `scheduler` once by setting `timestep=1`:

```diff
  from diffusers import DiffusionPipeline
  import torch

  class UnetSchedulerOneForwardPipeline(DiffusionPipeline):
      def __init__(self, unet, scheduler):
          super().__init__()

          self.register_modules(unet=unet, scheduler=scheduler)

+     def __call__(self):
+         image = torch.randn(
+             (1, self.unet.config.in_channels, self.unet.config.sample_size, self.unet.config.sample_size),
+         )
+         timestep = 1

+         model_output = self.unet(image, timestep).sample
+         scheduler_output = self.scheduler.step(model_output, timestep, image).prev_sample

+         return scheduler_output
```
That's it! 🚀 You can now run this pipeline by passing a `unet` and `scheduler` to it:

```python
from diffusers import DDPMScheduler, UNet2DModel

scheduler = DDPMScheduler()
unet = UNet2DModel()

pipeline = UnetSchedulerOneForwardPipeline(unet=unet, scheduler=scheduler)

output = pipeline()
```

But what's even better is you can load pre-existing weights into the pipeline if the pipeline structure is identical. For example, you can load the [`google/ddpm-cifar10-32`](https://huggingface.co/google/ddpm-cifar10-32) weights into the one-step pipeline:

```python
pipeline = UnetSchedulerOneForwardPipeline.from_pretrained("google/ddpm-cifar10-32", use_safetensors=True)

output = pipeline()
```

## Share your pipeline

Open a Pull Request on the 🧨 Diffusers [repository](https://github.com/huggingface/diffusers) to add your awesome pipeline in `one_step_unet.py` to the [examples/community](https://github.com/huggingface/diffusers/tree/main/examples/community) subfolder.

Once it is merged, anyone with `diffusers >= 0.4.0` installed can use this pipeline magically 🪄 by specifying it in the `custom_pipeline` argument:

```python
from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained(
    "google/ddpm-cifar10-32", custom_pipeline="one_step_unet", use_safetensors=True
)
pipe()
```

Another way to share your community pipeline is to upload the `one_step_unet.py` file directly to your preferred [model repository](https://huggingface.co/docs/hub/models-uploading) on the Hub. Instead of specifying the `one_step_unet.py` file, pass the model repository id to the `custom_pipeline` argument:

```python
from diffusers import DiffusionPipeline

pipeline = DiffusionPipeline.from_pretrained(
    "google/ddpm-cifar10-32", custom_pipeline="stevhliu/one_step_unet", use_safetensors=True
)
```

Take a look at the following table to compare the two sharing workflows to help you decide the best option for you:

| | GitHub community pipeline | HF Hub community pipeline |
|----------------|------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------|
| usage | same | same |
| review process | open a Pull Request on GitHub and undergo a review process from the Diffusers team before merging; may be slower | upload directly to a Hub repository without any review; this is the fastest workflow |
| visibility | included in the official Diffusers repository and documentation | included on your HF Hub profile and relies on your own usage/promotion to gain visibility |

<Tip>

💡 You can use whatever package you want in your community pipeline file - as long as the user has it installed, everything will work fine. Make sure you have one and only one pipeline class that inherits from `DiffusionPipeline` because this is automatically detected.

</Tip>
## How do community pipelines work?

A community pipeline is a class that inherits from [`DiffusionPipeline`] which means:

- It can be loaded with the [`custom_pipeline`] argument.
- The model weights and scheduler configuration are loaded from [`pretrained_model_name_or_path`].
- The code that implements a feature in the community pipeline is defined in a `pipeline.py` file.

Sometimes you can't load all the pipeline components' weights from an official repository. In this case, the other components should be passed directly to the pipeline:

```python
import torch
from diffusers import DDIMScheduler, DiffusionPipeline
from transformers import CLIPImageProcessor, CLIPModel

model_id = "CompVis/stable-diffusion-v1-4"
clip_model_id = "laion/CLIP-ViT-B-32-laion2B-s34B-b79K"

feature_extractor = CLIPImageProcessor.from_pretrained(clip_model_id)
clip_model = CLIPModel.from_pretrained(clip_model_id, torch_dtype=torch.float16)
# load a scheduler to pass to the pipeline (DDIM is used here as an example)
scheduler = DDIMScheduler.from_pretrained(model_id, subfolder="scheduler")

pipeline = DiffusionPipeline.from_pretrained(
    model_id,
    custom_pipeline="clip_guided_stable_diffusion",
    clip_model=clip_model,
    feature_extractor=feature_extractor,
    scheduler=scheduler,
    torch_dtype=torch.float16,
    use_safetensors=True,
)
```

The magic behind community pipelines is contained in the following code. It allows the community pipeline to be loaded from GitHub or the Hub, and it'll be available to all 🧨 Diffusers packages.

```python
# 2. Load the pipeline class, if using custom module then load it from the Hub
# if we load from explicit class, let's use it
if custom_pipeline is not None:
    pipeline_class = get_class_from_dynamic_module(
        custom_pipeline, module_file=CUSTOM_PIPELINE_FILE_NAME, cache_dir=custom_pipeline
    )
elif cls != DiffusionPipeline:
    pipeline_class = cls
else:
    diffusers_module = importlib.import_module(cls.__module__.split(".")[0])
    pipeline_class = getattr(diffusers_module, config_dict["_class_name"])
```
@@ -1,58 +0,0 @@
<!--Copyright 2024 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->
# Control image brightness

The Stable Diffusion pipeline is mediocre at generating images that are either very bright or dark as explained in the [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) paper. The solutions proposed in the paper are currently implemented in the [`DDIMScheduler`] which you can use to improve the lighting in your images.

<Tip>

💡 Take a look at the paper linked above for more details about the proposed solutions!

</Tip>

One of the solutions is to train a model with *v prediction* and *v loss*. Add the following flag to the [`train_text_to_image.py`](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image.py) or [`train_text_to_image_lora.py`](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image_lora.py) scripts to enable `v_prediction`:

```bash
--prediction_type="v_prediction"
```
For example, let's use the [`ptx0/pseudo-journey-v2`](https://huggingface.co/ptx0/pseudo-journey-v2) checkpoint which has been finetuned with `v_prediction`.

Next, configure the following parameters in the [`DDIMScheduler`]:

1. `rescale_betas_zero_snr=True`, rescales the noise schedule to zero terminal signal-to-noise ratio (SNR)
2. `timestep_spacing="trailing"`, starts sampling from the last timestep

```py
from diffusers import DiffusionPipeline, DDIMScheduler

pipeline = DiffusionPipeline.from_pretrained("ptx0/pseudo-journey-v2", use_safetensors=True)

# switch the scheduler in the pipeline to use the DDIMScheduler
pipeline.scheduler = DDIMScheduler.from_config(
    pipeline.scheduler.config, rescale_betas_zero_snr=True, timestep_spacing="trailing"
)
pipeline.to("cuda")
```

Finally, in your call to the pipeline, set `guidance_rescale` to prevent overexposure:

```py
prompt = "A lion in galaxies, spirals, nebulae, stars, smoke, iridescent, intricate detail, octane render, 8k"
image = pipeline(prompt, guidance_rescale=0.7).images[0]
image
```

<div class="flex justify-center">
  <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/zero_snr.png"/>
</div>
@@ -1,119 +0,0 @@
<!--Copyright 2024 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->
# Community pipelines

[[open-in-colab]]

<Tip>

For more context about the design choices behind community pipelines, please have a look at [this issue](https://github.com/huggingface/diffusers/issues/841).

</Tip>

Community pipelines allow you to get creative and build your own unique pipelines to share with the community. You can find all community pipelines in the [diffusers/examples/community](https://github.com/huggingface/diffusers/tree/main/examples/community) folder along with inference and training examples for how to use them. This guide showcases some of the community pipelines and hopefully it'll inspire you to create your own (feel free to open a PR with your own pipeline and we will merge it!).

To load a community pipeline, use the `custom_pipeline` argument in [`DiffusionPipeline`] to specify one of the files in [diffusers/examples/community](https://github.com/huggingface/diffusers/tree/main/examples/community):

```py
from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained(
    "CompVis/stable-diffusion-v1-4", custom_pipeline="filename_in_the_community_folder", use_safetensors=True
)
```

If a community pipeline doesn't work as expected, please open a GitHub issue and mention the author.

You can learn more about community pipelines in the [load community pipelines](custom_pipeline_overview) and [contribute a community pipeline](contribute_pipeline) guides.
## Multilingual Stable Diffusion

The multilingual Stable Diffusion pipeline uses a pretrained [XLM-RoBERTa](https://huggingface.co/papluca/xlm-roberta-base-language-detection) to identify a language and the [mBART-large-50](https://huggingface.co/facebook/mbart-large-50-many-to-one-mmt) model to handle the translation. This allows you to generate images from text in 20 languages.

```py
import torch
from diffusers import DiffusionPipeline
from diffusers.utils import make_image_grid
from transformers import (
    pipeline,
    MBart50TokenizerFast,
    MBartForConditionalGeneration,
)

device = "cuda" if torch.cuda.is_available() else "cpu"
device_dict = {"cuda": 0, "cpu": -1}

# add language detection pipeline
language_detection_model_ckpt = "papluca/xlm-roberta-base-language-detection"
language_detection_pipeline = pipeline("text-classification",
    model=language_detection_model_ckpt,
    device=device_dict[device])

# add model for language translation
translation_tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-one-mmt")
translation_model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-one-mmt").to(device)

diffuser_pipeline = DiffusionPipeline.from_pretrained(
    "CompVis/stable-diffusion-v1-4",
    custom_pipeline="multilingual_stable_diffusion",
    detection_pipeline=language_detection_pipeline,
    translation_model=translation_model,
    translation_tokenizer=translation_tokenizer,
    torch_dtype=torch.float16,
)

diffuser_pipeline.enable_attention_slicing()
diffuser_pipeline = diffuser_pipeline.to(device)

prompt = ["a photograph of an astronaut riding a horse",
          "Una casa en la playa",  # Spanish: "A house on the beach"
          "Ein Hund, der Orange isst",  # German: "A dog eating an orange"
          "Un restaurant parisien"]  # French: "A Parisian restaurant"

images = diffuser_pipeline(prompt).images
make_image_grid(images, rows=2, cols=2)
```

<div class="flex justify-center">
  <img src="https://user-images.githubusercontent.com/4313860/198328706-295824a4-9856-4ce5-8e66-278ceb42fd29.png"/>
</div>
## MagicMix

[MagicMix](https://huggingface.co/papers/2210.16056) is a pipeline that can mix an image and text prompt to generate a new image that preserves the image structure. The `mix_factor` determines how much influence the prompt has on the layout generation, `kmin` controls the number of steps during the content generation process, and `kmax` determines how much information is kept in the layout of the original image.

```py
from diffusers import DiffusionPipeline, DDIMScheduler
from diffusers.utils import load_image, make_image_grid

pipeline = DiffusionPipeline.from_pretrained(
    "CompVis/stable-diffusion-v1-4",
    custom_pipeline="magic_mix",
    scheduler=DDIMScheduler.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="scheduler"),
).to('cuda')

img = load_image("https://user-images.githubusercontent.com/59410571/209578593-141467c7-d831-4792-8b9a-b17dc5e47816.jpg")
mix_img = pipeline(img, prompt="bed", kmin=0.3, kmax=0.5, mix_factor=0.5)
make_image_grid([img, mix_img], rows=1, cols=2)
```

<div class="flex gap-4">
  <div>
    <img class="rounded-xl" src="https://user-images.githubusercontent.com/59410571/209578593-141467c7-d831-4792-8b9a-b17dc5e47816.jpg" />
    <figcaption class="mt-2 text-center text-sm text-gray-500">original image</figcaption>
  </div>
  <div>
    <img class="rounded-xl" src="https://user-images.githubusercontent.com/59410571/209578602-70f323fa-05b7-4dd6-b055-e40683e37914.jpg" />
    <figcaption class="mt-2 text-center text-sm text-gray-500">image and text prompt mix</figcaption>
  </div>
</div>
@@ -16,17 +16,27 @@ specific language governing permissions and limitations under the License.

## Community pipelines

Community pipelines are any [`DiffusionPipeline`] class that are different from the original implementation as specified in their paper (for example, the [`StableDiffusionControlNetPipeline`] corresponds to the [Text-to-Image Generation with ControlNet Conditioning](https://arxiv.org/abs/2302.05543) paper). They provide additional functionality or extend the original implementation of a pipeline.
> [!TIP] Take a look at GitHub Issue [#841](https://github.com/huggingface/diffusers/issues/841) for more context about why we're adding community pipelines to help everyone easily share their work without being slowed down.

There are many cool community pipelines like [Speech to Image](https://github.com/huggingface/diffusers/tree/main/examples/community#speech-to-image) or [Composable Stable Diffusion](https://github.com/huggingface/diffusers/tree/main/examples/community#composable-stable-diffusion), and you can find all the official community pipelines [here](https://github.com/huggingface/diffusers/tree/main/examples/community).
Community pipelines are any [`DiffusionPipeline`] class that are different from the original paper implementation (for example, the [`StableDiffusionControlNetPipeline`] corresponds to the [Text-to-Image Generation with ControlNet Conditioning](https://arxiv.org/abs/2302.05543) paper). They provide additional functionality or extend the original implementation of a pipeline.

To load any community pipeline on the Hub, pass the repository id of the community pipeline to the `custom_pipeline` argument and the model repository where you'd like to load the pipeline weights and components from. For example, the example below loads a dummy pipeline from [`hf-internal-testing/diffusers-dummy-pipeline`](https://huggingface.co/hf-internal-testing/diffusers-dummy-pipeline/blob/main/pipeline.py) and the pipeline weights and components from [`google/ddpm-cifar10-32`](https://huggingface.co/google/ddpm-cifar10-32):
There are many cool community pipelines like [Marigold Depth Estimation](https://github.com/huggingface/diffusers/tree/main/examples/community#marigold-depth-estimation) or [InstantID](https://github.com/huggingface/diffusers/tree/main/examples/community#instantid-pipeline), and you can find all the official community pipelines [here](https://github.com/huggingface/diffusers/tree/main/examples/community).

<Tip warning={true}>
There are two types of community pipelines, those stored on the Hugging Face Hub and those stored on the Diffusers GitHub repository. Hub pipelines are completely customizable (scheduler, models, pipeline code, etc.) while Diffusers GitHub pipelines are only limited to custom pipeline code.

🔒 By loading a community pipeline from the Hugging Face Hub, you are trusting that the code you are loading is safe. Make sure to inspect the code online before loading and running it automatically!
| | GitHub community pipeline | HF Hub community pipeline |
|----------------|------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------|
| usage | same | same |
| review process | open a Pull Request on GitHub and undergo a review process from the Diffusers team before merging; may be slower | upload directly to a Hub repository without any review; this is the fastest workflow |
| visibility | included in the official Diffusers repository and documentation | included on your HF Hub profile and relies on your own usage/promotion to gain visibility |

</Tip>
<hfoptions id="community">
<hfoption id="Hub pipelines">

To load a Hugging Face Hub community pipeline, pass the repository id of the community pipeline to the `custom_pipeline` argument and the model repository where you'd like to load the pipeline weights and components from. For example, the example below loads a dummy pipeline from [hf-internal-testing/diffusers-dummy-pipeline](https://huggingface.co/hf-internal-testing/diffusers-dummy-pipeline/blob/main/pipeline.py) and the pipeline weights and components from [google/ddpm-cifar10-32](https://huggingface.co/google/ddpm-cifar10-32):

> [!WARNING]
> By loading a community pipeline from the Hugging Face Hub, you are trusting that the code you are loading is safe. Make sure to inspect the code online before loading and running it automatically!

```py
from diffusers import DiffusionPipeline
@@ -36,7 +46,10 @@ pipeline = DiffusionPipeline.from_pretrained(
)
```
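For reference, a complete version of the call in this hunk would look roughly like the sketch below; the two repository ids come from the paragraph above, while the exact argument layout is an assumption:

```py
from diffusers import DiffusionPipeline

pipeline = DiffusionPipeline.from_pretrained(
    "google/ddpm-cifar10-32", custom_pipeline="hf-internal-testing/diffusers-dummy-pipeline", use_safetensors=True
)
```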
Loading an official community pipeline is similar, but you can mix loading weights from an official repository id and pass pipeline components directly. The example below loads the community [CLIP Guided Stable Diffusion](https://github.com/huggingface/diffusers/tree/main/examples/community#clip-guided-stable-diffusion) pipeline, and you can pass the CLIP model components directly to it:
</hfoption>
<hfoption id="GitHub pipelines">

To load a GitHub community pipeline, pass the repository id of the community pipeline to the `custom_pipeline` argument and the model repository where you'd like to load the pipeline weights and components from. You can also load model components directly. The example below loads the community [CLIP Guided Stable Diffusion](https://github.com/huggingface/diffusers/tree/main/examples/community#clip-guided-stable-diffusion) pipeline and the CLIP model components.

```py
from diffusers import DiffusionPipeline
@@ -56,9 +69,12 @@ pipeline = DiffusionPipeline.from_pretrained(
)
```

</hfoption>
</hfoptions>
### Load from a local file

Community pipelines can also be loaded from a local file if you pass a file path instead. The path to the passed directory must contain a `pipeline.py` file that contains the pipeline class in order to successfully load it.
Community pipelines can also be loaded from a local file if you pass a file path instead. The path to the passed directory must contain a pipeline.py file that contains the pipeline class.

```py
pipeline = DiffusionPipeline.from_pretrained(
@@ -77,7 +93,7 @@ By default, community pipelines are loaded from the latest stable version of Dif
<hfoptions id="version">
<hfoption id="main">

For example, to load from the `main` branch:
For example, to load from the main branch:

```py
pipeline = DiffusionPipeline.from_pretrained(
@@ -93,7 +109,7 @@ pipeline = DiffusionPipeline.from_pretrained(
</hfoption>
<hfoption id="older version">

For example, to load from a previous version of Diffusers like `v0.25.0`:
For example, to load from a previous version of Diffusers like v0.25.0:

```py
pipeline = DiffusionPipeline.from_pretrained(
@@ -109,8 +125,140 @@ pipeline = DiffusionPipeline.from_pretrained(
</hfoption>
</hfoptions>
### Load with from_pipe

For more information about community pipelines, take a look at the [Community pipelines](custom_pipeline_examples) guide for how to use them and if you're interested in adding a community pipeline check out the [How to contribute a community pipeline](contribute_pipeline) guide!
Community pipelines can also be loaded with the [`~DiffusionPipeline.from_pipe`] method which allows you to load and reuse multiple pipelines without any additional memory overhead (learn more in the [Reuse a pipeline](./loading#reuse-a-pipeline) guide). The memory requirement is determined by the largest single pipeline loaded.

For example, let's load a community pipeline that supports [long prompts with weighting](https://github.com/huggingface/diffusers/tree/main/examples/community#long-prompt-weighting-stable-diffusion) from a Stable Diffusion pipeline.

```py
import torch
from diffusers import DiffusionPipeline

pipe_sd = DiffusionPipeline.from_pretrained("emilianJR/CyberRealistic_V3", torch_dtype=torch.float16)
pipe_sd.to("cuda")
# load long prompt weighting pipeline
pipe_lpw = DiffusionPipeline.from_pipe(
    pipe_sd,
    custom_pipeline="lpw_stable_diffusion",
).to("cuda")

prompt = "cat, hiding in the leaves, ((rain)), zazie rainyday, beautiful eyes, macro shot, colorful details, natural lighting, amazing composition, subsurface scattering, amazing textures, filmic, soft light, ultra-detailed eyes, intricate details, detailed texture, light source contrast, dramatic shadows, cinematic light, depth of field, film grain, noise, dark background, hyperrealistic dslr film still, dim volumetric cinematic lighting"
neg_prompt = "(deformed iris, deformed pupils, semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime, mutated hands and fingers:1.4), (deformed, distorted, disfigured:1.3), poorly drawn, bad anatomy, wrong anatomy, extra limb, missing limb, floating limbs, disconnected limbs, mutation, mutated, ugly, disgusting, amputation"
generator = torch.Generator(device="cpu").manual_seed(20)
out_lpw = pipe_lpw(
    prompt,
    negative_prompt=neg_prompt,
    width=512,
    height=512,
    max_embeddings_multiples=3,
    num_inference_steps=50,
    generator=generator,
).images[0]
out_lpw
```

<div class="flex gap-4">
  <div>
    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/from_pipe_lpw.png" />
    <figcaption class="mt-2 text-center text-sm text-gray-500">Stable Diffusion with long prompt weighting</figcaption>
  </div>
  <div>
    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/from_pipe_non_lpw.png" />
    <figcaption class="mt-2 text-center text-sm text-gray-500">Stable Diffusion</figcaption>
  </div>
</div>
## Example community pipelines

Community pipelines are a really fun and creative way to extend the capabilities of the original pipeline with new and unique features. You can find all community pipelines in the [diffusers/examples/community](https://github.com/huggingface/diffusers/tree/main/examples/community) folder with inference and training examples for how to use them.

This section showcases a couple of the community pipelines and hopefully it'll inspire you to create your own (feel free to open a PR for your community pipeline and ping us for a review)!

> [!TIP]
> The [`~DiffusionPipeline.from_pipe`] method is particularly useful for loading community pipelines because many of them don't have pretrained weights and add a feature on top of an existing pipeline like Stable Diffusion or Stable Diffusion XL. You can learn more about the [`~DiffusionPipeline.from_pipe`] method in the [Load with from_pipe](custom_pipeline_overview#load-with-from_pipe) section.

<hfoptions id="community">
<hfoption id="Marigold">

[Marigold](https://marigoldmonodepth.github.io/) is a depth estimation diffusion pipeline that uses the rich existing and inherent visual knowledge in diffusion models. It takes an input image and denoises and decodes it into a depth map. Marigold performs well even on images it hasn't seen before.

```py
import torch
from PIL import Image
from diffusers import DiffusionPipeline
from diffusers.utils import load_image

pipeline = DiffusionPipeline.from_pretrained(
    "prs-eth/marigold-lcm-v1-0",
    custom_pipeline="marigold_depth_estimation",
    torch_dtype=torch.float16,
    variant="fp16",
)

pipeline.to("cuda")
image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/community-marigold.png")
output = pipeline(
    image,
    denoising_steps=4,
    ensemble_size=5,
    processing_res=768,
    match_input_res=True,
    batch_size=0,
    seed=33,
    color_map="Spectral",
    show_progress_bar=True,
)
depth_colored: Image.Image = output.depth_colored
depth_colored.save("./depth_colored.png")
```

<div class="flex flex-row gap-4">
  <div class="flex-1">
    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/community-marigold.png"/>
    <figcaption class="mt-2 text-center text-sm text-gray-500">original image</figcaption>
  </div>
  <div class="flex-1">
    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/marigold-depth.png"/>
    <figcaption class="mt-2 text-center text-sm text-gray-500">colorized depth image</figcaption>
  </div>
</div>

</hfoption>
<hfoption id="HD-Painter">

[HD-Painter](https://hf.co/papers/2312.14091) is a high-resolution inpainting pipeline. It introduces a *Prompt-Aware Introverted Attention (PAIntA)* layer to better align a prompt with the area to be inpainted, and *Reweighting Attention Score Guidance (RASG)* to keep the latents more prompt-aligned and within their trained domain to generate realistic images.

```py
import torch
from diffusers import DiffusionPipeline, DDIMScheduler
from diffusers.utils import load_image

pipeline = DiffusionPipeline.from_pretrained(
    "Lykon/dreamshaper-8-inpainting",
    custom_pipeline="hd_painter"
)
pipeline.scheduler = DDIMScheduler.from_config(pipeline.scheduler.config)
init_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/hd-painter.jpg")
mask_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/hd-painter-mask.png")
prompt = "football"
image = pipeline(prompt, init_image, mask_image, use_rasg=True, use_painta=True, generator=torch.manual_seed(0)).images[0]
image
```

<div class="flex flex-row gap-4">
  <div class="flex-1">
    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/hd-painter.jpg"/>
    <figcaption class="mt-2 text-center text-sm text-gray-500">original image</figcaption>
  </div>
  <div class="flex-1">
    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/hd-painter-output.png"/>
    <figcaption class="mt-2 text-center text-sm text-gray-500">generated image</figcaption>
  </div>
</div>

</hfoption>
</hfoptions>
## Community components

@@ -118,7 +266,7 @@ Community components allow users to build pipelines that may have customized com

This section shows how users should use community components to build a community pipeline.

You'll use the [showlab/show-1-base](https://huggingface.co/showlab/show-1-base) pipeline checkpoint as an example. So, let's start loading the components:
You'll use the [showlab/show-1-base](https://huggingface.co/showlab/show-1-base) pipeline checkpoint as an example.

1. Import and load the text encoder from Transformers:
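The code for this step falls outside the hunk shown here. As a sketch of what it typically looks like, assuming the checkpoint stores a T5 tokenizer and text encoder in `tokenizer/` and `text_encoder/` subfolders (as DeepFloyd IF-style pixel models do):

```python
from transformers import T5Tokenizer, T5EncoderModel

pipe_id = "showlab/show-1-base"
tokenizer = T5Tokenizer.from_pretrained(pipe_id, subfolder="tokenizer")
text_encoder = T5EncoderModel.from_pretrained(pipe_id, subfolder="text_encoder")
```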
@@ -152,17 +300,17 @@ In steps 4 and 5, the custom [UNet](https://github.com/showlab/Show-1/blob/main/
|
||||
|
||||
</Tip>
|
||||
|
||||
4. Now you'll load a [custom UNet](https://github.com/showlab/Show-1/blob/main/showone/models/unet_3d_condition.py), which in this example, has already been implemented in the `showone_unet_3d_condition.py` [script](https://huggingface.co/sayakpaul/show-1-base-with-code/blob/main/unet/showone_unet_3d_condition.py) for your convenience. You'll notice the `UNet3DConditionModel` class name is changed to `ShowOneUNet3DConditionModel` because [`UNet3DConditionModel`] already exists in Diffusers. Any components needed for the `ShowOneUNet3DConditionModel` class should be placed in the `showone_unet_3d_condition.py` script.
|
||||
4. Now you'll load a [custom UNet](https://github.com/showlab/Show-1/blob/main/showone/models/unet_3d_condition.py), which in this example, has already been implemented in [showone_unet_3d_condition.py](https://huggingface.co/sayakpaul/show-1-base-with-code/blob/main/unet/showone_unet_3d_condition.py) for your convenience. You'll notice the [`UNet3DConditionModel`] class name is changed to `ShowOneUNet3DConditionModel` because [`UNet3DConditionModel`] already exists in Diffusers. Any components needed for the `ShowOneUNet3DConditionModel` class should be placed in showone_unet_3d_condition.py.
|
||||
|
||||
Once this is done, you can initialize the UNet:
|
||||
Once this is done, you can initialize the UNet:
|
||||
|
||||
```python
|
||||
from showone_unet_3d_condition import ShowOneUNet3DConditionModel
|
||||
```python
|
||||
from showone_unet_3d_condition import ShowOneUNet3DConditionModel
|
||||
|
||||
unet = ShowOneUNet3DConditionModel.from_pretrained(pipe_id, subfolder="unet")
|
||||
```
|
||||
unet = ShowOneUNet3DConditionModel.from_pretrained(pipe_id, subfolder="unet")
|
||||
```
|
||||
|
||||
5. Finally, you'll load the custom pipeline code. For this example, it has already been created for you in the `pipeline_t2v_base_pixel.py` [script](https://huggingface.co/sayakpaul/show-1-base-with-code/blob/main/pipeline_t2v_base_pixel.py). This script contains a custom `TextToVideoIFPipeline` class for generating videos from text. Just like the custom UNet, any code needed for the custom pipeline to work should go in the `pipeline_t2v_base_pixel.py` script.
|
||||
5. Finally, you'll load the custom pipeline code. For this example, it has already been created for you in [pipeline_t2v_base_pixel.py](https://huggingface.co/sayakpaul/show-1-base-with-code/blob/main/pipeline_t2v_base_pixel.py). This script contains a custom `TextToVideoIFPipeline` class for generating videos from text. Just like the custom UNet, any code needed for the custom pipeline to work should go in pipeline_t2v_base_pixel.py.
|
||||
|
||||
Once everything is in place, you can initialize the `TextToVideoIFPipeline` with the `ShowOneUNet3DConditionModel`:
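The initialization code itself is elided in this diff excerpt. A minimal sketch of what it could look like follows; the exact constructor arguments are defined by the custom `TextToVideoIFPipeline` class, so treat the argument names, and the `scheduler` and `feature_extractor` objects loaded in the steps not shown here, as assumptions.

```python
from pipeline_t2v_base_pixel import TextToVideoIFPipeline

# assumption: the custom pipeline takes the standard IF-style components
pipeline = TextToVideoIFPipeline(
    unet=unet,
    text_encoder=text_encoder,
    tokenizer=tokenizer,
    scheduler=scheduler,
    feature_extractor=feature_extractor,
).to("cuda")
```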

Push the pipeline to the Hub to share with the community!

```python
pipeline.push_to_hub("custom-t2v-pipeline")
```

After the pipeline is successfully pushed, you need to make a few changes:

1. Change the `_class_name` attribute in [model_index.json](https://huggingface.co/sayakpaul/show-1-base-with-code/blob/main/model_index.json#L2) to `"pipeline_t2v_base_pixel"` and `"TextToVideoIFPipeline"`.
2. Upload `showone_unet_3d_condition.py` to the [unet](https://huggingface.co/sayakpaul/show-1-base-with-code/blob/main/unet/showone_unet_3d_condition.py) subfolder.
3. Upload `pipeline_t2v_base_pixel.py` to the pipeline [repository](https://huggingface.co/sayakpaul/show-1-base-with-code/tree/main).

To run inference, add the `trust_remote_code` argument while initializing the pipeline to handle all the "magic" behind the scenes.

> [!WARNING]
> As an additional precaution with `trust_remote_code=True`, we strongly encourage you to pass a commit hash to the `revision` parameter in [`~DiffusionPipeline.from_pretrained`] to make sure the code hasn't been updated with some malicious new lines of code (unless you fully trust the model owners).

```python
from diffusers import DiffusionPipeline

# ... (the `from_pretrained` call and generation arguments are elided in this diff excerpt)
video_frames = pipeline(
    ...
).frames
```
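For illustration, loading your own pushed pipeline with a pinned revision could look like the sketch below. The repository name and commit hash are placeholders, not values from this document; `trust_remote_code`, `revision`, and `torch_dtype` are standard [`~DiffusionPipeline.from_pretrained`] arguments.

```python
from diffusers import DiffusionPipeline
import torch

pipeline = DiffusionPipeline.from_pretrained(
    "<your-username>/custom-t2v-pipeline",  # placeholder repository id
    trust_remote_code=True,
    revision="<commit-hash>",  # pin the exact revision you reviewed
    torch_dtype=torch.float16,
).to("cuda")
```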

As an additional reference, take a look at the repository structure of [stabilityai/japanese-stable-diffusion-xl](https://huggingface.co/stabilityai/japanese-stable-diffusion-xl/), which also uses the `trust_remote_code` feature.

```python
from diffusers import DiffusionPipeline
import torch

pipeline = DiffusionPipeline.from_pretrained(
    "stabilityai/japanese-stable-diffusion-xl", trust_remote_code=True
)
pipeline.to("cuda")

# if using torch < 2.0
# pipeline.enable_xformers_memory_efficient_attention()

prompt = "柴犬、カラフルアート"  # "Shiba Inu, colorful art"

image = pipeline(prompt=prompt).images[0]
```

<!--Copyright 2024 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->

# Improve generation quality with FreeU

[[open-in-colab]]

The UNet is responsible for denoising during the reverse diffusion process, and there are two distinct features in its architecture:

1. Backbone features primarily contribute to the denoising process
2. Skip features mainly introduce high-frequency features into the decoder module and can make the network overlook the semantics in the backbone features

However, the skip connection can sometimes introduce unnatural image details. [FreeU](https://hf.co/papers/2309.11497) is a technique for improving image quality by rebalancing the contributions from the UNet’s skip connections and backbone feature maps.

FreeU is applied during inference and it does not require any additional training. The technique works for different tasks such as text-to-image, image-to-image, and text-to-video.

In this guide, you will apply FreeU to the [`StableDiffusionPipeline`], [`StableDiffusionXLPipeline`], and [`TextToVideoSDPipeline`]. You need to install Diffusers from source to run the examples below.

## StableDiffusionPipeline

Load the pipeline:

```py
from diffusers import DiffusionPipeline
import torch

pipeline = DiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, safety_checker=None
).to("cuda")
```

Then enable the FreeU mechanism with the FreeU-specific hyperparameters. These values are scaling factors for the backbone and skip features.

```py
pipeline.enable_freeu(s1=0.9, s2=0.2, b1=1.2, b2=1.4)
```

The values above are from the official FreeU [code repository](https://github.com/ChenyangSi/FreeU) where you can also find [reference hyperparameters](https://github.com/ChenyangSi/FreeU#range-for-more-parameters) for different models.

<Tip>

Disable the FreeU mechanism by calling `disable_freeu()` on a pipeline.

</Tip>
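For example, a minimal sketch of switching FreeU off again, which simply restores the pipeline's default behavior:

```py
pipeline.disable_freeu()
```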

And then run inference:

```py
prompt = "A squirrel eating a burger"
seed = 2023
image = pipeline(prompt, generator=torch.manual_seed(seed)).images[0]
image
```

The figure below compares results without and with FreeU for the same `prompt` and `seed` used above:

![](https://huggingface.co/datasets/YiYiXu/test-doc-assets/resolve/main/freeu/sdv1_5_freeu.jpg)

Let's see how Stable Diffusion 2 results are impacted:

```py
from diffusers import DiffusionPipeline
import torch

pipeline = DiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-2-1", torch_dtype=torch.float16, safety_checker=None
).to("cuda")

prompt = "A squirrel eating a burger"
seed = 2023

pipeline.enable_freeu(s1=0.9, s2=0.2, b1=1.1, b2=1.2)
image = pipeline(prompt, generator=torch.manual_seed(seed)).images[0]
image
```

![](https://huggingface.co/datasets/YiYiXu/test-doc-assets/resolve/main/freeu/sdv2_1_freeu.jpg)

## Stable Diffusion XL

Finally, let's take a look at how FreeU affects Stable Diffusion XL results:

```py
from diffusers import DiffusionPipeline
import torch

pipeline = DiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16,
).to("cuda")

prompt = "A squirrel eating a burger"
seed = 2023

# Comes from
# https://wandb.ai/nasirk24/UNET-FreeU-SDXL/reports/FreeU-SDXL-Optimal-Parameters--Vmlldzo1NDg4NTUw
pipeline.enable_freeu(s1=0.6, s2=0.4, b1=1.1, b2=1.2)
image = pipeline(prompt, generator=torch.manual_seed(seed)).images[0]
image
```

![](https://huggingface.co/datasets/YiYiXu/test-doc-assets/resolve/main/freeu/sdxl_freeu.jpg)

## Text-to-video generation

FreeU can also be used to improve video quality:

```python
from diffusers import DiffusionPipeline
from diffusers.utils import export_to_video
import torch

model_id = "cerspense/zeroscope_v2_576w"
pipe = DiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16).to("cuda")

prompt = "an astronaut riding a horse on mars"
seed = 2023

# The values come from
# https://github.com/lyn-rgb/FreeU_Diffusers#video-pipelines
pipe.enable_freeu(b1=1.2, b2=1.4, s1=0.9, s2=0.2)
video_frames = pipe(prompt, height=320, width=576, num_frames=30, generator=torch.manual_seed(seed)).frames[0]
export_to_video(video_frames, "astronaut_rides_horse.mp4")
```

Thanks to [kadirnar](https://github.com/kadirnar/) for helping to integrate the feature, and to [justindujardin](https://github.com/justindujardin) for the helpful discussions.

<!--Copyright 2024 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->

# Controlling image quality

The components of a diffusion model, like the UNet and scheduler, can be optimized to improve the quality of generated images, leading to better image lighting and details. These techniques are especially useful if you don't have the resources to simply use a larger model for inference. You can enable these techniques during inference without any additional training.

This guide will show you how to turn these techniques on in your pipeline and how to configure them to improve the quality of your generated images.

## Lighting

The Stable Diffusion models aren't very good at generating images that are very bright or dark because the scheduler doesn't start sampling from the last timestep and it doesn't enforce a zero signal-to-noise ratio (SNR). The [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://hf.co/papers/2305.08891) paper fixes these issues, which are now available in some Diffusers schedulers.

> [!TIP]
> For inference, you need a model that has been trained with *v_prediction*. To train your own model with *v_prediction*, add the following flag to the [train_text_to_image.py](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image.py) or [train_text_to_image_lora.py](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image_lora.py) scripts.
>
> ```bash
> --prediction_type="v_prediction"
> ```

For example, load the [ptx0/pseudo-journey-v2](https://hf.co/ptx0/pseudo-journey-v2) checkpoint, which was trained with `v_prediction`, and the [`DDIMScheduler`], then configure the following parameters in the [`DDIMScheduler`]:

* `rescale_betas_zero_snr=True` to rescale the noise schedule to zero SNR
* `timestep_spacing="trailing"` to start sampling from the last timestep

Also set `guidance_rescale` in the pipeline call to prevent over-exposure. A lower value increases brightness, but some of the details may appear washed out.

```py
import torch
from diffusers import DiffusionPipeline, DDIMScheduler

pipeline = DiffusionPipeline.from_pretrained("ptx0/pseudo-journey-v2", use_safetensors=True)

pipeline.scheduler = DDIMScheduler.from_config(
    pipeline.scheduler.config, rescale_betas_zero_snr=True, timestep_spacing="trailing"
)
pipeline.to("cuda")
prompt = "cinematic photo of a snowy mountain at night with the northern lights aurora borealis overhead, 35mm photograph, film, professional, 4k, highly detailed"
generator = torch.Generator(device="cpu").manual_seed(23)
image = pipeline(prompt, guidance_rescale=0.7, generator=generator).images[0]
image
```

<div class="flex gap-4">
  <div>
    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/no-zero-snr.png"/>
    <figcaption class="mt-2 text-center text-sm text-gray-500">default Stable Diffusion v2-1 image</figcaption>
  </div>
  <div>
    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/zero-snr.png"/>
    <figcaption class="mt-2 text-center text-sm text-gray-500">image with zero SNR and trailing timestep spacing enabled</figcaption>
  </div>
</div>

## Details

[FreeU](https://hf.co/papers/2309.11497) improves image details by rebalancing the UNet's backbone and skip connection weights. The skip connections can cause the model to overlook some of the backbone semantics, which may lead to unnatural image details in the generated image. This technique does not require any additional training and can be applied on the fly during inference for tasks like image-to-image and text-to-video.

Use the [`~pipelines.StableDiffusionMixin.enable_freeu`] method on your pipeline and configure the scaling factors for the backbone (`b1` and `b2`) and skip connections (`s1` and `s2`). The number after each scaling factor corresponds to the stage in the UNet where the factor is applied. Take a look at the [FreeU](https://github.com/ChenyangSi/FreeU#parameters) repository for reference hyperparameters for different models.

<hfoptions id="freeu">
<hfoption id="Stable Diffusion v1-5">

```py
import torch
from diffusers import DiffusionPipeline

pipeline = DiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, safety_checker=None
).to("cuda")
pipeline.enable_freeu(s1=0.9, s2=0.2, b1=1.5, b2=1.6)
generator = torch.Generator(device="cpu").manual_seed(33)
prompt = ""  # an empty prompt; replace it with your own text prompt if you like
image = pipeline(prompt, generator=generator).images[0]
image
```

<div class="flex gap-4">
  <div>
    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sdv15-no-freeu.png"/>
    <figcaption class="mt-2 text-center text-sm text-gray-500">FreeU disabled</figcaption>
  </div>
  <div>
    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sdv15-freeu.png"/>
    <figcaption class="mt-2 text-center text-sm text-gray-500">FreeU enabled</figcaption>
  </div>
</div>

</hfoption>
<hfoption id="Stable Diffusion v2-1">

```py
import torch
from diffusers import DiffusionPipeline

pipeline = DiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-2-1", torch_dtype=torch.float16, safety_checker=None
).to("cuda")
pipeline.enable_freeu(s1=0.9, s2=0.2, b1=1.4, b2=1.6)
generator = torch.Generator(device="cpu").manual_seed(80)
prompt = "A squirrel eating a burger"
image = pipeline(prompt, generator=generator).images[0]
image
```

<div class="flex gap-4">
  <div>
    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sdv21-no-freeu.png"/>
    <figcaption class="mt-2 text-center text-sm text-gray-500">FreeU disabled</figcaption>
  </div>
  <div>
    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sdv21-freeu.png"/>
    <figcaption class="mt-2 text-center text-sm text-gray-500">FreeU enabled</figcaption>
  </div>
</div>

</hfoption>
<hfoption id="Stable Diffusion XL">

```py
import torch
from diffusers import DiffusionPipeline

pipeline = DiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16,
).to("cuda")
pipeline.enable_freeu(s1=0.9, s2=0.2, b1=1.3, b2=1.4)
generator = torch.Generator(device="cpu").manual_seed(13)
prompt = "A squirrel eating a burger"
image = pipeline(prompt, generator=generator).images[0]
image
```

<div class="flex gap-4">
  <div>
    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sdxl-no-freeu.png"/>
    <figcaption class="mt-2 text-center text-sm text-gray-500">FreeU disabled</figcaption>
  </div>
  <div>
    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sdxl-freeu.png"/>
    <figcaption class="mt-2 text-center text-sm text-gray-500">FreeU enabled</figcaption>
  </div>
</div>

</hfoption>
<hfoption id="Zeroscope">

```py
import torch
from diffusers import DiffusionPipeline
from diffusers.utils import export_to_video

pipeline = DiffusionPipeline.from_pretrained(
    "damo-vilab/text-to-video-ms-1.7b", torch_dtype=torch.float16
).to("cuda")
# values come from https://github.com/lyn-rgb/FreeU_Diffusers#video-pipelines
pipeline.enable_freeu(b1=1.2, b2=1.4, s1=0.9, s2=0.2)
prompt = "Confident teddy bear surfer rides the wave in the tropics"
generator = torch.Generator(device="cpu").manual_seed(47)
video_frames = pipeline(prompt, generator=generator).frames[0]
export_to_video(video_frames, "teddy_bear.mp4", fps=10)
```

<div class="flex gap-4">
  <div>
    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/video-no-freeu.gif"/>
    <figcaption class="mt-2 text-center text-sm text-gray-500">FreeU disabled</figcaption>
  </div>
  <div>
    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/video-freeu.gif"/>
    <figcaption class="mt-2 text-center text-sm text-gray-500">FreeU enabled</figcaption>
  </div>
</div>

</hfoption>
</hfoptions>

Call the [`~pipelines.StableDiffusionMixin.disable_freeu`] method to disable FreeU.

```py
pipeline.disable_freeu()
```


### IP-Adapter masking

Binary masks specify which portion of the output image should be assigned to an IP-Adapter. This is useful for composing more than one IP-Adapter image. For each input IP-Adapter image, you must provide a binary mask.

To start, preprocess the input IP-Adapter images with the [`~image_processor.IPAdapterMaskProcessor.preprocess()`] method to generate their masks. For optimal results, provide the output height and width to [`~image_processor.IPAdapterMaskProcessor.preprocess()`]. This ensures masks with different aspect ratios are appropriately stretched. If the input masks already match the aspect ratio of the generated image, you don't have to set the `height` and `width`.
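The preprocessing code itself is elided in this diff excerpt. A minimal sketch of what it could look like, where the mask image URLs are placeholders for your own black-and-white mask images:

```py
from diffusers.image_processor import IPAdapterMaskProcessor
from diffusers.utils import load_image

# placeholder mask images; substitute your own masks
mask1 = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_mask_mask1.png")
mask2 = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_mask_mask2.png")

output_height, output_width = 1024, 1024

processor = IPAdapterMaskProcessor()
masks = processor.preprocess([mask1, mask2], height=output_height, width=output_width)
```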

When there is more than one input IP-Adapter image, load them as a list and provide the IP-Adapter scale list. Each of the input IP-Adapter images here corresponds to one of the masks generated above.

```py
pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="sdxl_models", weight_name=["ip-adapter-plus-face_sdxl_vit-h.safetensors"])
pipeline.set_ip_adapter_scale([[0.7, 0.7]])  # one scale for each image-mask pair

face_image1 = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_mask_girl1.png")
face_image2 = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_mask_girl2.png")

ip_images = [[face_image1, face_image2]]

masks = [masks.reshape(1, masks.shape[0], masks.shape[2], masks.shape[3])]
```

Now pass the preprocessed masks to `cross_attention_kwargs` in the pipeline call.

```py
generator = torch.Generator(device="cpu").manual_seed(0)
num_images = 1
```
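The rest of the call is elided in this diff excerpt. A minimal sketch of how the preprocessed masks might be passed, where the prompt and other generation arguments are placeholders rather than values from this guide:

```py
image = pipeline(
    prompt="a photo of two girls",  # placeholder prompt
    ip_adapter_image=ip_images,
    negative_prompt="monochrome, lowres, bad anatomy, worst quality, low quality",
    num_inference_steps=20,
    num_images_per_prompt=num_images,
    generator=generator,
    # the preprocessed masks are routed to the IP-Adapter attention processors
    cross_attention_kwargs={"ip_adapter_masks": masks},
).images[0]
```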

### Face model

Generating accurate faces is challenging because they are complex and nuanced. Diffusers supports two IP-Adapter checkpoints specifically trained to generate faces from the [h94/IP-Adapter](https://huggingface.co/h94/IP-Adapter) repository:

* [ip-adapter-full-face_sd15.safetensors](https://huggingface.co/h94/IP-Adapter/blob/main/models/ip-adapter-full-face_sd15.safetensors) is conditioned with images of cropped faces and removed backgrounds
* [ip-adapter-plus-face_sd15.safetensors](https://huggingface.co/h94/IP-Adapter/blob/main/models/ip-adapter-plus-face_sd15.safetensors) uses patch embeddings and is conditioned with images of cropped faces

Additionally, Diffusers supports all IP-Adapter checkpoints trained with face embeddings extracted by `insightface` face models. Supported models are from the [h94/IP-Adapter-FaceID](https://huggingface.co/h94/IP-Adapter-FaceID) repository.

For face models, use the [h94/IP-Adapter](https://huggingface.co/h94/IP-Adapter) checkpoint. It is also recommended to use [`DDIMScheduler`] or [`EulerDiscreteScheduler`] for face models.
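The loading and generation code for these face checkpoints is elided in this diff excerpt. A minimal sketch of how one of them might be loaded and used; the prompt, face image, scale, and seed are placeholders, not values from this guide:

```py
import torch
from diffusers import StableDiffusionPipeline, DDIMScheduler
from diffusers.utils import load_image

pipeline = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("cuda")
pipeline.scheduler = DDIMScheduler.from_config(pipeline.scheduler.config)
pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter-full-face_sd15.safetensors")
pipeline.set_ip_adapter_scale(0.5)  # placeholder scale

# placeholder face image; substitute a cropped photo of the face you want to condition on
face_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_mask_girl1.png")

generator = torch.Generator(device="cpu").manual_seed(26)
image = pipeline(
    prompt="A photo of a girl",
    ip_adapter_image=face_image,
    negative_prompt="monochrome, lowres, bad anatomy, worst quality, low quality",
    num_inference_steps=50,
    generator=generator,
).images[0]
```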
To use IP-Adapter FaceID models, first extract face embeddings with `insightface`. Then pass the list of tensors to the pipeline as `ip_adapter_image_embeds`.

```py
import cv2
import numpy as np
import torch
from diffusers import StableDiffusionPipeline, DDIMScheduler
from diffusers.utils import load_image
from insightface.app import FaceAnalysis

pipeline = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    torch_dtype=torch.float16,
).to("cuda")
pipeline.scheduler = DDIMScheduler.from_config(pipeline.scheduler.config)
pipeline.load_ip_adapter("h94/IP-Adapter-FaceID", subfolder=None, weight_name="ip-adapter-faceid_sd15.bin", image_encoder_folder=None)
pipeline.set_ip_adapter_scale(0.6)

image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_mask_girl1.png")

ref_images_embeds = []
app = FaceAnalysis(name="buffalo_l", providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
app.prepare(ctx_id=0, det_size=(640, 640))
image = cv2.cvtColor(np.asarray(image), cv2.COLOR_BGR2RGB)
faces = app.get(image)
image = torch.from_numpy(faces[0].normed_embedding)
ref_images_embeds.append(image.unsqueeze(0))
ref_images_embeds = torch.stack(ref_images_embeds, dim=0).unsqueeze(0)
neg_ref_images_embeds = torch.zeros_like(ref_images_embeds)
id_embeds = torch.cat([neg_ref_images_embeds, ref_images_embeds]).to(dtype=torch.float16, device="cuda")

generator = torch.Generator(device="cpu").manual_seed(42)

images = pipeline(
    prompt="A photo of a girl",
    ip_adapter_image_embeds=[id_embeds],
    negative_prompt="monochrome, lowres, bad anatomy, worst quality, low quality",
    num_inference_steps=20, num_images_per_prompt=1,
    generator=generator
).images
```

Both IP-Adapter FaceID Plus and Plus v2 models require CLIP image embeddings. You can prepare the face embeddings as shown previously, then extract and pass the CLIP image embeddings to the hidden image projection layers.

```py
from insightface.utils import face_align

ref_images_embeds = []
ip_adapter_images = []
app = FaceAnalysis(name="buffalo_l", providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
app.prepare(ctx_id=0, det_size=(640, 640))
image = cv2.cvtColor(np.asarray(image), cv2.COLOR_BGR2RGB)
faces = app.get(image)
ip_adapter_images.append(face_align.norm_crop(image, landmark=faces[0].kps, image_size=224))
image = torch.from_numpy(faces[0].normed_embedding)
ref_images_embeds.append(image.unsqueeze(0))
ref_images_embeds = torch.stack(ref_images_embeds, dim=0).unsqueeze(0)
neg_ref_images_embeds = torch.zeros_like(ref_images_embeds)
id_embeds = torch.cat([neg_ref_images_embeds, ref_images_embeds]).to(dtype=torch.float16, device="cuda")

clip_embeds = pipeline.prepare_ip_adapter_image_embeds(
    [ip_adapter_images], None, torch.device("cuda"), num_images, True)[0]

pipeline.unet.encoder_hid_proj.image_projection_layers[0].clip_embeds = clip_embeds.to(dtype=torch.float16)
pipeline.unet.encoder_hid_proj.image_projection_layers[0].shortcut = False  # True if Plus v2
```

### Multi IP-Adapter

More than one IP-Adapter can be used at the same time to generate specific images in more diverse styles. For example, you can use IP-Adapter-Face to generate consistent faces and characters, and IP-Adapter Plus to generate those faces in a specific style.
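The multi-adapter code is elided in this diff excerpt. A minimal sketch of how two IP-Adapters might be loaded side by side with per-adapter scales; the exact checkpoint combination and scales are assumptions for illustration, and the "vit-h" adapter checkpoints expect the matching ViT-H image encoder, which is loaded explicitly here:

```py
import torch
from diffusers import AutoPipelineForText2Image
from transformers import CLIPVisionModelWithProjection

# assumption: the "vit-h" adapters need the ViT-H image encoder from the models/ subfolder
image_encoder = CLIPVisionModelWithProjection.from_pretrained(
    "h94/IP-Adapter", subfolder="models/image_encoder", torch_dtype=torch.float16
)

pipeline = AutoPipelineForText2Image.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", image_encoder=image_encoder, torch_dtype=torch.float16
).to("cuda")

# load two adapters at once by passing lists of subfolders and weight names
pipeline.load_ip_adapter(
    "h94/IP-Adapter",
    subfolder=["sdxl_models", "sdxl_models"],
    weight_name=["ip-adapter-plus_sdxl_vit-h.safetensors", "ip-adapter-plus-face_sdxl_vit-h.safetensors"],
)
pipeline.set_ip_adapter_scale([0.7, 0.3])  # one scale per adapter
```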

<div class="flex justify-center">
  <img src="https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/ipa-controlnet-out.png" />
</div>

### Style & layout control

[InstantStyle](https://arxiv.org/abs/2404.02733) is a plug-and-play method on top of IP-Adapter that disentangles style and layout from the image prompt to control image generation. This way, you can generate images that follow only the style or layout of the image prompt, with significantly improved diversity. This is achieved by only activating IP-Adapter at specific parts of the model.

By default, IP-Adapter is inserted into all layers of the model. Use the [`~loaders.IPAdapterMixin.set_ip_adapter_scale`] method with a dictionary to assign scales to the IP-Adapter at different layers.

```py
from diffusers import AutoPipelineForText2Image
from diffusers.utils import load_image
import torch

pipeline = AutoPipelineForText2Image.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16).to("cuda")
pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="sdxl_models", weight_name="ip-adapter_sdxl.bin")

scale = {
    "down": {"block_2": [0.0, 1.0]},
    "up": {"block_0": [0.0, 1.0, 0.0]},
}
pipeline.set_ip_adapter_scale(scale)
```

This will activate IP-Adapter at the second layer in the model's down-part block 2 and up-part block 0. The former is the layer where IP-Adapter injects layout information and the latter injects style. Inserting IP-Adapter into these two layers lets you generate images that follow both the style and layout of the image prompt, but with content more aligned to the text prompt.

```py
style_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg")

generator = torch.Generator(device="cpu").manual_seed(26)
image = pipeline(
    prompt="a cat, masterpiece, best quality, high quality",
    ip_adapter_image=style_image,
    negative_prompt="text, watermark, lowres, low quality, worst quality, deformed, glitch, low contrast, noisy, saturation, blurry",
    guidance_scale=5,
    num_inference_steps=30,
    generator=generator,
).images[0]
image
```

<div class="flex flex-row gap-4">
  <div class="flex-1">
    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg"/>
    <figcaption class="mt-2 text-center text-sm text-gray-500">IP-Adapter image</figcaption>
  </div>
  <div class="flex-1">
    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png"/>
    <figcaption class="mt-2 text-center text-sm text-gray-500">generated image</figcaption>
  </div>
</div>

In contrast, inserting IP-Adapter into all layers will often generate images that overly focus on the image prompt and diminish diversity.

Activate IP-Adapter only in the style layer and then call the pipeline again.

```py
scale = {
    "up": {"block_0": [0.0, 1.0, 0.0]},
}
pipeline.set_ip_adapter_scale(scale)

generator = torch.Generator(device="cpu").manual_seed(26)
image = pipeline(
    prompt="a cat, masterpiece, best quality, high quality",
    ip_adapter_image=style_image,
    negative_prompt="text, watermark, lowres, low quality, worst quality, deformed, glitch, low contrast, noisy, saturation, blurry",
    guidance_scale=5,
    num_inference_steps=30,
    generator=generator,
).images[0]
image
```

<div class="flex flex-row gap-4">
  <div class="flex-1">
    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_only.png"/>
    <figcaption class="mt-2 text-center text-sm text-gray-500">IP-Adapter only in style layer</figcaption>
  </div>
  <div class="flex-1">
    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_ip_adapter.png"/>
    <figcaption class="mt-2 text-center text-sm text-gray-500">IP-Adapter in all layers</figcaption>
  </div>
</div>

Note that you don't have to specify all layers in the dictionary. Layers not included in the dictionary are set to a scale of 0, which disables IP-Adapter in those layers by default.


# Load pipelines

[[open-in-colab]]

Diffusion systems consist of multiple components like parameterized models and schedulers that interact in complex ways. That is why we designed the [`DiffusionPipeline`] to wrap the complexity of the entire diffusion system into an easy-to-use API. At the same time, the [`DiffusionPipeline`] is entirely customizable so you can modify each component to build a diffusion system for your use case.

This guide will show you how to load:

- pipelines from the Hub and locally
- different components into a pipeline
- multiple pipelines without increasing memory usage
- checkpoint variants such as different floating point types or non-exponential mean averaged (EMA) weights
- models and schedulers

## Load a pipeline

> [!TIP]
> Skip to the [DiffusionPipeline explained](#diffusionpipeline-explained) section if you're interested in an explanation about how the [`DiffusionPipeline`] class works.

There are two ways to load a pipeline for a task:

1. Load the generic [`DiffusionPipeline`] class and allow it to automatically detect the correct pipeline class from the checkpoint.
2. Load a specific pipeline class for a specific task.

<hfoptions id="pipelines">
<hfoption id="generic pipeline">

The [`DiffusionPipeline`] class is a simple and generic way to load the latest trending diffusion model from the [Hub](https://huggingface.co/models?library=diffusers&sort=trending). It uses the [`~DiffusionPipeline.from_pretrained`] method to automatically detect the correct pipeline class for a task from the checkpoint, downloads and caches all the required configuration and weight files, and returns a pipeline ready for inference.

```python
from diffusers import DiffusionPipeline

pipeline = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", use_safetensors=True)
```

This same checkpoint can also be used for an image-to-image task. The [`DiffusionPipeline`] class can handle any task as long as you provide the appropriate inputs. For example, for an image-to-image task, you need to pass an initial image to the pipeline.

```py
from diffusers import DiffusionPipeline
from diffusers.utils import load_image

pipeline = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", use_safetensors=True)

init_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/img2img-init.png")
prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
image = pipeline(prompt, image=init_image).images[0]
```

</hfoption>
<hfoption id="specific pipeline">

Checkpoints can be loaded by their specific pipeline class if you already know it. For example, to load a Stable Diffusion model, use the [`StableDiffusionPipeline`] class.

```python
from diffusers import StableDiffusionPipeline

pipeline = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", use_safetensors=True)
```

This same checkpoint may also be used for another task like image-to-image. To differentiate what task you want to use the checkpoint for, you have to use the corresponding task-specific pipeline class. For example, to use the same checkpoint for image-to-image, use the [`StableDiffusionImg2ImgPipeline`] class.

```py
from diffusers import StableDiffusionImg2ImgPipeline

pipeline = StableDiffusionImg2ImgPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", use_safetensors=True)
```

</hfoption>
</hfoptions>

Use the Space below to gauge a pipeline's memory requirements before you download and load it, to see if it runs on your hardware.

<div class="block dark:hidden">
  <iframe
    src="https://diffusers-compute-pipeline-size.hf.space?__theme=light"
    width="850"
    height="1600"
  ></iframe>
</div>
<div class="hidden dark:block">
  <iframe
    src="https://diffusers-compute-pipeline-size.hf.space?__theme=dark"
    width="850"
    height="1600"
  ></iframe>
</div>

### Local pipeline

To load a pipeline locally, use [git-lfs](https://git-lfs.github.com/) to manually download a checkpoint to your local disk.

```bash
git-lfs install
git clone https://huggingface.co/runwayml/stable-diffusion-v1-5
```

This creates a local folder, ./stable-diffusion-v1-5, on your disk and you should pass its path to [`~DiffusionPipeline.from_pretrained`].

```python
from diffusers import DiffusionPipeline

stable_diffusion = DiffusionPipeline.from_pretrained("./stable-diffusion-v1-5", use_safetensors=True)
```

The [`~DiffusionPipeline.from_pretrained`] method won't download files from the Hub when it detects a local path, but this also means it won't download and cache the latest changes to a checkpoint.

## Customize a pipeline

You can customize a pipeline by loading different components into it. This is important because you can:

- change to a scheduler with faster generation speed or higher generation quality depending on your needs (check the `scheduler.compatibles` attribute on your pipeline's scheduler to see compatible schedulers)
- change a default pipeline component to a newer and better performing one
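For example, a quick way to list the schedulers you could swap in (a minimal sketch; any pipeline checkpoint works here):

```py
from diffusers import DiffusionPipeline

pipeline = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", use_safetensors=True)

# returns a list of scheduler classes that are compatible with this pipeline's scheduler
pipeline.scheduler.compatibles
```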

For example, let's customize the default [stabilityai/stable-diffusion-xl-base-1.0](https://hf.co/stabilityai/stable-diffusion-xl-base-1.0) checkpoint with:

- The [`HeunDiscreteScheduler`] to generate higher quality images at the expense of slower generation speed. You must pass the `subfolder="scheduler"` parameter in [`~HeunDiscreteScheduler.from_pretrained`] to load the scheduler configuration from the correct [subfolder](https://hf.co/stabilityai/stable-diffusion-xl-base-1.0/tree/main/scheduler) of the pipeline repository.
- A more stable VAE that runs in fp16.

```py
from diffusers import StableDiffusionXLPipeline, HeunDiscreteScheduler, AutoencoderKL
import torch

scheduler = HeunDiscreteScheduler.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", subfolder="scheduler")
vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16, use_safetensors=True)
```

Now pass the new scheduler and VAE to the [`StableDiffusionXLPipeline`].

```py
pipeline = StableDiffusionXLPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    scheduler=scheduler,
    vae=vae,
    torch_dtype=torch.float16,
    variant="fp16",
    use_safetensors=True
).to("cuda")
```

## Reuse a pipeline

When you load multiple pipelines that share the same model components, it makes sense to reuse the shared components instead of reloading everything into memory again, especially if your hardware is memory-constrained. For example:

1. You generated an image with the [`StableDiffusionPipeline`] but you want to improve its quality with the [`StableDiffusionSAGPipeline`]. Both of these pipelines share the same pretrained model, so it'd be a waste of memory to load the same model twice.
2. You want to add a model component, like a [`MotionAdapter`](../api/pipelines/animatediff#animatediffpipeline), to [`AnimateDiffPipeline`] which was instantiated from an existing [`StableDiffusionPipeline`]. Again, both pipelines share the same pretrained model, so it'd be a waste of memory to load an entirely new pipeline again.

With the [`DiffusionPipeline.from_pipe`] API, you can switch between multiple pipelines to take advantage of their different features without increasing memory usage. It is similar to turning a feature on and off in your pipeline.

> [!TIP]
> To switch between tasks (rather than features), use the [`~DiffusionPipeline.from_pipe`] method with the [AutoPipeline](../api/pipelines/auto_pipeline) class, which automatically identifies the pipeline class based on the task (learn more in the [AutoPipeline](../tutorials/autopipeline) tutorial).

Let's start with a [`StableDiffusionPipeline`] and then reuse the loaded model components to create a [`StableDiffusionSAGPipeline`] to increase generation quality. You'll use the [`StableDiffusionPipeline`] with an [IP-Adapter](./ip_adapter) to generate a bear eating pizza.

```python
import torch
import gc
from diffusers import DiffusionPipeline, StableDiffusionSAGPipeline
from diffusers.utils import load_image
from accelerate.utils import compute_module_sizes

image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/load_neg_embed.png")

pipe_sd = DiffusionPipeline.from_pretrained("SG161222/Realistic_Vision_V6.0_B1_noVAE", torch_dtype=torch.float16)
pipe_sd.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin")
pipe_sd.set_ip_adapter_scale(0.6)
pipe_sd.to("cuda")

generator = torch.Generator(device="cpu").manual_seed(33)
out_sd = pipe_sd(
    prompt="bear eats pizza",
    negative_prompt="wrong white balance, dark, sketches,worst quality,low quality",
    ip_adapter_image=image,
    num_inference_steps=50,
    generator=generator,
).images[0]
out_sd
```

<div class="flex justify-center">
  <img class="rounded-xl" src="https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/from_pipe_out_sd_0.png"/>
</div>

For reference, you can check how much memory this process consumed.

```python
def bytes_to_giga_bytes(bytes):
    return bytes / 1024 / 1024 / 1024
print(f"Max memory allocated: {bytes_to_giga_bytes(torch.cuda.max_memory_allocated())} GB")
"Max memory allocated: 4.406213283538818 GB"
```

Now, reuse the same pipeline components from [`StableDiffusionPipeline`] in [`StableDiffusionSAGPipeline`] with the [`~DiffusionPipeline.from_pipe`] method.

> [!WARNING]
> Some pipeline methods may not function properly on new pipelines created with [`~DiffusionPipeline.from_pipe`]. For instance, the [`~DiffusionPipeline.enable_model_cpu_offload`] method installs hooks on the model components based on a unique offloading sequence for each pipeline. If the models are executed in a different order in the new pipeline, the CPU offloading may not work correctly.
>
> To ensure everything works as expected, we recommend re-applying a pipeline method on a new pipeline created with [`~DiffusionPipeline.from_pipe`].

```python
pipe_sag = StableDiffusionSAGPipeline.from_pipe(
    pipe_sd
)

generator = torch.Generator(device="cpu").manual_seed(33)
out_sag = pipe_sag(
    prompt="bear eats pizza",
    negative_prompt="wrong white balance, dark, sketches,worst quality,low quality",
    ip_adapter_image=image,
    num_inference_steps=50,
    generator=generator,
    guidance_scale=1.0,
    sag_scale=0.75
).images[0]
out_sag
```

<div class="flex justify-center">
  <img class="rounded-xl" src="https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/from_pipe_out_sag_1.png"/>
</div>

If you check the memory usage, you'll see it remains the same as before because [`StableDiffusionPipeline`] and [`StableDiffusionSAGPipeline`] are sharing the same pipeline components. This allows you to use them interchangeably without any additional memory overhead.

```py
print(f"Max memory allocated: {bytes_to_giga_bytes(torch.cuda.max_memory_allocated())} GB")
"Max memory allocated: 4.406213283538818 GB"
```

Let's animate the image with the [`AnimateDiffPipeline`] and also add a [`MotionAdapter`] module to the pipeline. For the [`AnimateDiffPipeline`], you need to unload the IP-Adapter first and reload it *after* you've created your new pipeline (this only applies to the [`AnimateDiffPipeline`]).

```py
from diffusers import AnimateDiffPipeline, MotionAdapter, DDIMScheduler
from diffusers.utils import export_to_gif

pipe_sag.unload_ip_adapter()
adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2", torch_dtype=torch.float16)

pipe_animate = AnimateDiffPipeline.from_pipe(pipe_sd, motion_adapter=adapter)
pipe_animate.scheduler = DDIMScheduler.from_config(pipe_animate.scheduler.config, beta_schedule="linear")
# load IP-Adapter and LoRA weights again
pipe_animate.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin")
pipe_animate.load_lora_weights("guoyww/animatediff-motion-lora-zoom-out", adapter_name="zoom-out")
pipe_animate.to("cuda")

generator = torch.Generator(device="cpu").manual_seed(33)
pipe_animate.set_adapters("zoom-out", adapter_weights=0.75)
out = pipe_animate(
    prompt="bear eats pizza",
    num_frames=16,
    num_inference_steps=50,
    ip_adapter_image=image,
    generator=generator,
).frames[0]
export_to_gif(out, "out_animate.gif")
```

<div class="flex justify-center">
  <img class="rounded-xl" src="https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/from_pipe_out_animate_3.gif"/>
</div>

The [`AnimateDiffPipeline`] is more memory-intensive and consumes 15GB of memory (see the [Memory usage of from_pipe](#memory-usage-of-from_pipe) section to learn what this means for your memory usage).

```py
print(f"Max memory allocated: {bytes_to_giga_bytes(torch.cuda.max_memory_allocated())} GB")
"Max memory allocated: 15.178664207458496 GB"
```

### Modify from_pipe components

Pipelines loaded with [`~DiffusionPipeline.from_pipe`] can be customized with different model components or methods. However, whenever you modify the *state* of the model components, it affects all the other pipelines that share the same components. For example, if you call [`~diffusers.loaders.IPAdapterMixin.unload_ip_adapter`] on the [`StableDiffusionSAGPipeline`], you won't be able to use IP-Adapter with the [`StableDiffusionPipeline`] because it's been removed from their shared components.

```py
pipe_sag.unload_ip_adapter()

generator = torch.Generator(device="cpu").manual_seed(33)
out_sd = pipe_sd(
    prompt="bear eats pizza",
    negative_prompt="wrong white balance, dark, sketches,worst quality,low quality",
    ip_adapter_image=image,
    num_inference_steps=50,
    generator=generator,
).images[0]
"AttributeError: 'NoneType' object has no attribute 'image_projection_layers'"
```

### Memory usage of from_pipe

The memory requirement of loading multiple pipelines with [`~DiffusionPipeline.from_pipe`] is determined by the pipeline with the highest memory usage, regardless of the number of pipelines you create.

| Pipeline | Memory usage (GB) |
|---|---|
| StableDiffusionPipeline | 4.400 |
| StableDiffusionSAGPipeline | 4.400 |
| AnimateDiffPipeline | 15.178 |

The [`AnimateDiffPipeline`] has the highest memory requirement, so the *total memory usage* is based only on the [`AnimateDiffPipeline`]. Your memory usage will not increase if you create additional pipelines, as long as their memory requirements don't exceed that of the [`AnimateDiffPipeline`]. Each pipeline can be used interchangeably without any additional memory overhead.

## Safety checker

Diffusers implements a [safety checker](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/safety_checker.py) for Stable Diffusion models which can generate harmful content. The safety checker screens the generated output against known hardcoded not-safe-for-work (NSFW) content. If for whatever reason you'd like to disable the safety checker, pass `safety_checker=None` to the [`~DiffusionPipeline.from_pretrained`] method.

```python
from diffusers import DiffusionPipeline

pipeline = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", safety_checker=None, use_safetensors=True)
"""
You have disabled the safety checker for <class 'diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline'> by passing `safety_checker=None`. Ensure that you abide by the conditions of the Stable Diffusion license and do not expose unfiltered results in services or applications open to the public. Both the diffusers team and Hugging Face strongly recommend keeping the safety filter enabled in all public-facing circumstances, disabling it only for use cases that involve analyzing network behavior or auditing its results. For more information, please have a look at https://github.com/huggingface/diffusers/pull/254 .
"""
```

### Reuse components across pipelines

You can also reuse the same components in multiple pipelines to avoid loading the weights into RAM twice. Use the [`~DiffusionPipeline.components`] attribute to save the components:

```python
from diffusers import StableDiffusionPipeline, StableDiffusionImg2ImgPipeline

model_id = "runwayml/stable-diffusion-v1-5"
stable_diffusion_txt2img = StableDiffusionPipeline.from_pretrained(model_id, use_safetensors=True)

components = stable_diffusion_txt2img.components
```

Then you can pass the `components` to another pipeline without reloading the weights into RAM:

```py
stable_diffusion_img2img = StableDiffusionImg2ImgPipeline(**components)
```

You can also pass the components individually to the pipeline if you want more flexibility over which components to reuse or disable. For example, to reuse the same components from the text-to-image pipeline in the image-to-image pipeline, except for the safety checker and feature extractor:

```py
from diffusers import StableDiffusionPipeline, StableDiffusionImg2ImgPipeline

model_id = "runwayml/stable-diffusion-v1-5"
stable_diffusion_txt2img = StableDiffusionPipeline.from_pretrained(model_id, use_safetensors=True)
stable_diffusion_img2img = StableDiffusionImg2ImgPipeline(
    vae=stable_diffusion_txt2img.vae,
    text_encoder=stable_diffusion_txt2img.text_encoder,
    tokenizer=stable_diffusion_txt2img.tokenizer,
    unet=stable_diffusion_txt2img.unet,
    scheduler=stable_diffusion_txt2img.scheduler,
    safety_checker=None,
    feature_extractor=None,
    requires_safety_checker=False,
)
```


## Checkpoint variants

A checkpoint variant is usually a checkpoint whose weights are:

- Stored in a different floating point type for lower precision and lower storage, such as [`torch.float16`](https://pytorch.org/docs/stable/tensors.html#data-types), because it only requires half the bandwidth and storage to download. You can't use this variant if you're continuing training or using a CPU.
- Non-exponential mean averaged (EMA) weights, which shouldn't be used for inference. You should use these to continue fine-tuning a model.

> [!TIP]
> When the checkpoints have identical model structures, but they were trained on different datasets and with a different training setup, they should be stored in separate repositories. For example, [stabilityai/stable-diffusion-2](https://hf.co/stabilityai/stable-diffusion-2) and [stabilityai/stable-diffusion-2-1](https://hf.co/stabilityai/stable-diffusion-2-1) are stored in separate repositories.

Otherwise, a variant is **identical** to the original checkpoint. They have exactly the same serialization format (like [safetensors](./using_safetensors)), model structure, and their weights have identical tensor shapes.

| **checkpoint type** | **weight name**                             | **argument for loading weights** |
|---------------------|---------------------------------------------|----------------------------------|
| original            | diffusion_pytorch_model.safetensors         |                                  |
| floating point      | diffusion_pytorch_model.fp16.safetensors    | `variant`, `torch_dtype`         |
| non-EMA             | diffusion_pytorch_model.non_ema.safetensors | `variant`                        |

There are two important arguments for loading variants:

- `torch_dtype` specifies the floating point precision of the loaded checkpoint. For example, if you want to save bandwidth by loading a fp16 variant, you should set `variant="fp16"` and `torch_dtype=torch.float16` to *convert the weights* to fp16. Otherwise, the fp16 weights are converted to the default fp32 precision. If you only set `torch_dtype=torch.float16`, the default fp32 weights are downloaded first and then converted to fp16.

- `variant` specifies which files should be loaded from the repository. For example, if you want to load a non-EMA variant of a UNet from [runwayml/stable-diffusion-v1-5](https://hf.co/runwayml/stable-diffusion-v1-5/tree/main/unet), set `variant="non_ema"` to download the `non_ema` file.

<hfoptions id="variants">
<hfoption id="fp16">

```py
from diffusers import DiffusionPipeline
import torch

pipeline = DiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", variant="fp16", torch_dtype=torch.float16, use_safetensors=True
)
```

</hfoption>
<hfoption id="non-EMA">

```py
pipeline = DiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", variant="non_ema", use_safetensors=True
)
```

</hfoption>
</hfoptions>

Use the `variant` parameter in the [`DiffusionPipeline.save_pretrained`] method to save a checkpoint as a different floating point type or as a non-EMA variant. You should try to save a variant to the same folder as the original checkpoint, so you have the option of loading both from the same folder.

<hfoptions id="save">
<hfoption id="fp16">

```py
pipeline.save_pretrained("runwayml/stable-diffusion-v1-5", variant="fp16")
```

</hfoption>
<hfoption id="non_ema">

```py
pipeline.save_pretrained("runwayml/stable-diffusion-v1-5", variant="non_ema")
```

</hfoption>
</hfoptions>

If you don't save the variant to an existing folder, you must specify the `variant` argument otherwise it'll throw an `Exception` because it can't find the original checkpoint.

```python
# 👎 this won't work
pipeline = DiffusionPipeline.from_pretrained(
    "./stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True
)
# 👍 this works
pipeline = DiffusionPipeline.from_pretrained(
    "./stable-diffusion-v1-5", variant="fp16", torch_dtype=torch.float16, use_safetensors=True
)
```

<!--
TODO(Patrick) - Make sure to uncomment this part as soon as things are deprecated.

#### Using `revision` to load pipeline variants is deprecated

Previously the `revision` argument of [`DiffusionPipeline.from_pretrained`] was heavily used to
load model variants, e.g.:

```python
from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", revision="fp16", use_safetensors=True)
```

However, this behavior is now deprecated since the `revision` argument should instead be used (just as it's done on GitHub) to load model checkpoints from a specific commit or branch in development.

The above example is therefore deprecated and won't be supported anymore for `diffusers >= 1.0.0`.

<Tip warning={true}>

If you load diffusers pipelines or models with `revision="fp16"` or `revision="non_ema"`,
please make sure to update the code and use `variant="fp16"` or `variant="non_ema"` respectively
instead.

</Tip>
-->

## Models

Models are loaded from the [`ModelMixin.from_pretrained`] method, which downloads and caches the latest version of the model weights and configurations. If the latest files are available in the local cache, [`~ModelMixin.from_pretrained`] reuses files in the cache instead of re-downloading them.

Models can be loaded from a subfolder with the `subfolder` argument. For example, the model weights for `runwayml/stable-diffusion-v1-5` are stored in the [`unet`](https://huggingface.co/runwayml/stable-diffusion-v1-5/tree/main/unet) subfolder:

```python
from diffusers import UNet2DConditionModel

repo_id = "runwayml/stable-diffusion-v1-5"
model = UNet2DConditionModel.from_pretrained(repo_id, subfolder="unet", use_safetensors=True)
```

Or directly from a repository's [directory](https://huggingface.co/google/ddpm-cifar10-32/tree/main):

```python
from diffusers import UNet2DModel

repo_id = "google/ddpm-cifar10-32"
model = UNet2DModel.from_pretrained(repo_id, use_safetensors=True)
```

You can also load and save model variants by specifying the `variant` argument in [`ModelMixin.from_pretrained`] and [`ModelMixin.save_pretrained`]:

```python
from diffusers import UNet2DConditionModel

model = UNet2DConditionModel.from_pretrained(
    "runwayml/stable-diffusion-v1-5", subfolder="unet", variant="non_ema", use_safetensors=True
)
model.save_pretrained("./local-unet", variant="non_ema")
```

## Schedulers

Schedulers are loaded from the [`SchedulerMixin.from_pretrained`] method, and unlike models, schedulers are **not parameterized** or **trained**; they are defined by a configuration file.

Loading schedulers does not consume any significant amount of memory and the same configuration file can be used for a variety of different schedulers.
For example, the following schedulers are compatible with [`StableDiffusionPipeline`], which means you can load the same scheduler configuration file in any of these classes:

```python
from diffusers import StableDiffusionPipeline
from diffusers import (
    DDPMScheduler,
    DDIMScheduler,
    PNDMScheduler,
    LMSDiscreteScheduler,
    EulerAncestralDiscreteScheduler,
    EulerDiscreteScheduler,
    DPMSolverMultistepScheduler,
)

repo_id = "runwayml/stable-diffusion-v1-5"

ddpm = DDPMScheduler.from_pretrained(repo_id, subfolder="scheduler")
ddim = DDIMScheduler.from_pretrained(repo_id, subfolder="scheduler")
pndm = PNDMScheduler.from_pretrained(repo_id, subfolder="scheduler")
lms = LMSDiscreteScheduler.from_pretrained(repo_id, subfolder="scheduler")
euler_anc = EulerAncestralDiscreteScheduler.from_pretrained(repo_id, subfolder="scheduler")
euler = EulerDiscreteScheduler.from_pretrained(repo_id, subfolder="scheduler")
dpm = DPMSolverMultistepScheduler.from_pretrained(repo_id, subfolder="scheduler")

# replace `dpm` with any of `ddpm`, `ddim`, `pndm`, `lms`, `euler_anc`, `euler`
pipeline = StableDiffusionPipeline.from_pretrained(repo_id, scheduler=dpm, use_safetensors=True)
```

## DiffusionPipeline explained

As a class method, [`DiffusionPipeline.from_pretrained`] is responsible for two things:

@@ -153,18 +153,43 @@ image
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/load_attn_proc.png" />
|
||||
</div>
|
||||
|
||||
<Tip>
|
||||
|
||||
For both [`~loaders.LoraLoaderMixin.load_lora_weights`] and [`~loaders.UNet2DConditionLoadersMixin.load_attn_procs`], you can pass the `cross_attention_kwargs={"scale": 0.5}` parameter to adjust how much of the LoRA weights to use. A value of `0` is the same as only using the base model weights, and a value of `1` is equivalent to using the fully finetuned LoRA.
|
||||
|
||||
</Tip>
|
||||
|
||||
To unload the LoRA weights, use the [`~loaders.LoraLoaderMixin.unload_lora_weights`] method to discard the LoRA weights and restore the model to its original weights:
|
||||
|
||||
```py
|
||||
pipeline.unload_lora_weights()
|
||||
```
|
||||
|
||||
### Adjust LoRA weight scale
|
||||
|
||||
For both [`~loaders.LoraLoaderMixin.load_lora_weights`] and [`~loaders.UNet2DConditionLoadersMixin.load_attn_procs`], you can pass the `cross_attention_kwargs={"scale": 0.5}` parameter to adjust how much of the LoRA weights to use. A value of `0` is the same as only using the base model weights, and a value of `1` is equivalent to using the fully finetuned LoRA.
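
As a rough sketch of what this looks like at inference time (the prompt is a placeholder, and a pipeline with LoRA weights already loaded is assumed):

```py
# assumes `pipeline` already called load_lora_weights() with a LoRA checkpoint
image = pipeline(
    "a cute corgi, pixel art style",
    cross_attention_kwargs={"scale": 0.5},  # blend 50% LoRA with 50% base model weights
).images[0]
```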

For more granular control on the amount of LoRA weights used per layer, you can use [`~loaders.LoraLoaderMixin.set_adapters`] and pass a dictionary specifying how much to scale the weights in each layer.

```python
pipe = ...  # create pipeline
pipe.load_lora_weights(..., adapter_name="my_adapter")
scales = {
    "text_encoder": 0.5,
    "text_encoder_2": 0.5,  # only usable if pipe has a 2nd text encoder
    "unet": {
        "down": 0.9,  # all transformers in the down-part will use scale 0.9
        # "mid"  # in this example "mid" is not given, therefore all transformers in the mid part will use the default scale 1.0
        "up": {
            "block_0": 0.6,  # all 3 transformers in the 0th block in the up-part will use scale 0.6
            "block_1": [0.4, 0.8, 1.0],  # the 3 transformers in the 1st block in the up-part will use scales 0.4, 0.8 and 1.0 respectively
        }
    }
}
pipe.set_adapters("my_adapter", scales)
```

This also works with multiple adapters - see [this guide](https://huggingface.co/docs/diffusers/tutorials/using_peft_for_inference#customize-adapters-strength) for how to do it.
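
For instance, a minimal sketch with two previously loaded adapters (the adapter names below are placeholders, not from the original example):

```py
# "pixel" and "toy" stand in for adapters loaded earlier with load_lora_weights(..., adapter_name=...)
pipe.set_adapters(["pixel", "toy"], adapter_weights=[0.7, 0.3])
```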

<Tip warning={true}>

Currently, [`~loaders.LoraLoaderMixin.set_adapters`] only supports scaling attention weights. If a LoRA has other parts (e.g., resnets or down-/upsamplers), they will keep a scale of 1.0.

</Tip>

### Kohya and TheLastBen

Other popular LoRA trainers from the community include those by [Kohya](https://github.com/kohya-ss/sd-scripts/) and [TheLastBen](https://github.com/TheLastBen/fast-stable-diffusion). These trainers create different LoRA checkpoints than those trained by 🤗 Diffusers, but they can still be loaded in the same way.
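
As a hedged sketch, a Kohya-style file can be passed to the same loader (the path and weight name below are placeholders):

```py
# placeholders; point these at an actual Kohya-style LoRA file on disk or on the Hub
pipeline.load_lora_weights("path/to/kohya-lora", weight_name="my_kohya_lora.safetensors")
```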

@@ -295,3 +320,40 @@ pipeline = AutoPipelineForText2Image.from_pretrained(

pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="sdxl_models", weight_name="ip-adapter-plus_sdxl_vit-h.safetensors")
```

### IP-Adapter Face ID models

The IP-Adapter FaceID models are experimental IP Adapters that use image embeddings generated by `insightface` instead of CLIP image embeddings. Some of these models also use LoRA to improve ID consistency.
You need to install `insightface` and all its requirements to use these models.

<Tip warning={true}>
As InsightFace pretrained models are available for non-commercial research purposes, IP-Adapter-FaceID models are released exclusively for research purposes and are not intended for commercial use.
</Tip>

```py
pipeline = AutoPipelineForText2Image.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    torch_dtype=torch.float16
).to("cuda")

pipeline.load_ip_adapter("h94/IP-Adapter-FaceID", subfolder=None, weight_name="ip-adapter-faceid_sdxl.bin", image_encoder_folder=None)
```

If you want to use one of the two IP-Adapter FaceID Plus models, you must also load the CLIP image encoder, as these models use both `insightface` and CLIP image embeddings to achieve better photorealism.

```py
from transformers import CLIPVisionModelWithProjection

image_encoder = CLIPVisionModelWithProjection.from_pretrained(
    "laion/CLIP-ViT-H-14-laion2B-s32B-b79K",
    torch_dtype=torch.float16,
)

pipeline = AutoPipelineForText2Image.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    image_encoder=image_encoder,
    torch_dtype=torch.float16
).to("cuda")

pipeline.load_ip_adapter("h94/IP-Adapter-FaceID", subfolder=None, weight_name="ip-adapter-faceid-plus_sd15.bin")
```

@@ -1,17 +0,0 @@

<!--Copyright 2024 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->

# Overview

🧨 Diffusers offers many pipelines, models, and schedulers for generative tasks. To make loading these components as simple as possible, we provide a single and unified method - `from_pretrained()` - that loads any of these components from either the Hugging Face [Hub](https://huggingface.co/models?library=diffusers&sort=downloads) or your local machine. Whenever you load a pipeline or model, the latest files are automatically downloaded and cached so you can quickly reuse them next time without redownloading the files.

This section will show you everything you need to know about loading pipelines, how to load different components in a pipeline, how to load checkpoint variants, and how to load community pipelines. You'll also learn how to load schedulers and compare the speed and quality trade-offs of using different schedulers. Finally, you'll see how to convert and load KerasCV checkpoints so you can use them in PyTorch with 🧨 Diffusers.

@@ -1,21 +0,0 @@

<!--Copyright 2024 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->

# Using Diffusers with other modalities

Diffusers is in the process of expanding to modalities other than images.

Example type | Colab | Pipeline |
:-------------------------:|:-------------------------:|:-------------------------:|
[Molecule conformation](https://www.nature.com/subjects/molecular-conformation#:~:text=Definition,to%20changes%20in%20their%20environment.) generation | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/geodiff_molecule_conformation.ipynb) | ❌

More coming soon!

docs/source/en/using-diffusers/overview_techniques.md (new file, 18 lines)
@@ -0,0 +1,18 @@

<!--Copyright 2024 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->

# Overview

The inference pipeline supports and enables a wide range of techniques that are divided into two categories:

* Pipeline functionality: these techniques modify the pipeline or extend it for other applications. For example, pipeline callbacks add new features to a pipeline and a pipeline can also be extended for distributed inference.
* Improve inference quality: these techniques increase the visual quality of the generated images. For example, you can enhance your prompts with GPT2 to create better images with lower effort.

@@ -1,17 +0,0 @@

<!--Copyright 2024 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->

# Overview

A pipeline is an end-to-end class that provides a quick and easy way to use a diffusion system for inference by bundling independently trained models and schedulers together. Certain combinations of models and schedulers define specific pipeline types, like [`StableDiffusionXLPipeline`] or [`StableDiffusionControlNetPipeline`], with specific capabilities. All pipeline types inherit from the base [`DiffusionPipeline`] class; pass it any checkpoint, and it'll automatically detect the pipeline type and load the necessary components.
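
A minimal sketch of that auto-detection (the printed class name assumes a Stable Diffusion v1-5 checkpoint):

```py
from diffusers import DiffusionPipeline

# DiffusionPipeline reads the checkpoint's model_index.json and picks the matching pipeline class
pipeline = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", use_safetensors=True)
print(pipeline.__class__.__name__)  # StableDiffusionPipeline
```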

This section demonstrates how to use specific pipelines such as Stable Diffusion XL, ControlNet, and DiffEdit. You'll also learn how to use a distilled version of the Stable Diffusion model to speed up inference, how to create reproducible pipelines, and how to use and contribute community pipelines.
@@ -1,191 +0,0 @@
|
||||
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Create reproducible pipelines
|
||||
|
||||
[[open-in-colab]]
|
||||
|
||||
Reproducibility is important for testing, replicating results, and can even be used to [improve image quality](reusing_seeds). However, the randomness in diffusion models is a desired property because it allows the pipeline to generate different images every time it is run. While you can't expect to get the exact same results across platforms, you can expect results to be reproducible across releases and platforms within a certain tolerance range. Even then, tolerance varies depending on the diffusion pipeline and checkpoint.
|
||||
|
||||
This is why it's important to understand how to control sources of randomness in diffusion models or use deterministic algorithms.
|
||||
|
||||
<Tip>
|
||||
|
||||
💡 We strongly recommend reading PyTorch's [statement about reproducibility](https://pytorch.org/docs/stable/notes/randomness.html):
|
||||
|
||||
> Completely reproducible results are not guaranteed across PyTorch releases, individual commits, or different platforms. Furthermore, results may not be reproducible between CPU and GPU executions, even when using identical seeds.
|
||||
|
||||
</Tip>
|
||||
|
||||
## Control randomness
|
||||
|
||||
During inference, pipelines rely heavily on random sampling operations which include creating the
|
||||
Gaussian noise tensors to denoise and adding noise to the scheduling step.
|
||||
|
||||
Take a look at the tensor values in the [`DDIMPipeline`] after two inference steps:
|
||||
|
||||
```python
|
||||
from diffusers import DDIMPipeline
|
||||
import numpy as np
|
||||
|
||||
model_id = "google/ddpm-cifar10-32"
|
||||
|
||||
# load model and scheduler
|
||||
ddim = DDIMPipeline.from_pretrained(model_id, use_safetensors=True)
|
||||
|
||||
# run pipeline for just two steps and return numpy tensor
|
||||
image = ddim(num_inference_steps=2, output_type="np").images
|
||||
print(np.abs(image).sum())
|
||||
```
|
||||
|
||||
Running the code above prints one value, but if you run it again you get a different value. What is going on here?
|
||||
|
||||
Every time the pipeline is run, [`torch.randn`](https://pytorch.org/docs/stable/generated/torch.randn.html) uses a different random seed to create Gaussian noise which is denoised stepwise. This leads to a different result each time it is run, which is great for diffusion pipelines since it generates a different random image each time.
|
||||
|
||||
But if you need to reliably generate the same image, that'll depend on whether you're running the pipeline on a CPU or GPU.
|
||||
|
||||
### CPU
|
||||
|
||||
To generate reproducible results on a CPU, you'll need to use a PyTorch [`Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) and set a seed:
|
||||
|
||||
```python
|
||||
import torch
|
||||
from diffusers import DDIMPipeline
|
||||
import numpy as np
|
||||
|
||||
model_id = "google/ddpm-cifar10-32"
|
||||
|
||||
# load model and scheduler
|
||||
ddim = DDIMPipeline.from_pretrained(model_id, use_safetensors=True)
|
||||
|
||||
# create a generator for reproducibility
|
||||
generator = torch.Generator(device="cpu").manual_seed(0)
|
||||
|
||||
# run pipeline for just two steps and return numpy tensor
|
||||
image = ddim(num_inference_steps=2, output_type="np", generator=generator).images
|
||||
print(np.abs(image).sum())
|
||||
```
|
||||
|
||||
Now when you run the code above, it always prints a value of `1491.1711` no matter what because the `Generator` object with the seed is passed to all the random functions of the pipeline.
|
||||
|
||||
If you run this code example on your specific hardware and PyTorch version, you should get a similar, if not the same, result.
|
||||
|
||||
<Tip>
|
||||
|
||||
💡 It might be a bit unintuitive at first to pass `Generator` objects to the pipeline instead of
|
||||
just integer values representing the seed, but this is the recommended design when dealing with
|
||||
probabilistic models in PyTorch, as `Generator`s are *random states* that can be
|
||||
passed to multiple pipelines in a sequence.
|
||||
|
||||
</Tip>
|
||||
|
||||
### GPU
|
||||
|
||||
Writing a reproducible pipeline on a GPU is a bit trickier, and full reproducibility across different hardware is not guaranteed because matrix multiplication - which diffusion pipelines require a lot of - is less deterministic on a GPU than a CPU. For example, if you run the same code example above on a GPU:
|
||||
|
||||
```python
|
||||
import torch
|
||||
from diffusers import DDIMPipeline
|
||||
import numpy as np
|
||||
|
||||
model_id = "google/ddpm-cifar10-32"
|
||||
|
||||
# load model and scheduler
|
||||
ddim = DDIMPipeline.from_pretrained(model_id, use_safetensors=True)
|
||||
ddim.to("cuda")
|
||||
|
||||
# create a generator for reproducibility
|
||||
generator = torch.Generator(device="cuda").manual_seed(0)
|
||||
|
||||
# run pipeline for just two steps and return numpy tensor
|
||||
image = ddim(num_inference_steps=2, output_type="np", generator=generator).images
|
||||
print(np.abs(image).sum())
|
||||
```
|
||||
|
||||
The result is not the same even though you're using an identical seed because the GPU uses a different random number generator than the CPU.
|
||||
|
||||
To circumvent this problem, 🧨 Diffusers has a [`~diffusers.utils.torch_utils.randn_tensor`] function for creating random noise on the CPU, and then moving the tensor to a GPU if necessary. The `randn_tensor` function is used everywhere inside the pipeline, allowing the user to **always** pass a CPU `Generator` even if the pipeline is run on a GPU.
|
||||
|
||||
You'll see the results are much closer now!
|
||||
|
||||
```python
|
||||
import torch
|
||||
from diffusers import DDIMPipeline
|
||||
import numpy as np
|
||||
|
||||
model_id = "google/ddpm-cifar10-32"
|
||||
|
||||
# load model and scheduler
|
||||
ddim = DDIMPipeline.from_pretrained(model_id, use_safetensors=True)
|
||||
ddim.to("cuda")
|
||||
|
||||
# create a generator for reproducibility; notice you don't place it on the GPU!
|
||||
generator = torch.manual_seed(0)
|
||||
|
||||
# run pipeline for just two steps and return numpy tensor
|
||||
image = ddim(num_inference_steps=2, output_type="np", generator=generator).images
|
||||
print(np.abs(image).sum())
|
||||
```
|
||||
|
||||
<Tip>
|
||||
|
||||
💡 If reproducibility is important, we recommend always passing a CPU generator. The performance loss is often negligible, and you'll generate much more similar values than if the pipeline had been run on a GPU.
|
||||
|
||||
</Tip>
|
||||
|
||||
Finally, for more complex pipelines such as [`UnCLIPPipeline`], these are often extremely
|
||||
susceptible to precision error propagation. Don't expect similar results across
|
||||
different GPU hardware or PyTorch versions. In this case, you'll need to run
|
||||
exactly the same hardware and PyTorch version for full reproducibility.
|
||||
|
||||
## Deterministic algorithms
|
||||
|
||||
You can also configure PyTorch to use deterministic algorithms to create a reproducible pipeline. However, you should be aware that deterministic algorithms may be slower than nondeterministic ones and you may observe a decrease in performance. But if reproducibility is important to you, then this is the way to go!
|
||||
|
||||
Nondeterministic behavior occurs when operations are launched in more than one CUDA stream. To avoid this, set the environment variable [`CUBLAS_WORKSPACE_CONFIG`](https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility) to `:16:8` to only use one buffer size during runtime.
|
||||
|
||||
PyTorch typically benchmarks multiple algorithms to select the fastest one, but if you want reproducibility, you should disable this feature because the benchmark may select different algorithms each time. Lastly, pass `True` to [`torch.use_deterministic_algorithms`](https://pytorch.org/docs/stable/generated/torch.use_deterministic_algorithms.html) to enable deterministic algorithms.
|
||||
|
||||
```py
|
||||
import os
|
||||
import torch
|
||||
|
||||
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":16:8"
|
||||
|
||||
torch.backends.cudnn.benchmark = False
|
||||
torch.use_deterministic_algorithms(True)
|
||||
```
|
||||
|
||||
Now when you run the same pipeline twice, you'll get identical results.
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import DDIMScheduler, StableDiffusionPipeline
|
||||
|
||||
model_id = "runwayml/stable-diffusion-v1-5"
|
||||
pipe = StableDiffusionPipeline.from_pretrained(model_id, use_safetensors=True).to("cuda")
|
||||
pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
|
||||
g = torch.Generator(device="cuda")
|
||||
|
||||
prompt = "A bear is playing a guitar on Times Square"
|
||||
|
||||
g.manual_seed(0)
|
||||
result1 = pipe(prompt=prompt, num_inference_steps=50, generator=g, output_type="latent").images
|
||||
|
||||
g.manual_seed(0)
|
||||
result2 = pipe(prompt=prompt, num_inference_steps=50, generator=g, output_type="latent").images
|
||||
|
||||
print("L_inf dist =", abs(result1 - result2).max())
|
||||
"L_inf dist = tensor(0., device='cuda:0')"
|
||||
```
|
||||
@@ -10,72 +10,179 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Reproducible pipelines

[[open-in-colab]]

Diffusion models are inherently random which is what allows them to generate different outputs every time they are run. But there are certain times when you want to generate the same output every time, like when you're testing, replicating results, and even [improving image quality](#deterministic-batch-generation). While you can't expect to get identical results across platforms, you can expect reproducible results across releases and platforms within a certain tolerance range (though even this may vary).

This guide will show you how to control randomness for deterministic generation on a CPU and GPU.

> [!TIP]
> We strongly recommend reading PyTorch's [statement about reproducibility](https://pytorch.org/docs/stable/notes/randomness.html):
>
> "Completely reproducible results are not guaranteed across PyTorch releases, individual commits, or different platforms. Furthermore, results may not be reproducible between CPU and GPU executions, even when using identical seeds."

## Control randomness
|
||||
|
||||
During inference, pipelines rely heavily on random sampling operations which include creating the
Gaussian noise tensors to denoise and adding noise to the scheduling step.
|
||||
|
||||
Take a look at the tensor values in the [`DDIMPipeline`] after two inference steps.
|
||||
|
||||
```python
|
||||
from diffusers import DDIMPipeline
|
||||
import numpy as np
|
||||
|
||||
ddim = DDIMPipeline.from_pretrained( "google/ddpm-cifar10-32", use_safetensors=True)
|
||||
image = ddim(num_inference_steps=2, output_type="np").images
|
||||
print(np.abs(image).sum())
|
||||
```
|
||||
|
||||
Running the code above prints one value, but if you run it again you get a different value.
|
||||
|
||||
Each time the pipeline is run, [torch.randn](https://pytorch.org/docs/stable/generated/torch.randn.html) uses a different random seed to create the Gaussian noise tensors. This leads to a different result each time it is run and enables the diffusion pipeline to generate a different random image each time.
|
||||
|
||||
But if you need to reliably generate the same image, that depends on whether you're running the pipeline on a CPU or GPU.
|
||||
|
||||
> [!TIP]
|
||||
> It might seem unintuitive to pass `Generator` objects to a pipeline instead of the integer value representing the seed. However, this is the recommended design when working with probabilistic models in PyTorch because a `Generator` is a *random state* that can be passed to multiple pipelines in a sequence. As soon as the `Generator` is consumed, the *state* is changed in place which means even if you passed the same `Generator` to a different pipeline, it won't produce the same result because the state is already changed.
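
A small illustration of that in-place state change with plain `torch.randn` (this snippet is not from the original guide):

```py
import torch

generator = torch.Generator(device="cpu").manual_seed(0)
print(torch.randn(1, generator=generator))  # first draw
print(torch.randn(1, generator=generator))  # different value, because the generator state advanced in place
```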
|
||||
|
||||
<hfoptions id="hardware">
|
||||
<hfoption id="CPU">
|
||||
|
||||
To generate reproducible results on a CPU, you'll need to use a PyTorch [Generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) and set a seed. Now when you run the code, it always prints a value of `1491.1711` because the `Generator` object with the seed is passed to all the random functions in the pipeline. You should get a similar, if not the same, result on whatever hardware and PyTorch version you're using.
|
||||
|
||||
```python
|
||||
import torch
|
||||
import numpy as np
|
||||
from diffusers import DDIMPipeline
|
||||
|
||||
ddim = DDIMPipeline.from_pretrained("google/ddpm-cifar10-32", use_safetensors=True)
|
||||
generator = torch.Generator(device="cpu").manual_seed(0)
|
||||
image = ddim(num_inference_steps=2, output_type="np", generator=generator).images
|
||||
print(np.abs(image).sum())
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="GPU">
|
||||
|
||||
Writing a reproducible pipeline on a GPU is a bit trickier, and full reproducibility across different hardware is not guaranteed because matrix multiplication - which diffusion pipelines require a lot of - is less deterministic on a GPU than a CPU. For example, if you run the same code example from the CPU example, you'll get a different result even though the seed is identical. This is because the GPU uses a different random number generator than the CPU.
|
||||
|
||||
```python
|
||||
import torch
|
||||
import numpy as np
|
||||
from diffusers import DDIMPipeline
|
||||
|
||||
ddim = DDIMPipeline.from_pretrained("google/ddpm-cifar10-32", use_safetensors=True)
|
||||
ddim.to("cuda")
|
||||
generator = torch.Generator(device="cuda").manual_seed(0)
|
||||
image = ddim(num_inference_steps=2, output_type="np", generator=generator).images
|
||||
print(np.abs(image).sum())
|
||||
```
|
||||
|
||||
To avoid this issue, Diffusers has a [`~utils.torch_utils.randn_tensor`] function for creating random noise on the CPU, and then moving the tensor to a GPU if necessary. The [`~utils.torch_utils.randn_tensor`] function is used everywhere inside the pipeline. Now you can call [torch.manual_seed](https://pytorch.org/docs/stable/generated/torch.manual_seed.html) which automatically creates a CPU `Generator` that can be passed to the pipeline even if it is being run on a GPU.
|
||||
|
||||
```python
|
||||
import torch
|
||||
import numpy as np
|
||||
from diffusers import DDIMPipeline
|
||||
|
||||
ddim = DDIMPipeline.from_pretrained("google/ddpm-cifar10-32", use_safetensors=True)
|
||||
ddim.to("cuda")
|
||||
generator = torch.manual_seed(0)
|
||||
image = ddim(num_inference_steps=2, output_type="np", generator=generator).images
|
||||
print(np.abs(image).sum())
|
||||
```
|
||||
|
||||
> [!TIP]
|
||||
> If reproducibility is important to your use case, we recommend always passing a CPU `Generator`. The performance loss is often negligible and you'll generate more similar values than if the pipeline had been run on a GPU.
|
||||
|
||||
Finally, more complex pipelines such as [`UnCLIPPipeline`] are often extremely
susceptible to precision error propagation. You'll need to use
exactly the same hardware and PyTorch version for full reproducibility.
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
## Deterministic algorithms
|
||||
|
||||
You can also configure PyTorch to use deterministic algorithms to create a reproducible pipeline. The downside is that deterministic algorithms may be slower than non-deterministic ones and you may observe a decrease in performance.
|
||||
|
||||
Non-deterministic behavior occurs when operations are launched in more than one CUDA stream. To avoid this, set the environment variable [CUBLAS_WORKSPACE_CONFIG](https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility) to `:16:8` to only use one buffer size during runtime.
|
||||
|
||||
PyTorch typically benchmarks multiple algorithms to select the fastest one, but if you want reproducibility, you should disable this feature because the benchmark may select different algorithms each time. Call Diffusers' [enable_full_determinism](https://github.com/huggingface/diffusers/blob/142f353e1c638ff1d20bd798402b68f72c1ebbdd/src/diffusers/utils/testing_utils.py#L861) helper to enable deterministic algorithms.
|
||||
|
||||
```py
|
||||
enable_full_determinism()
|
||||
```
|
||||
|
||||
Now when you run the same pipeline twice, you'll get identical results.
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import DDIMScheduler, StableDiffusionPipeline
|
||||
|
||||
pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", use_safetensors=True).to("cuda")
|
||||
pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
|
||||
g = torch.Generator(device="cuda")
|
||||
|
||||
prompt = "A bear is playing a guitar on Times Square"
|
||||
|
||||
g.manual_seed(0)
|
||||
result1 = pipe(prompt=prompt, num_inference_steps=50, generator=g, output_type="latent").images
|
||||
|
||||
g.manual_seed(0)
|
||||
result2 = pipe(prompt=prompt, num_inference_steps=50, generator=g, output_type="latent").images
|
||||
|
||||
print("L_inf dist =", abs(result1 - result2).max())
|
||||
"L_inf dist = tensor(0., device='cuda:0')"
|
||||
```
|
||||
|
||||
## Deterministic batch generation
|
||||
|
||||
A practical application of creating reproducible pipelines is *deterministic batch generation*. You generate a batch of images and select one image to improve with a more detailed prompt. The main idea is to pass a list of [Generator's](https://pytorch.org/docs/stable/generated/torch.Generator.html) to the pipeline and tie each `Generator` to a seed so you can reuse it.
|
||||
|
||||
Let's use the [runwayml/stable-diffusion-v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5) checkpoint and generate a batch of images.
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import DiffusionPipeline
|
||||
from diffusers.utils import make_image_grid
|
||||
|
||||
pipeline = DiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True
)
pipeline = pipeline.to("cuda")
|
||||
```
|
||||
|
||||
Define four different `Generator`s and assign each `Generator` a seed (`0` to `3`). Then generate a batch of images and pick one to iterate on.
|
||||
|
||||
> [!WARNING]
|
||||
> Use a list comprehension that iterates over the batch size specified in `range()` to create a unique `Generator` object for each image in the batch. If you multiply the `Generator` by the batch size integer, it only creates *one* `Generator` object that is used sequentially for each image in the batch.
|
||||
>
|
||||
> ```py
|
||||
> [torch.Generator().manual_seed(seed)] * 4
|
||||
> ```
|
||||
|
||||
```python
|
||||
generator = [torch.Generator(device="cuda").manual_seed(i) for i in range(4)]
|
||||
```
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
To create a batched seed, you should use a list comprehension that iterates over the length specified in `range()`. This creates a unique `Generator` object for each image in the batch. If you only multiply the `Generator` by the batch size, this only creates one `Generator` object that is used sequentially for each image in the batch.
|
||||
|
||||
For example, if you want to use the same seed to create 4 identical images:
|
||||
|
||||
```py
|
||||
❌ [torch.Generator().manual_seed(seed)] * 4
|
||||
|
||||
✅ [torch.Generator().manual_seed(seed) for _ in range(4)]
|
||||
```
|
||||
|
||||
</Tip>
|
||||
|
||||
Generate the images and have a look:

```python
prompt = "Labrador in the style of Vermeer"
images = pipeline(prompt, generator=generator, num_images_per_prompt=4).images
make_image_grid(images, rows=2, cols=2)
```
|
||||
|
||||

|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/diffusers/diffusers-images-docs/resolve/main/reusabe_seeds.jpg"/>
|
||||
</div>
|
||||
|
||||
Let's improve the first image (you can choose any image you want) which corresponds to the `Generator` with seed `0`. Add some additional text to your prompt and then make sure you reuse the same `Generator` with seed `0`. All the generated images should resemble the first image.
|
||||
|
||||
```python
|
||||
prompt = [prompt + t for t in [", highly realistic", ", artsy", ", trending", ", colorful"]]
|
||||
generator = [torch.Generator(device="cuda").manual_seed(0) for i in range(4)]
|
||||
```
|
||||
|
||||
Create four generators with seed `0`, and generate another batch of images, all of which should look like the first image from the previous round!
|
||||
|
||||
```python
|
||||
images = pipeline(prompt, generator=generator).images
|
||||
make_image_grid(images, rows=2, cols=2)
|
||||
```
|
||||
|
||||

|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/diffusers/diffusers-images-docs/resolve/main/reusabe_seeds_2.jpg"/>
|
||||
</div>
|
||||
|
||||
@@ -10,57 +10,27 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Load schedulers and models
|
||||
|
||||
[[open-in-colab]]
|
||||
|
||||
Diffusion pipelines are a collection of interchangeable schedulers and models that can be mixed and matched to tailor a pipeline to a specific use case. The scheduler encapsulates the entire denoising process such as the number of denoising steps and the algorithm for finding the denoised sample. A scheduler is not parameterized or trained so it doesn't take very much memory. The model is usually only concerned with the forward pass of going from a noisy input to a less noisy sample.

This guide will show you how to load schedulers and models to customize a pipeline. You'll use the [runwayml/stable-diffusion-v1-5](https://hf.co/runwayml/stable-diffusion-v1-5) checkpoint throughout this guide, so let's load it first.
|
||||
|
||||
```py
import torch
from diffusers import DiffusionPipeline

pipeline = DiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True
).to("cuda")
```

You can see what scheduler this pipeline uses with the `pipeline.scheduler` attribute.

```py
pipeline.scheduler
```
|
||||
|
||||
**Output**:
|
||||
```
|
||||
PNDMScheduler {
|
||||
"_class_name": "PNDMScheduler",
|
||||
"_diffusers_version": "0.21.4",
|
||||
@@ -77,235 +47,156 @@ PNDMScheduler {
|
||||
}
|
||||
```
|
||||
|
||||
We can see that the scheduler is of type [`PNDMScheduler`].
|
||||
Cool, now let's compare the scheduler in its performance to other schedulers.
|
||||
First we define a prompt on which we will test all the different schedulers:
|
||||
## Load a scheduler
|
||||
|
||||
```python
|
||||
prompt = "A photograph of an astronaut riding a horse on Mars, high resolution, high definition."
|
||||
```
|
||||
Schedulers are defined by a configuration file that can be used by a variety of schedulers. Load a scheduler with the [`SchedulerMixin.from_pretrained`] method, and specify the `subfolder` parameter to load the configuration file into the correct subfolder of the pipeline repository.
|
||||
|
||||
Next, we create a generator from a random seed that will ensure that we can generate similar images as well as run the pipeline:
|
||||
For example, to load the [`DDIMScheduler`]:
|
||||
|
||||
```python
|
||||
generator = torch.Generator(device="cuda").manual_seed(8)
|
||||
image = pipeline(prompt, generator=generator).images[0]
|
||||
image
|
||||
```
|
||||
|
||||
<p align="center">
|
||||
<br>
|
||||
<img src="https://huggingface.co/datasets/patrickvonplaten/images/resolve/main/diffusers_docs/astronaut_pndm.png" width="400"/>
|
||||
<br>
|
||||
</p>
|
||||
|
||||
|
||||
## Changing the scheduler
|
||||
|
||||
Now we show how easy it is to change the scheduler of a pipeline. Every scheduler has a property [`~SchedulerMixin.compatibles`]
|
||||
which defines all compatible schedulers. You can take a look at all available, compatible schedulers for the Stable Diffusion pipeline as follows.
|
||||
|
||||
```python
|
||||
pipeline.scheduler.compatibles
|
||||
```
|
||||
|
||||
**Output**:
|
||||
```
|
||||
[diffusers.utils.dummy_torch_and_torchsde_objects.DPMSolverSDEScheduler,
|
||||
diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler,
|
||||
diffusers.schedulers.scheduling_lms_discrete.LMSDiscreteScheduler,
|
||||
diffusers.schedulers.scheduling_ddim.DDIMScheduler,
|
||||
diffusers.schedulers.scheduling_ddpm.DDPMScheduler,
|
||||
diffusers.schedulers.scheduling_heun_discrete.HeunDiscreteScheduler,
|
||||
diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler,
|
||||
diffusers.schedulers.scheduling_deis_multistep.DEISMultistepScheduler,
|
||||
diffusers.schedulers.scheduling_pndm.PNDMScheduler,
|
||||
diffusers.schedulers.scheduling_euler_ancestral_discrete.EulerAncestralDiscreteScheduler,
|
||||
diffusers.schedulers.scheduling_unipc_multistep.UniPCMultistepScheduler,
|
||||
diffusers.schedulers.scheduling_k_dpm_2_discrete.KDPM2DiscreteScheduler,
|
||||
diffusers.schedulers.scheduling_dpmsolver_singlestep.DPMSolverSinglestepScheduler,
|
||||
diffusers.schedulers.scheduling_k_dpm_2_ancestral_discrete.KDPM2AncestralDiscreteScheduler]
|
||||
```
|
||||
|
||||
Cool, lots of schedulers to look at. Feel free to have a look at their respective class definitions:
|
||||
|
||||
- [`EulerDiscreteScheduler`],
|
||||
- [`LMSDiscreteScheduler`],
|
||||
- [`DDIMScheduler`],
|
||||
- [`DDPMScheduler`],
|
||||
- [`HeunDiscreteScheduler`],
|
||||
- [`DPMSolverMultistepScheduler`],
|
||||
- [`DEISMultistepScheduler`],
|
||||
- [`PNDMScheduler`],
|
||||
- [`EulerAncestralDiscreteScheduler`],
|
||||
- [`UniPCMultistepScheduler`],
|
||||
- [`KDPM2DiscreteScheduler`],
|
||||
- [`DPMSolverSinglestepScheduler`],
|
||||
- [`KDPM2AncestralDiscreteScheduler`].
|
||||
|
||||
We will now compare the input prompt with all other schedulers. To change the scheduler of the pipeline you can make use of the
|
||||
convenient [`~ConfigMixin.config`] property in combination with the [`~ConfigMixin.from_config`] function.
|
||||
|
||||
```python
|
||||
pipeline.scheduler.config
|
||||
```
|
||||
|
||||
returns a dictionary of the configuration of the scheduler:
|
||||
|
||||
**Output**:
|
||||
```py
|
||||
FrozenDict([('num_train_timesteps', 1000),
|
||||
('beta_start', 0.00085),
|
||||
('beta_end', 0.012),
|
||||
('beta_schedule', 'scaled_linear'),
|
||||
('trained_betas', None),
|
||||
('skip_prk_steps', True),
|
||||
('set_alpha_to_one', False),
|
||||
('prediction_type', 'epsilon'),
|
||||
('timestep_spacing', 'leading'),
|
||||
('steps_offset', 1),
|
||||
('_use_default_values', ['timestep_spacing', 'prediction_type']),
|
||||
('_class_name', 'PNDMScheduler'),
|
||||
('_diffusers_version', '0.21.4'),
|
||||
('clip_sample', False)])
|
||||
from diffusers import DDIMScheduler, DiffusionPipeline
|
||||
|
||||
ddim = DDIMScheduler.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="scheduler")
|
||||
```
|
||||
|
||||
This configuration can then be used to instantiate a scheduler
|
||||
of a different class that is compatible with the pipeline. Here,
|
||||
we change the scheduler to the [`DDIMScheduler`].
|
||||
Then you can pass the newly loaded scheduler to the pipeline.
|
||||
|
||||
```python
|
||||
from diffusers import DDIMScheduler
|
||||
|
||||
pipeline.scheduler = DDIMScheduler.from_config(pipeline.scheduler.config)
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
"runwayml/stable-diffusion-v1-5", scheduler=ddim, torch_dtype=torch.float16, use_safetensors=True
|
||||
).to("cuda")
|
||||
```
|
||||
|
||||
Cool, now we can run the pipeline again to compare the generation quality.
|
||||
|
||||
```python
|
||||
generator = torch.Generator(device="cuda").manual_seed(8)
|
||||
image = pipeline(prompt, generator=generator).images[0]
|
||||
image
|
||||
```
|
||||
|
||||
<p align="center">
|
||||
<br>
|
||||
<img src="https://huggingface.co/datasets/patrickvonplaten/images/resolve/main/diffusers_docs/astronaut_ddim.png" width="400"/>
|
||||
<br>
|
||||
</p>
|
||||
|
||||
If you are a JAX/Flax user, please check [this section](#flax-schedulers) instead.
|
||||
|
||||
## Compare schedulers
|
||||
|
||||
So far we have tried running the Stable Diffusion pipeline with two schedulers: [`PNDMScheduler`] and [`DDIMScheduler`]. Schedulers have their own unique strengths and weaknesses, making it difficult to quantitatively compare which scheduler works best for a pipeline. You typically have to make a trade-off between denoising speed and denoising quality. We recommend trying out different schedulers to find one that works best for your use case. Call the `pipeline.scheduler.compatibles` attribute to see what schedulers are compatible with a pipeline.

Let's compare the [`LMSDiscreteScheduler`], [`EulerDiscreteScheduler`], [`EulerAncestralDiscreteScheduler`], and the [`DPMSolverMultistepScheduler`] on the following prompt and seed.
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import DiffusionPipeline
|
||||
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
"runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True
|
||||
).to("cuda")
|
||||
|
||||
prompt = "A photograph of an astronaut riding a horse on Mars, high resolution, high definition."
|
||||
generator = torch.Generator(device="cuda").manual_seed(8)
|
||||
```
|
||||
|
||||
To change the pipeline's scheduler, use the [`~ConfigMixin.from_config`] method to load a different scheduler's `pipeline.scheduler.config` into the pipeline.
|
||||
|
||||
<hfoptions id="schedulers">
|
||||
<hfoption id="LMSDiscreteScheduler">
|
||||
|
||||
[`LMSDiscreteScheduler`] typically generates higher quality images than the default scheduler.
|
||||
|
||||
```py
|
||||
from diffusers import LMSDiscreteScheduler
|
||||
|
||||
pipeline.scheduler = LMSDiscreteScheduler.from_config(pipeline.scheduler.config)
|
||||
|
||||
generator = torch.Generator(device="cuda").manual_seed(8)
|
||||
image = pipeline(prompt, generator=generator).images[0]
|
||||
image
|
||||
```
|
||||
|
||||
<p align="center">
|
||||
<br>
|
||||
<img src="https://huggingface.co/datasets/patrickvonplaten/images/resolve/main/diffusers_docs/astronaut_lms.png" width="400"/>
|
||||
<br>
|
||||
</p>
|
||||
</hfoption>
|
||||
<hfoption id="EulerDiscreteScheduler">
|
||||
|
||||
[`EulerDiscreteScheduler`] can generate higher quality images in just 30 steps.
|
||||
|
||||
```py
|
||||
from diffusers import EulerDiscreteScheduler
|
||||
|
||||
pipeline.scheduler = EulerDiscreteScheduler.from_config(pipeline.scheduler.config)
|
||||
|
||||
generator = torch.Generator(device="cuda").manual_seed(8)
|
||||
image = pipeline(prompt, generator=generator, num_inference_steps=30).images[0]
|
||||
image
|
||||
```
|
||||
|
||||
<p align="center">
|
||||
<br>
|
||||
<img src="https://huggingface.co/datasets/patrickvonplaten/images/resolve/main/diffusers_docs/astronaut_euler_discrete.png" width="400"/>
|
||||
<br>
|
||||
</p>
|
||||
</hfoption>
|
||||
<hfoption id="EulerAncestralDiscreteScheduler">
|
||||
|
||||
[`EulerAncestralDiscreteScheduler`] can generate higher quality images in just 30 steps.
|
||||
|
||||
```py
|
||||
from diffusers import EulerAncestralDiscreteScheduler
|
||||
|
||||
pipeline.scheduler = EulerAncestralDiscreteScheduler.from_config(pipeline.scheduler.config)
|
||||
|
||||
generator = torch.Generator(device="cuda").manual_seed(8)
|
||||
image = pipeline(prompt, generator=generator, num_inference_steps=30).images[0]
|
||||
image
|
||||
```
|
||||
|
||||
<p align="center">
|
||||
<br>
|
||||
<img src="https://huggingface.co/datasets/patrickvonplaten/images/resolve/main/diffusers_docs/astronaut_euler_ancestral.png" width="400"/>
|
||||
<br>
|
||||
</p>
|
||||
</hfoption>
|
||||
<hfoption id="DPMSolverMultistepScheduler">
|
||||
|
||||
[`DPMSolverMultistepScheduler`] provides a balance between speed and quality and can generate higher quality images in just 20 steps.
|
||||
|
||||
```py
|
||||
from diffusers import DPMSolverMultistepScheduler
|
||||
|
||||
pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config)
|
||||
|
||||
generator = torch.Generator(device="cuda").manual_seed(8)
|
||||
image = pipeline(prompt, generator=generator, num_inference_steps=20).images[0]
|
||||
image
|
||||
```
|
||||
|
||||
<p align="center">
|
||||
<br>
|
||||
<img src="https://huggingface.co/datasets/patrickvonplaten/images/resolve/main/diffusers_docs/astronaut_dpm.png" width="400"/>
|
||||
<br>
|
||||
</p>
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
|
||||
<div class="flex gap-4">
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/patrickvonplaten/images/resolve/main/diffusers_docs/astronaut_lms.png" />
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">LMSDiscreteScheduler</figcaption>
|
||||
</div>
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/patrickvonplaten/images/resolve/main/diffusers_docs/astronaut_euler_discrete.png" />
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">EulerDiscreteScheduler</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
<div class="flex gap-4">
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/patrickvonplaten/images/resolve/main/diffusers_docs/astronaut_euler_ancestral.png" />
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">EulerAncestralDiscreteScheduler</figcaption>
|
||||
</div>
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/patrickvonplaten/images/resolve/main/diffusers_docs/astronaut_dpm.png" />
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">DPMSolverMultistepScheduler</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
Most images look very similar and are comparable in quality. Again, it often comes down to your specific use case so a good approach is to run multiple different schedulers and compare the results.

### Flax schedulers

To compare Flax schedulers, you need to additionally load the scheduler state into the model parameters. For example, let's change the default scheduler in [`FlaxStableDiffusionPipeline`] to use the super fast [`FlaxDPMSolverMultistepScheduler`].
|
||||
|
||||
> [!WARNING]
|
||||
> The [`FlaxLMSDiscreteScheduler`] and [`FlaxDDPMScheduler`] are not compatible with the [`FlaxStableDiffusionPipeline`] yet.
|
||||
|
||||
```py
|
||||
import jax
|
||||
import numpy as np
|
||||
from flax.jax_utils import replicate
|
||||
from flax.training.common_utils import shard
|
||||
|
||||
from diffusers import FlaxStableDiffusionPipeline, FlaxDPMSolverMultistepScheduler
|
||||
|
||||
model_id = "runwayml/stable-diffusion-v1-5"
|
||||
scheduler, scheduler_state = FlaxDPMSolverMultistepScheduler.from_pretrained(
|
||||
model_id,
|
||||
subfolder="scheduler"
|
||||
)
|
||||
pipeline, params = FlaxStableDiffusionPipeline.from_pretrained(
|
||||
model_id,
|
||||
scheduler=scheduler,
|
||||
revision="bf16",
|
||||
dtype=jax.numpy.bfloat16,
|
||||
)
|
||||
params["scheduler"] = scheduler_state
|
||||
```
|
||||
|
||||
Then you can take advantage of Flax's compatibility with TPUs to generate a number of images in parallel. You'll need to make a copy of the model parameters for each available device and then split the inputs across them to generate your desired number of images.
|
||||
|
||||
```py
|
||||
# Generate 1 image per parallel device (8 on TPUv2-8 or TPUv3-8)
|
||||
prompt = "a photo of an astronaut riding a horse on mars"
|
||||
prompt = "A photograph of an astronaut riding a horse on Mars, high resolution, high definition."
|
||||
num_samples = jax.device_count()
|
||||
prompt_ids = pipeline.prepare_inputs([prompt] * num_samples)
|
||||
|
||||
prng_seed = jax.random.PRNGKey(0)
num_inference_steps = 25

# replicate the parameters across devices, then shard the inputs and the RNG
params = replicate(params)
prng_seed = jax.random.split(prng_seed, jax.device_count())
prompt_ids = shard(prompt_ids)

images = pipeline(prompt_ids, params, prng_seed, num_inference_steps, jit=True).images
|
||||
images = pipeline.numpy_to_pil(np.asarray(images.reshape((num_samples,) + images.shape[-3:])))
|
||||
```
|
||||
|
||||
## Models

Models are loaded with the [`ModelMixin.from_pretrained`] method, which downloads and caches the latest version of the model weights and configurations. If the latest files are available in the local cache, [`~ModelMixin.from_pretrained`] reuses files in the cache instead of re-downloading them.

Models can be loaded from a subfolder with the `subfolder` argument. For example, the model weights for [runwayml/stable-diffusion-v1-5](https://hf.co/runwayml/stable-diffusion-v1-5) are stored in the [unet](https://hf.co/runwayml/stable-diffusion-v1-5/tree/main/unet) subfolder.
|
||||
```python
|
||||
from diffusers import UNet2DConditionModel
|
||||
|
||||
unet = UNet2DConditionModel.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="unet", use_safetensors=True)
|
||||
```
|
||||
|
||||
They can also be directly loaded from a [repository](https://huggingface.co/google/ddpm-cifar10-32/tree/main).
|
||||
|
||||
```python
|
||||
from diffusers import UNet2DModel
|
||||
|
||||
unet = UNet2DModel.from_pretrained("google/ddpm-cifar10-32", use_safetensors=True)
|
||||
```
|
||||
|
||||
To load and save model variants, specify the `variant` argument in [`ModelMixin.from_pretrained`] and [`ModelMixin.save_pretrained`].
|
||||
|
||||
```python
|
||||
from diffusers import UNet2DConditionModel
|
||||
|
||||
unet = UNet2DConditionModel.from_pretrained(
|
||||
"runwayml/stable-diffusion-v1-5", subfolder="unet", variant="non_ema", use_safetensors=True
|
||||
)
|
||||
unet.save_pretrained("./local-unet", variant="non_ema")
|
||||
```
|
||||
|
||||
@@ -21,7 +21,7 @@ This guide will show you how to use SVD to generate short videos from images.
|
||||
Before you begin, make sure you have the following libraries installed:
|
||||
|
||||
```py
|
||||
!pip install -q -U diffusers transformers accelerate
|
||||
```
|
||||
|
||||
There are two variants of this model, [SVD](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid) and [SVD-XT](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid-xt). The SVD checkpoint is trained to generate 14 frames and the SVD-XT checkpoint is further finetuned to generate 25 frames.
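For example, a minimal sketch of loading the SVD-XT checkpoint with [`StableVideoDiffusionPipeline`] could look like the following (the input image URL is only a placeholder, and you can swap in the 14-frame checkpoint by changing the repository id):

```py
import torch
from diffusers import StableVideoDiffusionPipeline
from diffusers.utils import load_image, export_to_video

# SVD-XT generates 25 frames; use "stabilityai/stable-video-diffusion-img2vid" for the 14-frame variant
pipe = StableVideoDiffusionPipeline.from_pretrained(
    "stabilityai/stable-video-diffusion-img2vid-xt", torch_dtype=torch.float16, variant="fp16"
)
pipe.enable_model_cpu_offload()

# placeholder conditioning image
image = load_image("https://example.com/conditioning-image.png")

generator = torch.manual_seed(42)
frames = pipe(image, decode_chunk_size=8, generator=generator).frames[0]
export_to_video(frames, "generated.mp4", fps=7)
```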
|
||||
@@ -86,7 +86,7 @@ Video generation is very memory intensive because you're essentially generating
|
||||
+ frames = pipe(image, decode_chunk_size=2, generator=generator, num_frames=25).frames[0]
|
||||
```
|
||||
|
||||
Using all these tricks together should lower the memory requirement to less than 8GB VRAM.
|
||||
|
||||
## Micro-conditioning
|
||||
|
||||
|
||||
docs/source/en/using-diffusers/t2i_adapter.md (new file, 219 lines)
@@ -0,0 +1,219 @@
|
||||
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# T2I-Adapter
|
||||
|
||||
[T2I-Adapter](https://hf.co/papers/2302.08453) is a lightweight adapter for controlling and providing more accurate
|
||||
structure guidance for text-to-image models. It works by learning an alignment between the internal knowledge of the
|
||||
text-to-image model and an external control signal, such as edge detection or depth estimation.
|
||||
|
||||
The T2I-Adapter design is simple: the condition is passed to four feature extraction blocks and three downsample
|
||||
blocks. This makes it fast and easy to train different adapters for different conditions which can be plugged into the
|
||||
text-to-image model. T2I-Adapter is similar to [ControlNet](controlnet) except it is smaller (~77M parameters) and
|
||||
faster because it only runs once during the diffusion process. The downside is that performance may be slightly worse
|
||||
than ControlNet.
|
||||
|
||||
This guide will show you how to use T2I-Adapter with different Stable Diffusion models and how you can compose multiple
|
||||
T2I-Adapters to impose more than one condition.
|
||||
|
||||
> [!TIP]
|
||||
> There are several T2I-Adapters available for different conditions, such as color palette, depth, sketch, pose, and
|
||||
> segmentation. Check out the [TencentARC](https://hf.co/TencentARC) repository to try them out!
|
||||
|
||||
Before you begin, make sure you have the following libraries installed.
|
||||
|
||||
```py
|
||||
# uncomment to install the necessary libraries in Colab
|
||||
#!pip install -q diffusers accelerate controlnet-aux==0.0.7
|
||||
```
|
||||
|
||||
## Text-to-image
|
||||
|
||||
Text-to-image models rely on a prompt to generate an image, but sometimes, text alone may not be enough to provide more
|
||||
accurate structural guidance. T2I-Adapter allows you to provide an additional control image to guide the generation
|
||||
process. For example, you can provide a canny image (a white outline of an image on a black background) to guide the
|
||||
model to generate an image with a similar structure.
|
||||
|
||||
<hfoptions id="stablediffusion">
|
||||
<hfoption id="Stable Diffusion 1.5">
|
||||
|
||||
Create a canny image with the [opencv-library](https://github.com/opencv/opencv-python).
|
||||
|
||||
```py
|
||||
import cv2
|
||||
import numpy as np
|
||||
from PIL import Image
|
||||
from diffusers.utils import load_image
|
||||
|
||||
image = load_image("https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/hf-logo.png")
|
||||
image = np.array(image)
|
||||
|
||||
low_threshold = 100
|
||||
high_threshold = 200
|
||||
|
||||
image = cv2.Canny(image, low_threshold, high_threshold)
|
||||
image = Image.fromarray(image)
|
||||
```
|
||||
|
||||
Now load a T2I-Adapter conditioned on [canny images](https://hf.co/TencentARC/t2iadapter_canny_sd15v2) and pass it to
|
||||
the [`StableDiffusionAdapterPipeline`].
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import StableDiffusionAdapterPipeline, T2IAdapter
|
||||
|
||||
adapter = T2IAdapter.from_pretrained("TencentARC/t2iadapter_canny_sd15v2", torch_dtype=torch.float16)
|
||||
pipeline = StableDiffusionAdapterPipeline.from_pretrained(
|
||||
"runwayml/stable-diffusion-v1-5",
|
||||
adapter=adapter,
|
||||
torch_dtype=torch.float16,
|
||||
)
|
||||
pipeline.to("cuda")
|
||||
```
|
||||
|
||||
Finally, pass your prompt and control image to the pipeline.
|
||||
|
||||
```py
|
||||
generator = torch.Generator("cuda").manual_seed(0)
|
||||
|
||||
image = pipeline(
|
||||
prompt="cinematic photo of a plush and soft midcentury style rug on a wooden floor, 35mm photograph, film, professional, 4k, highly detailed",
|
||||
image=image,
|
||||
generator=generator,
|
||||
).images[0]
|
||||
image
|
||||
```
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/t2i-sd1.5.png"/>
|
||||
</div>
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="Stable Diffusion XL">
|
||||
|
||||
Create a canny image with the [controlnet-aux](https://github.com/huggingface/controlnet_aux) library.
|
||||
|
||||
```py
|
||||
from controlnet_aux.canny import CannyDetector
|
||||
from diffusers.utils import load_image
|
||||
|
||||
canny_detector = CannyDetector()
|
||||
|
||||
image = load_image("https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/hf-logo.png")
|
||||
image = canny_detector(image, detect_resolution=384, image_resolution=1024)
|
||||
```
|
||||
|
||||
Now load a T2I-Adapter conditioned on [canny images](https://hf.co/TencentARC/t2i-adapter-canny-sdxl-1.0) and pass it
|
||||
to the [`StableDiffusionXLAdapterPipeline`].
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import StableDiffusionXLAdapterPipeline, T2IAdapter, EulerAncestralDiscreteScheduler, AutoencoderKL
|
||||
|
||||
scheduler = EulerAncestralDiscreteScheduler.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", subfolder="scheduler")
|
||||
vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16)
|
||||
adapter = T2IAdapter.from_pretrained("TencentARC/t2i-adapter-canny-sdxl-1.0", torch_dtype=torch.float16)
|
||||
pipeline = StableDiffusionXLAdapterPipeline.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0",
|
||||
adapter=adapter,
|
||||
vae=vae,
|
||||
scheduler=scheduler,
|
||||
torch_dtype=torch.float16,
|
||||
variant="fp16",
|
||||
)
|
||||
pipeline.to("cuda")
|
||||
```
|
||||
|
||||
Finally, pass your prompt and control image to the pipeline.
|
||||
|
||||
```py
|
||||
generator = torch.Generator("cuda").manual_seed(0)
|
||||
|
||||
image = pipeline(
|
||||
prompt="cinematic photo of a plush and soft midcentury style rug on a wooden floor, 35mm photograph, film, professional, 4k, highly detailed",
|
||||
image=image,
|
||||
generator=generator,
|
||||
).images[0]
|
||||
image
|
||||
```
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/t2i-sdxl.png"/>
|
||||
</div>
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
## MultiAdapter
|
||||
|
||||
T2I-Adapters are also composable, allowing you to use more than one adapter to impose multiple control conditions on an
|
||||
image. For example, you can use a pose map to provide structural control and a depth map for depth control. This is
|
||||
enabled by the [`MultiAdapter`] class.
|
||||
|
||||
Let's condition a text-to-image model with a pose and depth adapter. Create your depth and pose images and place them in a list.
|
||||
|
||||
```py
|
||||
from diffusers.utils import load_image
|
||||
|
||||
pose_image = load_image(
|
||||
"https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/keypose_sample_input.png"
|
||||
)
|
||||
depth_image = load_image(
|
||||
"https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/depth_sample_input.png"
|
||||
)
|
||||
cond = [pose_image, depth_image]
|
||||
prompt = ["Santa Claus walking into an office room with a beautiful city view"]
|
||||
```
|
||||
|
||||
<div class="flex gap-4">
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/depth_sample_input.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">depth image</figcaption>
|
||||
</div>
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/keypose_sample_input.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">pose image</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
Load the corresponding pose and depth adapters as a list in the [`MultiAdapter`] class.
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import StableDiffusionAdapterPipeline, MultiAdapter, T2IAdapter
|
||||
|
||||
adapters = MultiAdapter(
|
||||
[
|
||||
T2IAdapter.from_pretrained("TencentARC/t2iadapter_keypose_sd14v1"),
|
||||
T2IAdapter.from_pretrained("TencentARC/t2iadapter_depth_sd14v1"),
|
||||
]
|
||||
)
|
||||
adapters = adapters.to(torch.float16)
|
||||
```
|
||||
|
||||
Finally, load a [`StableDiffusionAdapterPipeline`] with the adapters, and pass your prompt and conditioned images to it. Use the `adapter_conditioning_scale` argument to adjust the weight of each adapter on the image.
|
||||
|
||||
```py
|
||||
pipeline = StableDiffusionAdapterPipeline.from_pretrained(
|
||||
"CompVis/stable-diffusion-v1-4",
|
||||
torch_dtype=torch.float16,
|
||||
adapter=adapters,
|
||||
).to("cuda")
|
||||
|
||||
image = pipeline(prompt, cond, adapter_conditioning_scale=[0.7, 0.7]).images[0]
|
||||
image
|
||||
```
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/t2i-multi.png"/>
|
||||
</div>
|
||||
@@ -10,10 +10,209 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Prompt techniques
|
||||
|
||||
[[open-in-colab]]
|
||||
|
||||
Prompts are important because they describe what you want a diffusion model to generate. The best prompts are detailed, specific, and well-structured to help the model realize your vision. But crafting a great prompt takes time and effort and sometimes it may not be enough because language and words can be imprecise. This is where you need to boost your prompt with other techniques, such as prompt enhancing and prompt weighting, to get the results you want.
|
||||
|
||||
This guide will show you how you can use these prompt techniques to generate high-quality images with lower effort and adjust the weight of certain keywords in a prompt.
|
||||
|
||||
## Prompt engineering
|
||||
|
||||
> [!TIP]
|
||||
> This is not an exhaustive guide on prompt engineering, but it will help you understand the necessary parts of a good prompt. We encourage you to continue experimenting with different prompts and combine them in new ways to see what works best. As you write more prompts, you'll develop an intuition for what works and what doesn't!
|
||||
|
||||
New diffusion models do a pretty good job of generating high-quality images from a basic prompt, but it is still important to create a well-written prompt to get the best results. Here are a few tips for writing a good prompt:
|
||||
|
||||
1. What is the image *medium*? Is it a photo, a painting, a 3D illustration, or something else?
|
||||
2. What is the image *subject*? Is it a person, animal, object, or scene?
|
||||
3. What *details* would you like to see in the image? This is where you can get really creative and have a lot of fun experimenting with different words to bring your image to life. For example, what is the lighting like? What is the vibe and aesthetic? What kind of art or illustration style are you looking for? The more specific and precise words you use, the better the model will understand what you want to generate.
|
||||
|
||||
<div class="flex gap-4">
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/plain-prompt.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">"A photo of a banana-shaped couch in a living room"</figcaption>
|
||||
</div>
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/detail-prompt.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">"A vibrant yellow banana-shaped couch sits in a cozy living room, its curve cradling a pile of colorful cushions. on the wooden floor, a patterned rug adds a touch of eclectic charm, and a potted plant sits in the corner, reaching towards the sunlight filtering through the windows"</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
## Prompt enhancing with GPT2
|
||||
|
||||
Prompt enhancing is a technique for quickly improving prompt quality without spending too much effort constructing one. It uses a model like GPT2 pretrained on Stable Diffusion text prompts to automatically enrich a prompt with additional important keywords to generate high-quality images.
|
||||
|
||||
The technique works by curating a list of specific keywords and forcing the model to generate those words to enhance the original prompt. This way, your prompt can be "a cat" and GPT2 can enhance the prompt to "cinematic film still of a cat basking in the sun on a roof in Turkey, highly detailed, high budget hollywood movie, cinemascope, moody, epic, gorgeous, film grain quality sharp focus beautiful detailed intricate stunning amazing epic".
|
||||
|
||||
> [!TIP]
|
||||
> You should also use an [*offset noise*](https://www.crosslabs.org//blog/diffusion-with-offset-noise) LoRA to improve the contrast in bright and dark images and create better lighting overall. This [LoRA](https://hf.co/stabilityai/stable-diffusion-xl-base-1.0/blob/main/sd_xl_offset_example-lora_1.0.safetensors) is available from [stabilityai/stable-diffusion-xl-base-1.0](https://hf.co/stabilityai/stable-diffusion-xl-base-1.0).
|
||||
|
||||
Start by defining certain styles and a list of words (you can check out a more comprehensive list of [words](https://hf.co/LykosAI/GPT-Prompt-Expansion-Fooocus-v2/blob/main/positive.txt) and [styles](https://github.com/lllyasviel/Fooocus/tree/main/sdxl_styles) used by Fooocus) to enhance a prompt with.
|
||||
|
||||
```py
|
||||
import torch
|
||||
from transformers import GenerationConfig, GPT2LMHeadModel, GPT2Tokenizer, LogitsProcessor, LogitsProcessorList
|
||||
from diffusers import StableDiffusionXLPipeline
|
||||
|
||||
styles = {
|
||||
"cinematic": "cinematic film still of {prompt}, highly detailed, high budget hollywood movie, cinemascope, moody, epic, gorgeous, film grain",
|
||||
"anime": "anime artwork of {prompt}, anime style, key visual, vibrant, studio anime, highly detailed",
|
||||
"photographic": "cinematic photo of {prompt}, 35mm photograph, film, professional, 4k, highly detailed",
|
||||
"comic": "comic of {prompt}, graphic illustration, comic art, graphic novel art, vibrant, highly detailed",
|
||||
"lineart": "line art drawing {prompt}, professional, sleek, modern, minimalist, graphic, line art, vector graphics",
|
||||
"pixelart": " pixel-art {prompt}, low-res, blocky, pixel art style, 8-bit graphics",
|
||||
}
|
||||
|
||||
words = [
|
||||
"aesthetic", "astonishing", "beautiful", "breathtaking", "composition", "contrasted", "epic", "moody", "enhanced",
|
||||
"exceptional", "fascinating", "flawless", "glamorous", "glorious", "illumination", "impressive", "improved",
|
||||
"inspirational", "magnificent", "majestic", "hyperrealistic", "smooth", "sharp", "focus", "stunning", "detailed",
|
||||
"intricate", "dramatic", "high", "quality", "perfect", "light", "ultra", "highly", "radiant", "satisfying",
|
||||
"soothing", "sophisticated", "stylish", "sublime", "terrific", "touching", "timeless", "wonderful", "unbelievable",
|
||||
"elegant", "awesome", "amazing", "dynamic", "trendy",
|
||||
]
|
||||
```
|
||||
|
||||
You may have noticed in the `words` list, there are certain words that can be paired together to create something more meaningful. For example, the words "high" and "quality" can be combined to create "high quality". Let's pair these words together and remove the words that can't be paired.
|
||||
|
||||
```py
|
||||
word_pairs = ["highly detailed", "high quality", "enhanced quality", "perfect composition", "dynamic light"]
|
||||
|
||||
def find_and_order_pairs(s, pairs):
|
||||
words = s.split()
|
||||
found_pairs = []
|
||||
for pair in pairs:
|
||||
pair_words = pair.split()
|
||||
if pair_words[0] in words and pair_words[1] in words:
|
||||
found_pairs.append(pair)
|
||||
words.remove(pair_words[0])
|
||||
words.remove(pair_words[1])
|
||||
|
||||
for word in words[:]:
|
||||
for pair in pairs:
|
||||
if word in pair.split():
|
||||
words.remove(word)
|
||||
break
|
||||
ordered_pairs = ", ".join(found_pairs)
|
||||
remaining_s = ", ".join(words)
|
||||
return ordered_pairs, remaining_s
|
||||
```
|
||||
|
||||
Next, implement a custom [`~transformers.LogitsProcessor`] class that assigns tokens in the `words` list a value of 0 and assigns tokens not in the `words` list a negative value so they aren't picked during generation. This way, generation is biased towards words in the `words` list. After a word from the list is used, it is also assigned a negative value so it isn't picked again.
|
||||
|
||||
```py
|
||||
class CustomLogitsProcessor(LogitsProcessor):
|
||||
def __init__(self, bias):
|
||||
super().__init__()
|
||||
self.bias = bias
|
||||
|
||||
def __call__(self, input_ids, scores):
|
||||
if len(input_ids.shape) == 2:
|
||||
last_token_id = input_ids[0, -1]
|
||||
self.bias[last_token_id] = -1e10
|
||||
return scores + self.bias
|
||||
|
||||
word_ids = [tokenizer.encode(word, add_prefix_space=True)[0] for word in words]
|
||||
bias = torch.full((tokenizer.vocab_size,), -float("Inf")).to("cuda")
|
||||
bias[word_ids] = 0
|
||||
processor = CustomLogitsProcessor(bias)
|
||||
processor_list = LogitsProcessorList([processor])
|
||||
```
|
||||
|
||||
Combine the prompt and the `cinematic` style prompt defined in the `styles` dictionary earlier.
|
||||
|
||||
```py
|
||||
prompt = "a cat basking in the sun on a roof in Turkey"
|
||||
style = "cinematic"
|
||||
|
||||
prompt = styles[style].format(prompt=prompt)
|
||||
prompt
|
||||
"cinematic film still of a cat basking in the sun on a roof in Turkey, highly detailed, high budget hollywood movie, cinemascope, moody, epic, gorgeous, film grain"
|
||||
```
|
||||
|
||||
Load a GPT2 tokenizer and model from the [Gustavosta/MagicPrompt-Stable-Diffusion](https://huggingface.co/Gustavosta/MagicPrompt-Stable-Diffusion) checkpoint (this specific checkpoint is trained to generate prompts) to enhance the prompt.
|
||||
|
||||
```py
|
||||
tokenizer = GPT2Tokenizer.from_pretrained("Gustavosta/MagicPrompt-Stable-Diffusion")
|
||||
model = GPT2LMHeadModel.from_pretrained("Gustavosta/MagicPrompt-Stable-Diffusion", torch_dtype=torch.float16).to(
|
||||
"cuda"
|
||||
)
|
||||
model.eval()
|
||||
|
||||
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
|
||||
token_count = inputs["input_ids"].shape[1]
|
||||
max_new_tokens = 50 - token_count
|
||||
|
||||
generation_config = GenerationConfig(
|
||||
penalty_alpha=0.7,
|
||||
top_k=50,
|
||||
eos_token_id=model.config.eos_token_id,
|
||||
pad_token_id=model.config.eos_token_id,
|
||||
pad_token=model.config.pad_token_id,
|
||||
do_sample=True,
|
||||
)
|
||||
|
||||
with torch.no_grad():
|
||||
generated_ids = model.generate(
|
||||
input_ids=inputs["input_ids"],
|
||||
attention_mask=inputs["attention_mask"],
|
||||
max_new_tokens=max_new_tokens,
|
||||
generation_config=generation_config,
|
||||
logits_processor=processor_list,
|
||||
)
|
||||
```
|
||||
|
||||
Then you can combine the input prompt and the generated prompt. Feel free to take a look at what the generated prompt (`generated_part`) is, the word pairs that were found (`pairs`), and the remaining words (`words`). This is all packed together in the `enhanced_prompt`.
|
||||
|
||||
```py
|
||||
output_tokens = [tokenizer.decode(generated_id, skip_special_tokens=True) for generated_id in generated_ids]
|
||||
input_part, generated_part = output_tokens[0][: len(prompt)], output_tokens[0][len(prompt) :]
|
||||
pairs, words = find_and_order_pairs(generated_part, word_pairs)
|
||||
formatted_generated_part = pairs + ", " + words
|
||||
enhanced_prompt = input_part + ", " + formatted_generated_part
|
||||
enhanced_prompt
|
||||
["cinematic film still of a cat basking in the sun on a roof in Turkey, highly detailed, high budget hollywood movie, cinemascope, moody, epic, gorgeous, film grain quality sharp focus beautiful detailed intricate stunning amazing epic"]
|
||||
```
|
||||
|
||||
Finally, load a pipeline and the offset noise LoRA with a *low weight* to generate an image with the enhanced prompt.
|
||||
|
||||
```py
|
||||
pipeline = StableDiffusionXLPipeline.from_pretrained(
|
||||
"RunDiffusion/Juggernaut-XL-v9", torch_dtype=torch.float16, variant="fp16"
|
||||
).to("cuda")
|
||||
|
||||
pipeline.load_lora_weights(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0",
|
||||
weight_name="sd_xl_offset_example-lora_1.0.safetensors",
|
||||
adapter_name="offset",
|
||||
)
|
||||
pipeline.set_adapters(["offset"], adapter_weights=[0.2])
|
||||
|
||||
image = pipeline(
|
||||
enhanced_prompt,
|
||||
width=1152,
|
||||
height=896,
|
||||
guidance_scale=7.5,
|
||||
num_inference_steps=25,
|
||||
).images[0]
|
||||
image
|
||||
```
|
||||
|
||||
<div class="flex gap-4">
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/non-enhanced-prompt.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">"a cat basking in the sun on a roof in Turkey"</figcaption>
|
||||
</div>
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/enhanced-prompt.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">"cinematic film still of a cat basking in the sun on a roof in Turkey, highly detailed, high budget hollywood movie, cinemascope, moody, epic, gorgeous, film grain"</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
## Prompt weighting
|
||||
|
||||
Prompt weighting provides a way to emphasize or de-emphasize certain parts of a prompt, allowing for more control over the generated image. A prompt can include several concepts, which gets turned into contextualized text embeddings. The embeddings are used by the model to condition its cross-attention layers to generate an image (read the Stable Diffusion [blog post](https://huggingface.co/blog/stable_diffusion) to learn more about how it works).
|
||||
|
||||
Prompt weighting works by increasing or decreasing the scale of the text embedding vector that corresponds to its concept in the prompt because you may not necessarily want the model to focus on all concepts equally. The easiest way to prepare the prompt-weighted embeddings is to use [Compel](https://github.com/damian0815/compel), a text prompt-weighting and blending library. Once you have the prompt-weighted embeddings, you can pass them to any pipeline that has a [`prompt_embeds`](https://huggingface.co/docs/diffusers/en/api/pipelines/stable_diffusion/text2img#diffusers.StableDiffusionPipeline.__call__.prompt_embeds) (and optionally [`negative_prompt_embeds`](https://huggingface.co/docs/diffusers/en/api/pipelines/stable_diffusion/text2img#diffusers.StableDiffusionPipeline.__call__.negative_prompt_embeds)) parameter, such as [`StableDiffusionPipeline`], [`StableDiffusionControlNetPipeline`], and [`StableDiffusionXLPipeline`].
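As a rough sketch of that workflow (assuming the [Compel](https://github.com/damian0815/compel) library is installed with `pip install compel` and the Stable Diffusion v1-5 checkpoint used elsewhere in this guide), you build the embeddings once and pass them to the pipeline through `prompt_embeds`:

```py
import torch
from compel import Compel
from diffusers import StableDiffusionPipeline

pipeline = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True
).to("cuda")

# Compel wraps the pipeline's tokenizer and text encoder to build weighted embeddings
compel_proc = Compel(tokenizer=pipeline.tokenizer, text_encoder=pipeline.text_encoder)
prompt_embeds = compel_proc("a red cat playing with a ball")

generator = torch.Generator(device="cuda").manual_seed(33)
image = pipeline(prompt_embeds=prompt_embeds, generator=generator).images[0]
image
```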
|
||||
@@ -55,7 +254,7 @@ image
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/compel/forest_0.png"/>
|
||||
</div>
|
||||
|
||||
### Weighting
|
||||
|
||||
You'll notice there is no "ball" in the image! Let's use compel to upweight the concept of "ball" in the prompt. Create a [`Compel`](https://github.com/damian0815/compel/blob/main/doc/compel.md#compel-objects) object, and pass it a tokenizer and text encoder:
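A minimal sketch of the upweighted prompt (Compel's `+` suffix increases the weight of the preceding word; this reuses the `pipeline`, `compel_proc`, and `generator` from the sketch above):

```py
# "++" upweights "ball" more strongly than a single "+"
prompt_embeds = compel_proc("a red cat playing with a ball++")
image = pipeline(prompt_embeds=prompt_embeds, generator=generator).images[0]
image
```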
|
||||
|
||||
@@ -123,7 +322,7 @@ image
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/compel-pos-neg.png"/>
|
||||
</div>
|
||||
|
||||
### Blending
|
||||
|
||||
You can also create a weighted *blend* of prompts by adding `.blend()` to a list of prompts and passing it some weights. Your blend may not always produce the result you expect because it breaks some assumptions about how the text encoder functions, so just have fun and experiment with it!
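For example, a sketch of a blend (again assuming the Compel objects from the earlier sketches):

```py
prompt_embeds = compel_proc('("a red cat playing with a ball", "jungle").blend(0.7, 0.8)')
image = pipeline(prompt_embeds=prompt_embeds, generator=generator).images[0]
image
```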
|
||||
|
||||
@@ -139,7 +338,7 @@ image
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/compel-blend.png"/>
|
||||
</div>
|
||||
|
||||
### Conjunction
|
||||
|
||||
A conjunction diffuses each prompt independently and concatenates their results by their weighted sum. Add `.and()` to the end of a list of prompts to create a conjunction:
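A sketch of a conjunction (same assumptions as the earlier Compel sketches):

```py
prompt_embeds = compel_proc('["a red cat", "playing with a", "ball"].and()')
image = pipeline(prompt_embeds=prompt_embeds, generator=generator).images[0]
image
```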
|
||||
|
||||
@@ -155,7 +354,7 @@ image
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/compel-conj.png"/>
|
||||
</div>
|
||||
|
||||
### Textual inversion
|
||||
|
||||
[Textual inversion](../training/text_inversion) is a technique for learning a specific concept from some images which you can use to generate new images conditioned on that concept.
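A hedged sketch of how this plugs into prompt weighting (it assumes Compel's `DiffusersTextualInversionManager` and the [sd-concepts-library/midjourney-style](https://huggingface.co/sd-concepts-library/midjourney-style) embedding, which adds a `<midjourney-style>` token):

```py
from compel import Compel, DiffusersTextualInversionManager

# add the learned <midjourney-style> token to the pipeline's tokenizer and text encoder
pipeline.load_textual_inversion("sd-concepts-library/midjourney-style")

# Compel needs a textual inversion manager to resolve the new token correctly
textual_inversion_manager = DiffusersTextualInversionManager(pipeline)
compel_proc = Compel(
    tokenizer=pipeline.tokenizer,
    text_encoder=pipeline.text_encoder,
    textual_inversion_manager=textual_inversion_manager,
)

prompt_embeds = compel_proc('("A red cat++ playing with a ball <midjourney-style>")')
image = pipeline(prompt_embeds=prompt_embeds, generator=generator).images[0]
image
```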
|
||||
|
||||
@@ -195,7 +394,7 @@ image
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/compel-text-inversion.png"/>
|
||||
</div>
|
||||
|
||||
### DreamBooth
|
||||
|
||||
[DreamBooth](../training/dreambooth) is a technique for generating contextualized images of a subject given just a few images of the subject to train on. It is similar to textual inversion, but DreamBooth trains the full model whereas textual inversion only fine-tunes the text embeddings. This means you should use [`~DiffusionPipeline.from_pretrained`] to load the DreamBooth model (feel free to browse the [Stable Diffusion Dreambooth Concepts Library](https://huggingface.co/sd-dreambooth-library) for 100+ trained models):
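A short sketch of that flow (the [sd-dreambooth-library/dndcoverart-v1](https://huggingface.co/sd-dreambooth-library/dndcoverart-v1) repository id is just one example of a DreamBooth checkpoint):

```py
import torch
from diffusers import DiffusionPipeline, UniPCMultistepScheduler

pipeline = DiffusionPipeline.from_pretrained(
    "sd-dreambooth-library/dndcoverart-v1", torch_dtype=torch.float16
).to("cuda")
pipeline.scheduler = UniPCMultistepScheduler.from_config(pipeline.scheduler.config)
```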
|
||||
|
||||
@@ -221,7 +420,7 @@ image
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/compel-dreambooth.png"/>
|
||||
</div>
|
||||
|
||||
### Stable Diffusion XL
|
||||
|
||||
Stable Diffusion XL (SDXL) has two tokenizers and text encoders so its usage is a bit different. To address this, you should pass both tokenizers and encoders to the `Compel` class:
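A minimal sketch of what that looks like (assuming Compel's SDXL support via `ReturnedEmbeddingsType` and `requires_pooled`):

```py
import torch
from compel import Compel, ReturnedEmbeddingsType
from diffusers import StableDiffusionXLPipeline

pipeline = StableDiffusionXLPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", variant="fp16", use_safetensors=True, torch_dtype=torch.float16
).to("cuda")

compel = Compel(
    tokenizer=[pipeline.tokenizer, pipeline.tokenizer_2],
    text_encoder=[pipeline.text_encoder, pipeline.text_encoder_2],
    returned_embeddings_type=ReturnedEmbeddingsType.PENULTIMATE_HIDDEN_STATES_NON_NORMALIZED,
    requires_pooled=[False, True],
)

# SDXL needs both the prompt embeddings and the pooled embeddings
conditioning, pooled = compel("a red cat playing with a (ball)1.5")
image = pipeline(prompt_embeds=conditioning, pooled_prompt_embeds=pooled, num_inference_steps=30).images[0]
image
```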
|
||||
|
||||
|
||||
@@ -49,7 +49,7 @@ prompt = "portrait photo of a old warrior chief"
|
||||
pipeline = pipeline.to("cuda")
|
||||
```
|
||||
|
||||
To make sure you can reuse the same image and improve on it, use a [`Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) and set a seed for [reproducibility](./using-diffusers/reusing_seeds):
|
||||
|
||||
```python
|
||||
import torch
|
||||
|
||||
@@ -12,7 +12,7 @@ specific language governing permissions and limitations under the License.
|
||||
|
||||
# Memory and speed
|
||||
|
||||
We present some techniques and ideas to optimize 🤗 Diffusers *inference* for memory or speed.
In general, we recommend [xFormers](https://github.com/facebookresearch/xformers) for memory-efficient attention, so check out the recommended [installation instructions](xformers).

We explain how the following settings affect performance and memory.
|
||||
@@ -27,7 +27,7 @@ specific language governing permissions and limitations under the License.
|
||||
| memory-efficient attention | 2.63s | x3.61 |
|
||||
|
||||
<em>
|
||||
A single 512x512 image was generated with the prompt "a photo of an astronaut riding a horse on mars" using 50 DDIM steps on an NVIDIA TITAN RTX.
|
||||
</em>
|
||||
|
||||
## Enable the cuDNN auto-tuner
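The cuDNN auto-tuner benchmarks several convolution algorithms and selects the fastest one for your hardware, which usually helps for convolution-heavy models like the UNet. Enabling it before inference is a one-line setting (a minimal sketch):

```python
import torch

torch.backends.cudnn.benchmark = True
```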
|
||||
@@ -44,11 +44,11 @@ torch.backends.cudnn.benchmark = True
|
||||
|
||||
### Use TF32 instead of FP32 (on Ampere and later CUDA devices)
|
||||
|
||||
On Ampere and later CUDA devices, matrix multiplications and convolutions can use the TensorFloat32 (TF32) mode, which is faster but slightly less accurate.
By default, PyTorch enables TF32 mode for convolutions but not for matrix multiplications.
Unless your network requires full float32 precision, we recommend enabling this setting for matrix multiplications too.
It can significantly speed up computation, typically with a negligible loss of numerical accuracy.
You can read more about it [here](https://huggingface.co/docs/transformers/v4.18.0/en/performance#tf32).
All you need to do is add the following before your inference code:
|
||||
|
||||
```python
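# Minimal sketch: enable TF32 for matrix multiplications (PyTorch already enables it for convolutions).
import torch

torch.backends.cuda.matmul.allow_tf32 = True
```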
|
||||
@@ -59,13 +59,13 @@ torch.backends.cuda.matmul.allow_tf32 = True
|
||||
|
||||
## Half-precision weights
|
||||
|
||||
To save more GPU memory and get more speed, you can load and run the model weights directly in half precision.
This involves loading the float16 version of the weights, which are stored in a branch named `fp16`, and telling PyTorch to use the `float16` type when loading them.
|
||||
|
||||
```Python
|
||||
pipe = StableDiffusionPipeline.from_pretrained(
|
||||
"runwayml/stable-diffusion-v1-5",
|
||||
|
||||
|
||||
torch_dtype=torch.float16,
|
||||
)
|
||||
pipe = pipe.to("cuda")
|
||||
@@ -75,7 +75,7 @@ image = pipe(prompt).images[0]
|
||||
```
|
||||
|
||||
<Tip warning={true}>
|
||||
It is strongly discouraged to use [`torch.autocast`](https://pytorch.org/docs/stable/amp.html#torch.autocast) in any of the pipelines, as it can lead to black images and is always slower than pure float16 precision.
|
||||
</Tip>
|
||||
|
||||
## Sliced attention for additional memory savings
|
||||
@@ -95,7 +95,7 @@ from diffusers import StableDiffusionPipeline
|
||||
|
||||
pipe = StableDiffusionPipeline.from_pretrained(
|
||||
"runwayml/stable-diffusion-v1-5",
|
||||
|
||||
|
||||
torch_dtype=torch.float16,
|
||||
)
|
||||
pipe = pipe.to("cuda")
|
||||
@@ -122,7 +122,7 @@ from diffusers import StableDiffusionPipeline
|
||||
|
||||
pipe = StableDiffusionPipeline.from_pretrained(
|
||||
"runwayml/stable-diffusion-v1-5",
|
||||
|
||||
|
||||
torch_dtype=torch.float16,
|
||||
)
|
||||
pipe = pipe.to("cuda")
|
||||
@@ -148,7 +148,7 @@ from diffusers import StableDiffusionPipeline
|
||||
|
||||
pipe = StableDiffusionPipeline.from_pretrained(
|
||||
"runwayml/stable-diffusion-v1-5",
|
||||
|
||||
|
||||
torch_dtype=torch.float16,
|
||||
)
|
||||
|
||||
@@ -165,7 +165,7 @@ image = pipe(prompt).images[0]
|
||||
Consider using <a href="#model_offloading">model offloading</a> as another optimization method; it is much faster, but the memory savings are not as large.
|
||||
</Tip>
|
||||
|
||||
It can also be combined with attention slicing to run with minimal memory (< 2GB).
|
||||
|
||||
|
||||
```Python
|
||||
@@ -174,7 +174,7 @@ from diffusers import StableDiffusionPipeline
|
||||
|
||||
pipe = StableDiffusionPipeline.from_pretrained(
|
||||
"runwayml/stable-diffusion-v1-5",
|
||||
|
||||
|
||||
torch_dtype=torch.float16,
|
||||
)
|
||||
|
||||
@@ -204,7 +204,7 @@ import torch
|
||||
from diffusers import StableDiffusionPipeline
|
||||
|
||||
pipe = StableDiffusionPipeline.from_pretrained(
|
||||
"runwayml/stable-diffusion-v1-5",
|
||||
"runwayml/stable-diffusion-v1-5",
|
||||
torch_dtype=torch.float16,
|
||||
)
|
||||
|
||||
@@ -355,7 +355,7 @@ unet_traced = torch.jit.load("unet_traced.pt")
|
||||
class TracedUNet(torch.nn.Module):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.in_channels = pipe.unet.config.in_channels
|
||||
self.device = pipe.unet.device
|
||||
|
||||
def forward(self, latent_model_input, t, encoder_hidden_states):
|
||||
@@ -387,7 +387,7 @@ with torch.inference_mode():
|
||||
| A100-SXM4-40GB | 18.6it/s | 29.it/s |
|
||||
| A100-SXM-80GB | 18.7it/s | 29.5it/s |
|
||||
|
||||
To take advantage of this, you need to make sure the following requirements are met:
- PyTorch > 1.12
- CUDA available
- [xformers installed](xformers)
|
||||
|
||||
@@ -49,7 +49,7 @@ prompt = "portrait photo of a old warrior chief"
|
||||
pipeline = pipeline.to("cuda")
|
||||
```
|
||||
|
||||
To make sure you can use the same image and improve on it, use a [`Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) and set a seed for [reproducibility](./using-diffusers/reusing_seeds):
|
||||
|
||||
```python
|
||||
import torch
|
||||
|
||||
@@ -14,7 +14,7 @@ specific language governing permissions and limitations under the License.
|
||||
|
||||
[[open-in-colab]]
|
||||
|
||||
🧨 Diffusers is designed to be a user-friendly and flexible toolbox for building diffusion systems tailored to your use case. At the core of the toolbox are models and schedulers. The [`DiffusionPipeline`] bundles these components together for convenience, but you can also unbundle the pipeline and use the models and schedulers separately to build new diffusion systems.

In this tutorial, you'll learn how to use models and schedulers to assemble a diffusion system for inference, starting with a basic pipeline and then progressing to the Stable Diffusion pipeline.
|
||||
|
||||
@@ -36,7 +36,7 @@ specific language governing permissions and limitations under the License.
|
||||
|
||||
That was super easy, but how did the pipeline do that? Let's break the pipeline down and take a look at what's happening under the hood.

In the example above, the pipeline contains a [`UNet2DModel`] model and a [`DDPMScheduler`]. The pipeline denoises an image by taking random noise the size of the desired output and passing it through the model several times. At each timestep, the model predicts the *noise residual* and the scheduler uses it to predict a less noisy image. The pipeline repeats this process until it reaches the end of the specified number of inference steps.

To recreate the pipeline with the model and scheduler separately, let's write our own denoising process.
|
||||
|
||||
@@ -210,7 +210,7 @@ Stable Diffusion 은 text-to-image *latent diffusion* 모델입니다. latent di
|
||||
|
||||
```py
|
||||
>>> latents = torch.randn(
|
||||
... (batch_size, unet.config.in_channels, height // 8, width // 8),
|
||||
... generator=generator,
|
||||
... device=torch_device,
|
||||
... )
|
||||
|
||||
@@ -51,7 +51,7 @@ prompt = "portrait photo of a old warrior chief"
|
||||
pipeline = pipeline.to("cuda")
|
||||
```
|
||||
|
||||
To make sure you can use the same image and improve on it, use a [`Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) and set a seed for [reproducibility](./using-diffusers/reusing_seeds):
|
||||
|
||||
```python
|
||||
import torch
|
||||
|
||||
@@ -42,7 +42,7 @@ Training examples show how to pretrain or fine-tune diffusion models for a varie
|
||||
| [**Dreambooth**](./dreambooth) | ✅ | - | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/sd_dreambooth_training.ipynb)
|
||||
| [**ControlNet**](./controlnet) | ✅ | ✅ | -
|
||||
| [**InstructPix2Pix**](./instruct_pix2pix) | ✅ | ✅ | -
|
||||
| [**Reinforcement Learning for Control**](./reinforcement_learning) | - | - | coming soon.
|
||||
|
||||
## Community
|
||||
|
||||
|
||||
@@ -234,7 +234,7 @@ In ComfyUI we will load a LoRA and a textual embedding at the same time.
|
||||
SDXL's VAE is known to suffer from numerical instability issues. This is why we also expose a CLI argument namely `--pretrained_vae_model_name_or_path` that lets you specify the location of a better VAE (such as [this one](https://huggingface.co/madebyollin/sdxl-vae-fp16-fix)).
|
||||
|
||||
### DoRA training
|
||||
The advanced script supports DoRA training too!
|
||||
> Proposed in [DoRA: Weight-Decomposed Low-Rank Adaptation](https://arxiv.org/abs/2402.09353),
|
||||
**DoRA** is very similar to LoRA, except it decomposes the pre-trained weight into two components, **magnitude** and **direction** and employs LoRA for _directional_ updates to efficiently minimize the number of trainable parameters.
|
||||
The authors found that by using DoRA, both the learning capacity and training stability of LoRA are enhanced without any additional overhead during inference.
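A usage sketch (the flag name follows the `--use_dora` argument defined in the training script's parser): enable DoRA by adding this flag to your `accelerate launch` command.

```bash
--use_dora
```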
|
||||
@@ -304,10 +304,151 @@ accelerate launch train_dreambooth_lora_sdxl_advanced.py \
|
||||
> [!CAUTION]
|
||||
> Min-SNR gamma is not supported with the EDM-style training yet. When training with the PlaygroundAI model, it's recommended to not pass any "variant".
|
||||
|
||||
### B-LoRA training
|
||||
The advanced script now supports B-LoRA training too!
|
||||
> Proposed in [Implicit Style-Content Separation using B-LoRA](https://arxiv.org/abs/2403.14572),
|
||||
B-LoRA is a method that leverages LoRA to implicitly separate the style and content components of a **single** image.
|
||||
It was shown that learning the LoRA weights of two specific blocks (referred to as B-LoRAs)
|
||||
achieves style-content separation that cannot be achieved by training each B-LoRA independently.
|
||||
Once trained, the two B-LoRAs can be used as independent components to allow various image stylization tasks
|
||||
|
||||
**Usage**
|
||||
Enable B-LoRA training by adding this flag
|
||||
```bash
|
||||
--use_blora
|
||||
```
|
||||
You can train a B-LoRA with as little as 1 image, and 1000 steps. Try this default configuration as a start:
|
||||
```bash
|
||||
!accelerate launch train_dreambooth_b-lora_sdxl.py \
|
||||
--pretrained_model_name_or_path="stabilityai/stable-diffusion-xl-base-1.0" \
|
||||
--instance_data_dir="linoyts/B-LoRA_teddy_bear" \
|
||||
--output_dir="B-LoRA_teddy_bear" \
|
||||
--instance_prompt="a [v18]" \
|
||||
--resolution=1024 \
|
||||
--rank=64 \
|
||||
--train_batch_size=1 \
|
||||
--learning_rate=5e-5 \
|
||||
--lr_scheduler="constant" \
|
||||
--lr_warmup_steps=0 \
|
||||
--max_train_steps=1000 \
|
||||
--checkpointing_steps=2000 \
|
||||
--seed="0" \
|
||||
--gradient_checkpointing \
|
||||
--mixed_precision="fp16"
|
||||
```
|
||||
**Inference**
|
||||
The inference is a bit different:
1. we need to load *specific* unet layers (as opposed to a regular LoRA/DoRA)
2. the trained layers we load change based on our objective (e.g. style/content)
|
||||
|
||||
```python
|
||||
import torch
|
||||
from diffusers import StableDiffusionXLPipeline, AutoencoderKL
|
||||
|
||||
# taken & modified from B-LoRA repo - https://github.com/yardenfren1996/B-LoRA/blob/main/blora_utils.py
|
||||
def is_belong_to_blocks(key, blocks):
|
||||
try:
|
||||
for g in blocks:
|
||||
if g in key:
|
||||
return True
|
||||
return False
|
||||
except Exception as e:
|
||||
raise type(e)(f'failed to is_belong_to_block, due to: {e}')
|
||||
|
||||
def lora_lora_unet_blocks(lora_path, alpha, target_blocks):
|
||||
state_dict, _ = pipeline.lora_state_dict(lora_path)
|
||||
filtered_state_dict = {k: v * alpha for k, v in state_dict.items() if is_belong_to_blocks(k, target_blocks)}
|
||||
return filtered_state_dict
|
||||
|
||||
vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16)
|
||||
pipeline = StableDiffusionXLPipeline.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0",
|
||||
vae=vae,
|
||||
torch_dtype=torch.float16,
|
||||
).to("cuda")
|
||||
|
||||
# pick a blora for content/style (you can also set one to None)
|
||||
content_B_lora_path = "lora-library/B-LoRA-teddybear"
|
||||
style_B_lora_path= "lora-library/B-LoRA-pen_sketch"
|
||||
|
||||
|
||||
content_B_LoRA = lora_lora_unet_blocks(content_B_lora_path,alpha=1,target_blocks=["unet.up_blocks.0.attentions.0"])
|
||||
style_B_LoRA = lora_lora_unet_blocks(style_B_lora_path,alpha=1.1,target_blocks=["unet.up_blocks.0.attentions.1"])
|
||||
combined_lora = {**content_B_LoRA, **style_B_LoRA}
|
||||
|
||||
# Load both loras
|
||||
pipeline.load_lora_into_unet(combined_lora, None, pipeline.unet)
|
||||
|
||||
#generate
|
||||
prompt = "a [v18] in [v30] style"
|
||||
pipeline(prompt, num_images_per_prompt=4).images
|
||||
```
|
||||
### LoRA training of Targeted U-net Blocks
|
||||
The advanced script now supports custom choice of U-net blocks to train during Dreambooth LoRA tuning.
|
||||
> [!NOTE]
|
||||
> This feature is still experimental
|
||||
|
||||
> Recently, works like B-LoRA showed the potential advantages of learning the LoRA weights of specific U-net blocks, not only in speed & memory,
|
||||
> but also in reducing the amount of needed data, improving style manipulation and overcoming overfitting issues.
|
||||
> In light of this, we're introducing a new feature to the advanced script to allow for configurable U-net learned blocks.
|
||||
|
||||
**Usage**
|
||||
Configure the LoRA-learned U-net blocks by adding a `--lora_unet_blocks` flag with a comma-separated string specifying the targeted blocks,
e.g.:
|
||||
```bash
|
||||
--lora_unet_blocks="unet.up_blocks.0.attentions.0,unet.up_blocks.0.attentions.1"
|
||||
```
|
||||
|
||||
> [!NOTE]
|
||||
> If you specify both `--use_blora` and `--lora_unet_blocks`, values given in `--lora_unet_blocks` will be ignored.
|
||||
> When enabling --use_blora, targeted U-net blocks are automatically set to be "unet.up_blocks.0.attentions.0,unet.up_blocks.0.attentions.1" as discussed in the paper.
|
||||
> If you wish to experiment with different blocks, specify `--lora_unet_blocks` only.
|
||||
|
||||
**Inference**
|
||||
Inference is the same as for B-LoRAs, except the input targeted blocks should be modified based on your training configuration.
|
||||
```python
|
||||
import torch
|
||||
from diffusers import StableDiffusionXLPipeline, AutoencoderKL
|
||||
|
||||
# taken & modified from B-LoRA repo - https://github.com/yardenfren1996/B-LoRA/blob/main/blora_utils.py
|
||||
def is_belong_to_blocks(key, blocks):
|
||||
try:
|
||||
for g in blocks:
|
||||
if g in key:
|
||||
return True
|
||||
return False
|
||||
except Exception as e:
|
||||
raise type(e)(f'failed to is_belong_to_block, due to: {e}')
|
||||
|
||||
def lora_lora_unet_blocks(lora_path, alpha, target_blocks):
|
||||
state_dict, _ = pipeline.lora_state_dict(lora_path)
|
||||
filtered_state_dict = {k: v * alpha for k, v in state_dict.items() if is_belong_to_blocks(k, target_blocks)}
|
||||
return filtered_state_dict
|
||||
|
||||
vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16)
|
||||
pipeline = StableDiffusionXLPipeline.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0",
|
||||
vae=vae,
|
||||
torch_dtype=torch.float16,
|
||||
).to("cuda")
|
||||
|
||||
lora_path = "lora-library/B-LoRA-pen_sketch"
|
||||
|
||||
state_dict = lora_lora_unet_blocks(lora_path, alpha=1, target_blocks=["unet.up_blocks.0.attentions.0"])
|
||||
|
||||
# Load the trained LoRA layers into the UNet
|
||||
pipeline.load_lora_into_unet(state_dict, None, pipeline.unet)
|
||||
|
||||
#generate
|
||||
prompt = "a dog in [v30] style"
|
||||
pipeline(prompt, num_images_per_prompt=4).images
|
||||
```
|
||||
|
||||
|
||||
### Tips and Tricks
|
||||
Check out [these recommended practices](https://huggingface.co/blog/sdxl_lora_advanced_script#additional-good-practices)
|
||||
|
||||
## Running on Colab Notebook
|
||||
Check out [this notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/SDXL_DreamBooth_LoRA_advanced_example.ipynb).
|
||||
Check out [this notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/SDXL_Dreambooth_LoRA_advanced_example.ipynb).
|
||||
to train using the advanced features (including pivotal tuning), and [this notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/SDXL_DreamBooth_LoRA_.ipynb) to train on a free colab, using some of the advanced features (excluding pivotal tuning)
|
||||
|
||||
|
||||
@@ -23,6 +23,7 @@ import os
|
||||
import re
|
||||
import shutil
|
||||
import warnings
|
||||
from contextlib import nullcontext
|
||||
from pathlib import Path
|
||||
from typing import List, Optional
|
||||
|
||||
@@ -70,7 +71,7 @@ from diffusers.utils.import_utils import is_xformers_available
|
||||
|
||||
|
||||
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
|
||||
check_min_version("0.27.0.dev0")
|
||||
check_min_version("0.28.0.dev0")
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
@@ -656,7 +657,6 @@ def parse_args(input_args=None):
|
||||
)
|
||||
parser.add_argument(
|
||||
"--use_dora",
|
||||
type=bool,
|
||||
action="store_true",
|
||||
default=False,
|
||||
help=(
|
||||
@@ -1845,7 +1845,12 @@ def main(args):
|
||||
generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) if args.seed else None
|
||||
pipeline_args = {"prompt": args.validation_prompt}
|
||||
|
||||
with torch.cuda.amp.autocast():
|
||||
if torch.backends.mps.is_available():
|
||||
autocast_ctx = nullcontext()
|
||||
else:
|
||||
autocast_ctx = torch.autocast(accelerator.device.type)
|
||||
|
||||
with autocast_ctx:
|
||||
images = [
|
||||
pipeline(**pipeline_args, generator=generator).images[0]
|
||||
for _ in range(args.num_validation_images)
|
||||
|
||||
@@ -14,9 +14,7 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
|
||||
import argparse
|
||||
import contextlib
|
||||
import gc
|
||||
import hashlib
|
||||
import itertools
|
||||
import json
|
||||
import logging
|
||||
@@ -26,6 +24,7 @@ import random
|
||||
import re
|
||||
import shutil
|
||||
import warnings
|
||||
from contextlib import nullcontext
|
||||
from pathlib import Path
|
||||
from typing import List, Optional
|
||||
|
||||
@@ -40,6 +39,7 @@ from accelerate import Accelerator
|
||||
from accelerate.logging import get_logger
|
||||
from accelerate.utils import DistributedDataParallelKwargs, ProjectConfiguration, set_seed
|
||||
from huggingface_hub import create_repo, hf_hub_download, upload_folder
|
||||
from huggingface_hub.utils import insecure_hashlib
|
||||
from packaging import version
|
||||
from peft import LoraConfig, set_peft_model_state_dict
|
||||
from peft.utils import get_peft_model_state_dict
|
||||
@@ -78,7 +78,7 @@ from diffusers.utils.torch_utils import is_compiled_module
|
||||
|
||||
|
||||
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
|
||||
check_min_version("0.27.0.dev0")
|
||||
check_min_version("0.28.0.dev0")
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
@@ -696,6 +696,23 @@ def parse_args(input_args=None):
|
||||
"Note: to use DoRA you need to install peft from main, `pip install git+https://github.com/huggingface/peft.git`"
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--lora_unet_blocks",
|
||||
type=str,
|
||||
default=None,
|
||||
help=(
|
||||
"the U-net blocks to tune during training. please specify them in a comma separated string, e.g. `unet.up_blocks.0.attentions.0,unet.up_blocks.0.attentions.1` etc."
|
||||
"NOTE: By default (if not specified) - regular LoRA training is performed. "
|
||||
"if --use_blora is enabled, this arg will be ignored, since in B-LoRA training, targeted U-net blocks are `unet.up_blocks.0.attentions.0` and `unet.up_blocks.0.attentions.1`"
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--use_blora",
|
||||
action="store_true",
|
||||
help=(
|
||||
"Whether to train a B-LoRA as proposed in- Implicit Style-Content Separation using B-LoRA https://arxiv.org/abs/2403.14572. "
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--cache_latents",
|
||||
action="store_true",
|
||||
@@ -720,6 +737,11 @@ def parse_args(input_args=None):
|
||||
"For full LoRA text encoder training check --train_text_encoder, for textual "
|
||||
"inversion training check `--train_text_encoder_ti`"
|
||||
)
|
||||
if args.use_blora and args.lora_unet_blocks:
|
||||
warnings.warn(
|
||||
"You specified both `--use_blora` and `--lora_unet_blocks`, for B-LoRA training, target unet blocks are: `unet.up_blocks.0.attentions.0` and `unet.up_blocks.0.attentions.1`. "
|
||||
"If you wish to target different U-net blocks, don't enable `--use_blora`"
|
||||
)
|
||||
|
||||
env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
|
||||
if env_local_rank != -1 and env_local_rank != args.local_rank:
|
||||
@@ -740,6 +762,40 @@ def parse_args(input_args=None):
|
||||
return args
|
||||
|
||||
|
||||
# Taken (and slightly modified) from B-LoRA repo https://github.com/yardenfren1996/B-LoRA/blob/main/blora_utils.py
|
||||
def is_belong_to_blocks(key, blocks):
|
||||
try:
|
||||
for g in blocks:
|
||||
if g in key:
|
||||
return True
|
||||
return False
|
||||
except Exception as e:
|
||||
raise type(e)(f"failed to is_belong_to_block, due to: {e}")
|
||||
|
||||
|
||||
def get_unet_lora_target_modules(unet, use_blora, target_blocks=None):
|
||||
if use_blora:
|
||||
content_b_lora_blocks = "unet.up_blocks.0.attentions.0"
|
||||
style_b_lora_blocks = "unet.up_blocks.0.attentions.1"
|
||||
target_blocks = [content_b_lora_blocks, style_b_lora_blocks]
|
||||
try:
|
||||
blocks = [(".").join(blk.split(".")[1:]) for blk in target_blocks]
|
||||
|
||||
attns = [
|
||||
attn_processor_name.rsplit(".", 1)[0]
|
||||
for attn_processor_name, _ in unet.attn_processors.items()
|
||||
if is_belong_to_blocks(attn_processor_name, blocks)
|
||||
]
|
||||
|
||||
target_modules = [f"{attn}.{mat}" for mat in ["to_k", "to_q", "to_v", "to_out.0"] for attn in attns]
|
||||
return target_modules
|
||||
except Exception as e:
|
||||
raise type(e)(
|
||||
f"failed to get_target_modules, due to: {e}. "
|
||||
f"Please check the modules specified in --lora_unet_blocks are correct"
|
||||
)
|
||||
|
||||
|
||||
# Taken from https://github.com/replicate/cog-sdxl/blob/main/dataset_and_utils.py
|
||||
class TokenEmbeddingsHandler:
|
||||
def __init__(self, text_encoders, tokenizers):
|
||||
@@ -946,16 +1002,20 @@ class DreamBoothDataset(Dataset):
|
||||
transforms.Normalize([0.5], [0.5]),
|
||||
]
|
||||
)
|
||||
# if using B-LoRA for single image. do not use transformations
|
||||
single_image = len(self.instance_images) < 2
|
||||
for image in self.instance_images:
|
||||
image = exif_transpose(image)
|
||||
if not single_image:
|
||||
image = exif_transpose(image)
|
||||
if not image.mode == "RGB":
|
||||
image = image.convert("RGB")
|
||||
self.original_sizes.append((image.height, image.width))
|
||||
image = train_resize(image)
|
||||
if args.random_flip and random.random() < 0.5:
|
||||
|
||||
if not single_image and args.random_flip and random.random() < 0.5:
|
||||
# flip
|
||||
image = train_flip(image)
|
||||
if args.center_crop:
|
||||
if args.center_crop or single_image:
|
||||
y1 = max(0, int(round((image.height - args.resolution) / 2.0)))
|
||||
x1 = max(0, int(round((image.width - args.resolution) / 2.0)))
|
||||
image = train_crop(image)
|
||||
@@ -1216,7 +1276,7 @@ def main(args):
|
||||
images = pipeline(example["prompt"]).images
|
||||
|
||||
for i, image in enumerate(images):
|
||||
hash_image = hashlib.sha1(image.tobytes()).hexdigest()
|
||||
hash_image = insecure_hashlib.sha1(image.tobytes()).hexdigest()
|
||||
image_filename = class_images_dir / f"{example['index'][i] + cur_class_images}-{hash_image}.jpg"
|
||||
image.save(image_filename)
|
||||
|
||||
@@ -1374,12 +1434,24 @@ def main(args):
|
||||
text_encoder_two.gradient_checkpointing_enable()
|
||||
|
||||
# now we will add new LoRA weights to the attention layers
|
||||
|
||||
if args.use_blora:
|
||||
# if using B-LoRA, the targeted blocks to train are automatically set
|
||||
target_modules = get_unet_lora_target_modules(unet, use_blora=True)
|
||||
elif args.lora_unet_blocks:
|
||||
# if training specific unet blocks not in the B-LoRA scheme
|
||||
target_blocks_list = "".join(args.lora_unet_blocks.split()).split(",")
|
||||
logger.info(f"list of unet blocks to train: {target_blocks_list}")
|
||||
target_modules = get_unet_lora_target_modules(unet, use_blora=False, target_blocks=target_blocks_list)
|
||||
else:
|
||||
target_modules = ["to_k", "to_q", "to_v", "to_out.0"]
|
||||
|
||||
unet_lora_config = LoraConfig(
|
||||
r=args.rank,
|
||||
lora_alpha=args.rank,
|
||||
use_dora=args.use_dora,
|
||||
lora_alpha=args.rank,
|
||||
init_lora_weights="gaussian",
|
||||
target_modules=["to_k", "to_q", "to_v", "to_out.0"],
|
||||
target_modules=target_modules,
|
||||
)
|
||||
unet.add_adapter(unet_lora_config)
|
||||
|
||||
@@ -1388,8 +1460,8 @@ def main(args):
|
||||
if args.train_text_encoder:
|
||||
text_lora_config = LoraConfig(
|
||||
r=args.rank,
|
||||
lora_alpha=args.rank,
|
||||
use_dora=args.use_dora,
|
||||
lora_alpha=args.rank,
|
||||
init_lora_weights="gaussian",
|
||||
target_modules=["q_proj", "k_proj", "v_proj", "out_proj"],
|
||||
)
|
||||
@@ -1505,6 +1577,7 @@ def main(args):
|
||||
models = [unet_]
|
||||
if args.train_text_encoder:
|
||||
models.extend([text_encoder_one_, text_encoder_two_])
|
||||
# only upcast trainable parameters (LoRA) into fp32
|
||||
cast_training_params(models)
|
||||
|
||||
accelerator.register_save_state_pre_hook(save_model_hook)
|
||||
@@ -1525,6 +1598,8 @@ def main(args):
|
||||
models = [unet]
|
||||
if args.train_text_encoder:
|
||||
models.extend([text_encoder_one, text_encoder_two])
|
||||
|
||||
# only upcast trainable parameters (LoRA) into fp32
|
||||
cast_training_params(models, dtype=torch.float32)
|
||||
|
||||
unet_lora_parameters = list(filter(lambda p: p.requires_grad, unet.parameters()))
|
||||
@@ -1780,7 +1855,12 @@ def main(args):
|
||||
# We need to initialize the trackers we use, and also store our configuration.
|
||||
# The trackers initializes automatically on the main process.
|
||||
if accelerator.is_main_process:
|
||||
accelerator.init_trackers("dreambooth-lora-sd-xl", config=vars(args))
|
||||
tracker_name = (
|
||||
"dreambooth-lora-sd-xl"
|
||||
if "playground" not in args.pretrained_model_name_or_path
|
||||
else "dreambooth-lora-playground"
|
||||
)
|
||||
accelerator.init_trackers(tracker_name, config=vars(args))
|
||||
|
||||
# Train!
|
||||
total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
|
||||
@@ -1833,7 +1913,6 @@ def main(args):
|
||||
)
|
||||
|
||||
def get_sigmas(timesteps, n_dim=4, dtype=torch.float32):
|
||||
# TODO: revisit other sampling algorithms
|
||||
sigmas = noise_scheduler.sigmas.to(device=accelerator.device, dtype=dtype)
|
||||
schedule_timesteps = noise_scheduler.timesteps.to(accelerator.device)
|
||||
timesteps = timesteps.to(accelerator.device)
|
||||
@@ -1852,6 +1931,7 @@ def main(args):
|
||||
# flag used for textual inversion
|
||||
pivoted = False
|
||||
for epoch in range(first_epoch, args.num_train_epochs):
|
||||
unet.train()
|
||||
# if performing any kind of optimization of text_encoder params
|
||||
if args.train_text_encoder or args.train_text_encoder_ti:
|
||||
if epoch == num_train_epochs_text_encoder:
|
||||
@@ -1869,7 +1949,6 @@ def main(args):
|
||||
text_encoder_one.text_model.embeddings.requires_grad_(True)
|
||||
text_encoder_two.text_model.embeddings.requires_grad_(True)
|
||||
|
||||
unet.train()
|
||||
for step, batch in enumerate(train_dataloader):
|
||||
if pivoted:
|
||||
# stopping optimization of text_encoder params
|
||||
@@ -1970,7 +2049,8 @@ def main(args):
|
||||
timesteps,
|
||||
prompt_embeds_input,
|
||||
added_cond_kwargs=unet_added_conditions,
|
||||
).sample
|
||||
return_dict=False,
|
||||
)[0]
|
||||
else:
|
||||
unet_added_conditions = {"time_ids": add_time_ids}
|
||||
prompt_embeds, pooled_prompt_embeds = encode_prompt(
|
||||
@@ -1988,7 +2068,8 @@ def main(args):
|
||||
timesteps,
|
||||
prompt_embeds_input,
|
||||
added_cond_kwargs=unet_added_conditions,
|
||||
).sample
|
||||
return_dict=False,
|
||||
)[0]
|
||||
|
||||
weighting = None
|
||||
if args.do_edm_style_training:
|
||||
@@ -2192,13 +2273,12 @@ def main(args):
|
||||
# run inference
|
||||
generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) if args.seed else None
|
||||
pipeline_args = {"prompt": args.validation_prompt}
|
||||
inference_ctx = (
|
||||
contextlib.nullcontext()
|
||||
if "playground" in args.pretrained_model_name_or_path
|
||||
else torch.cuda.amp.autocast()
|
||||
)
|
||||
if torch.backends.mps.is_available() or "playground" in args.pretrained_model_name_or_path:
|
||||
autocast_ctx = nullcontext()
|
||||
else:
|
||||
autocast_ctx = torch.autocast(accelerator.device.type)
|
||||
|
||||
with inference_ctx:
|
||||
with autocast_ctx:
|
||||
images = [
|
||||
pipeline(**pipeline_args, generator=generator).images[0]
|
||||
for _ in range(args.num_validation_images)
|
||||
|
||||
@@ -430,6 +430,9 @@ def main(args):
|
||||
log_with=args.report_to,
|
||||
project_config=accelerator_project_config,
|
||||
)
|
||||
# Disable AMP for MPS.
|
||||
if torch.backends.mps.is_available():
|
||||
accelerator.native_amp = False
|
||||
|
||||
if accelerator.is_main_process:
|
||||
os.makedirs(args.output_dir, exist_ok=True)
|
||||
|
||||
File diff suppressed because it is too large
232 examples/community/README_community_scripts.md Normal file
@@ -0,0 +1,232 @@
|
||||
# Community Scripts
|
||||
|
||||
**Community scripts** consist of inference examples using Diffusers pipelines that have been added by the community.
|
||||
Please have a look at the following table to get an overview of all community examples. Click on the **Code Example** to get a copy-and-paste code example that you can try out.
|
||||
If a community script doesn't work as expected, please open an issue and ping the author on it.
|
||||
|
||||
| Example | Description | Code Example | Colab | Author |
|
||||
|:--------------------------------------------------------------------------------------------------------------------------------------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:------------------------------------------------------------------------------------------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------:|
|
||||
| Using IP-Adapter with negative noise | Using negative noise with IP-adapter to better control the generation (see the [original post](https://github.com/huggingface/diffusers/discussions/7167) on the forum for more details) | [IP-Adapter Negative Noise](#ip-adapter-negative-noise) | | [Álvaro Somoza](https://github.com/asomoza)|
|
||||
| asymmetric tiling |configure seamless image tiling independently for the X and Y axes | [Asymmetric Tiling](#asymmetric-tiling ) | | [alexisrolland](https://github.com/alexisrolland)|
|
||||
|
||||
|
||||
## Example usages
|
||||
|
||||
### IP Adapter Negative Noise
|
||||
|
||||
Diffusers pipelines are fully integrated with IP-Adapter, which allows you to prompt the diffusion model with an image. However, it does not support negative image prompts (there is no `negative_ip_adapter_image` argument) the same way it supports negative text prompts. When you pass an `ip_adapter_image`, it will create a zero-filled tensor as a negative image. This script shows you how to create negative noise from the `ip_adapter_image` and use it to significantly improve the generation quality while preserving the composition of images.

[cubiq](https://github.com/cubiq) initially developed this feature in his [repository](https://github.com/cubiq/ComfyUI_IPAdapter_plus). The community script was contributed by [asomoza](https://github.com/asomoza). You can find more details about this experimentation in [this discussion](https://github.com/huggingface/diffusers/discussions/7167).
|
||||
|
||||
IP-Adapter without negative noise (source / result comparison images omitted)

IP-Adapter with negative noise (source / result comparison images omitted)
|
||||
|
||||
```python
|
||||
import torch
|
||||
|
||||
from diffusers import AutoencoderKL, DPMSolverMultistepScheduler, StableDiffusionXLPipeline
|
||||
from diffusers.models import ImageProjection
|
||||
from diffusers.utils import load_image
|
||||
|
||||
|
||||
def encode_image(
|
||||
image_encoder,
|
||||
feature_extractor,
|
||||
image,
|
||||
device,
|
||||
num_images_per_prompt,
|
||||
output_hidden_states=None,
|
||||
negative_image=None,
|
||||
):
|
||||
dtype = next(image_encoder.parameters()).dtype
|
||||
|
||||
if not isinstance(image, torch.Tensor):
|
||||
image = feature_extractor(image, return_tensors="pt").pixel_values
|
||||
|
||||
image = image.to(device=device, dtype=dtype)
|
||||
if output_hidden_states:
|
||||
image_enc_hidden_states = image_encoder(image, output_hidden_states=True).hidden_states[-2]
|
||||
image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
|
||||
|
||||
if negative_image is None:
|
||||
uncond_image_enc_hidden_states = image_encoder(
|
||||
torch.zeros_like(image), output_hidden_states=True
|
||||
).hidden_states[-2]
|
||||
else:
|
||||
if not isinstance(negative_image, torch.Tensor):
|
||||
negative_image = feature_extractor(negative_image, return_tensors="pt").pixel_values
|
||||
negative_image = negative_image.to(device=device, dtype=dtype)
|
||||
uncond_image_enc_hidden_states = image_encoder(negative_image, output_hidden_states=True).hidden_states[-2]
|
||||
|
||||
uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
|
||||
return image_enc_hidden_states, uncond_image_enc_hidden_states
|
||||
else:
|
||||
image_embeds = image_encoder(image).image_embeds
|
||||
image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
|
||||
uncond_image_embeds = torch.zeros_like(image_embeds)
|
||||
|
||||
return image_embeds, uncond_image_embeds
|
||||
|
||||
|
||||
@torch.no_grad()
|
||||
def prepare_ip_adapter_image_embeds(
|
||||
unet,
|
||||
image_encoder,
|
||||
feature_extractor,
|
||||
ip_adapter_image,
|
||||
do_classifier_free_guidance,
|
||||
device,
|
||||
num_images_per_prompt,
|
||||
ip_adapter_negative_image=None,
|
||||
):
|
||||
if not isinstance(ip_adapter_image, list):
|
||||
ip_adapter_image = [ip_adapter_image]
|
||||
|
||||
if len(ip_adapter_image) != len(unet.encoder_hid_proj.image_projection_layers):
|
||||
raise ValueError(
|
||||
f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
|
||||
)
|
||||
|
||||
image_embeds = []
|
||||
for single_ip_adapter_image, image_proj_layer in zip(
|
||||
ip_adapter_image, unet.encoder_hid_proj.image_projection_layers
|
||||
):
|
||||
output_hidden_state = not isinstance(image_proj_layer, ImageProjection)
|
||||
single_image_embeds, single_negative_image_embeds = encode_image(
|
||||
image_encoder,
|
||||
feature_extractor,
|
||||
single_ip_adapter_image,
|
||||
device,
|
||||
1,
|
||||
output_hidden_state,
|
||||
negative_image=ip_adapter_negative_image,
|
||||
)
|
||||
single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0)
|
||||
single_negative_image_embeds = torch.stack([single_negative_image_embeds] * num_images_per_prompt, dim=0)
|
||||
|
||||
if do_classifier_free_guidance:
|
||||
single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
|
||||
single_image_embeds = single_image_embeds.to(device)
|
||||
|
||||
image_embeds.append(single_image_embeds)
|
||||
|
||||
return image_embeds
|
||||
|
||||
|
||||
vae = AutoencoderKL.from_pretrained(
|
||||
"madebyollin/sdxl-vae-fp16-fix",
|
||||
torch_dtype=torch.float16,
|
||||
).to("cuda")
|
||||
|
||||
pipeline = StableDiffusionXLPipeline.from_pretrained(
|
||||
"RunDiffusion/Juggernaut-XL-v9",
|
||||
torch_dtype=torch.float16,
|
||||
vae=vae,
|
||||
variant="fp16",
|
||||
).to("cuda")
|
||||
|
||||
pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config)
|
||||
pipeline.scheduler.config.use_karras_sigmas = True
|
||||
|
||||
pipeline.load_ip_adapter(
|
||||
"h94/IP-Adapter",
|
||||
subfolder="sdxl_models",
|
||||
weight_name="ip-adapter-plus_sdxl_vit-h.safetensors",
|
||||
image_encoder_folder="models/image_encoder",
|
||||
)
|
||||
pipeline.set_ip_adapter_scale(0.7)
|
||||
|
||||
ip_image = load_image("source.png")
|
||||
negative_ip_image = load_image("noise.png")
|
||||
|
||||
image_embeds = prepare_ip_adapter_image_embeds(
|
||||
unet=pipeline.unet,
|
||||
image_encoder=pipeline.image_encoder,
|
||||
feature_extractor=pipeline.feature_extractor,
|
||||
ip_adapter_image=[[ip_image]],
|
||||
do_classifier_free_guidance=True,
|
||||
device="cuda",
|
||||
num_images_per_prompt=1,
|
||||
ip_adapter_negative_image=negative_ip_image,
|
||||
)
|
||||
|
||||
|
||||
prompt = "cinematic photo of a cyborg in the city, 4k, high quality, intricate, highly detailed"
|
||||
negative_prompt = "blurry, smooth, plastic"
|
||||
|
||||
image = pipeline(
|
||||
prompt=prompt,
|
||||
negative_prompt=negative_prompt,
|
||||
ip_adapter_image_embeds=image_embeds,
|
||||
guidance_scale=6.0,
|
||||
num_inference_steps=25,
|
||||
generator=torch.Generator(device="cpu").manual_seed(1556265306),
|
||||
).images[0]
|
||||
|
||||
image.save("result.png")
|
||||
```
|
||||
|
||||
### Asymmetric Tiling
|
||||
Stable Diffusion is not trained to generate seamless textures. However, you can use this simple script to add tiling to your generation. This script was contributed by [alexisrolland](https://github.com/alexisrolland). See more details in [this issue](https://github.com/huggingface/diffusers/issues/556).
|
||||
|
||||
|
||||
Generated vs. tiled result images (omitted)
|
||||
|
||||
|
||||
```py
|
||||
import torch
|
||||
from typing import Optional
|
||||
from diffusers import StableDiffusionPipeline
|
||||
from diffusers.models.lora import LoRACompatibleConv
|
||||
|
||||
def seamless_tiling(pipeline, x_axis, y_axis):
|
||||
def asymmetric_conv2d_convforward(self, input: torch.Tensor, weight: torch.Tensor, bias: Optional[torch.Tensor] = None):
|
||||
self.paddingX = (self._reversed_padding_repeated_twice[0], self._reversed_padding_repeated_twice[1], 0, 0)
|
||||
self.paddingY = (0, 0, self._reversed_padding_repeated_twice[2], self._reversed_padding_repeated_twice[3])
|
||||
working = torch.nn.functional.pad(input, self.paddingX, mode=x_mode)
|
||||
working = torch.nn.functional.pad(working, self.paddingY, mode=y_mode)
|
||||
return torch.nn.functional.conv2d(working, weight, bias, self.stride, torch.nn.modules.utils._pair(0), self.dilation, self.groups)
|
||||
x_mode = 'circular' if x_axis else 'constant'
|
||||
y_mode = 'circular' if y_axis else 'constant'
|
||||
targets = [pipeline.vae, pipeline.text_encoder, pipeline.unet]
|
||||
convolution_layers = []
|
||||
for target in targets:
|
||||
for module in target.modules():
|
||||
if isinstance(module, torch.nn.Conv2d):
|
||||
convolution_layers.append(module)
|
||||
for layer in convolution_layers:
|
||||
if isinstance(layer, LoRACompatibleConv) and layer.lora_layer is None:
|
||||
layer.lora_layer = lambda * x: 0
|
||||
layer._conv_forward = asymmetric_conv2d_convforward.__get__(layer, torch.nn.Conv2d)
|
||||
return pipeline
|
||||
|
||||
pipeline = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True)
|
||||
pipeline.enable_model_cpu_offload()
|
||||
prompt = ["texture of a red brick wall"]
|
||||
seed = 123456
|
||||
generator = torch.Generator(device='cuda').manual_seed(seed)
|
||||
|
||||
pipeline = seamless_tiling(pipeline=pipeline, x_axis=True, y_axis=True)
|
||||
image = pipeline(
|
||||
prompt=prompt,
|
||||
width=512,
|
||||
height=512,
|
||||
num_inference_steps=20,
|
||||
guidance_scale=7,
|
||||
num_images_per_prompt=1,
|
||||
generator=generator
|
||||
).images[0]
|
||||
seamless_tiling(pipeline=pipeline, x_axis=False, y_axis=False)
|
||||
|
||||
torch.cuda.empty_cache()
|
||||
image.save('image.png')
|
||||
```
|
||||
@@ -103,7 +103,7 @@ class CheckpointMergerPipeline(DiffusionPipeline):
|
||||
print(f"Combining with alpha={alpha}, interpolation mode={interp}")
|
||||
|
||||
checkpoint_count = len(pretrained_model_name_or_path_list)
|
||||
# Ignore result from model_index_json comparision of the two checkpoints
|
||||
# Ignore result from model_index_json comparison of the two checkpoints
|
||||
force = kwargs.pop("force", False)
|
||||
|
||||
# If less than 2 checkpoints, nothing to merge. If more than 3, not supported for now.
|
||||
@@ -217,7 +217,7 @@ class CheckpointMergerPipeline(DiffusionPipeline):
|
||||
]
|
||||
checkpoint_path_2 = files[0] if len(files) > 0 else None
|
||||
# For an attr if both checkpoint_path_1 and 2 are None, ignore.
|
||||
# If atleast one is present, deal with it according to interp method, of course only if the state_dict keys match.
|
||||
# If at least one is present, deal with it according to interp method, of course only if the state_dict keys match.
|
||||
if checkpoint_path_1 is None and checkpoint_path_2 is None:
|
||||
print(f"Skipping {attr}: not present in 2nd or 3d model")
|
||||
continue
|
||||
|
||||
@@ -359,9 +359,16 @@ class CLIPGuidedStableDiffusion(DiffusionPipeline, StableDiffusionMixin):
|
||||
|
||||
# Preprocess image
|
||||
image = preprocess(image, width, height)
|
||||
latents = self.prepare_latents(
|
||||
image, latent_timestep, batch_size, num_images_per_prompt, text_embeddings.dtype, self.device, generator
|
||||
)
|
||||
if latents is None:
|
||||
latents = self.prepare_latents(
|
||||
image,
|
||||
latent_timestep,
|
||||
batch_size,
|
||||
num_images_per_prompt,
|
||||
text_embeddings.dtype,
|
||||
self.device,
|
||||
generator,
|
||||
)
|
||||
|
||||
if clip_guidance_scale > 0:
|
||||
if clip_prompt is not None:
|
||||
|
||||
@@ -321,7 +321,12 @@ class ComposableStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin)
|
||||
)
|
||||
|
||||
def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
|
||||
shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
|
||||
shape = (
|
||||
batch_size,
|
||||
num_channels_latents,
|
||||
int(height) // self.vae_scale_factor,
|
||||
int(width) // self.vae_scale_factor,
|
||||
)
|
||||
if latents is None:
|
||||
if device.type == "mps":
|
||||
# randn does not work reproducibly on mps
|
||||
|
||||
@@ -500,7 +500,12 @@ class GlueGenStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin, Lo
|
||||
)
|
||||
|
||||
def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
|
||||
shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
|
||||
shape = (
|
||||
batch_size,
|
||||
num_channels_latents,
|
||||
int(height) // self.vae_scale_factor,
|
||||
int(width) // self.vae_scale_factor,
|
||||
)
|
||||
if isinstance(generator, list) and len(generator) != batch_size:
|
||||
raise ValueError(
|
||||
f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
|
||||
|
||||
994 examples/community/hd_painter.py Normal file
@@ -0,0 +1,994 @@
|
||||
import math
|
||||
import numbers
|
||||
from typing import Any, Callable, Dict, List, Optional, Union
|
||||
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
from torch import nn
|
||||
|
||||
from diffusers.image_processor import PipelineImageInput
|
||||
from diffusers.models import AsymmetricAutoencoderKL, ImageProjection
|
||||
from diffusers.models.attention_processor import Attention, AttnProcessor
|
||||
from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
|
||||
from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_inpaint import (
|
||||
StableDiffusionInpaintPipeline,
|
||||
retrieve_timesteps,
|
||||
)
|
||||
from diffusers.utils import deprecate
|
||||
|
||||
|
||||
class RASGAttnProcessor:
|
||||
def __init__(self, mask, token_idx, scale_factor):
|
||||
self.attention_scores = None # Stores the last output of the similarity matrix here. Each layer will get its own RASGAttnProcessor assigned
|
||||
self.mask = mask
|
||||
self.token_idx = token_idx
|
||||
self.scale_factor = scale_factor
|
||||
self.mask_resoltuion = mask.shape[-1] * mask.shape[-2] # 64 x 64 if the image is 512x512
|
||||
|
||||
def __call__(
|
||||
self,
|
||||
attn: Attention,
|
||||
hidden_states: torch.FloatTensor,
|
||||
encoder_hidden_states: Optional[torch.FloatTensor] = None,
|
||||
attention_mask: Optional[torch.FloatTensor] = None,
|
||||
temb: Optional[torch.FloatTensor] = None,
|
||||
scale: float = 1.0,
|
||||
) -> torch.Tensor:
|
||||
# Same as the default AttnProcessor up until the part where the similarity matrix gets saved
|
||||
downscale_factor = self.mask_resoltuion // hidden_states.shape[1]
|
||||
residual = hidden_states
|
||||
|
||||
if attn.spatial_norm is not None:
|
||||
hidden_states = attn.spatial_norm(hidden_states, temb)
|
||||
|
||||
input_ndim = hidden_states.ndim
|
||||
|
||||
if input_ndim == 4:
|
||||
batch_size, channel, height, width = hidden_states.shape
|
||||
hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
|
||||
|
||||
batch_size, sequence_length, _ = (
|
||||
hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
|
||||
)
|
||||
attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
|
||||
|
||||
if attn.group_norm is not None:
|
||||
hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
|
||||
|
||||
query = attn.to_q(hidden_states)
|
||||
|
||||
if encoder_hidden_states is None:
|
||||
encoder_hidden_states = hidden_states
|
||||
elif attn.norm_cross:
|
||||
encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
|
||||
|
||||
key = attn.to_k(encoder_hidden_states)
|
||||
value = attn.to_v(encoder_hidden_states)
|
||||
|
||||
query = attn.head_to_batch_dim(query)
|
||||
key = attn.head_to_batch_dim(key)
|
||||
value = attn.head_to_batch_dim(value)
|
||||
|
||||
# Automatically recognize the resolution and save the attention similarity values
|
||||
# We need to use the values before the softmax function, hence the rewritten get_attention_scores function.
|
||||
if downscale_factor == self.scale_factor**2:
|
||||
self.attention_scores = get_attention_scores(attn, query, key, attention_mask)
|
||||
attention_probs = self.attention_scores.softmax(dim=-1)
|
||||
attention_probs = attention_probs.to(query.dtype)
|
||||
else:
|
||||
attention_probs = attn.get_attention_scores(query, key, attention_mask) # Original code
|
||||
|
||||
hidden_states = torch.bmm(attention_probs, value)
|
||||
hidden_states = attn.batch_to_head_dim(hidden_states)
|
||||
|
||||
# linear proj
|
||||
hidden_states = attn.to_out[0](hidden_states)
|
||||
# dropout
|
||||
hidden_states = attn.to_out[1](hidden_states)
|
||||
|
||||
if input_ndim == 4:
|
||||
hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
|
||||
|
||||
if attn.residual_connection:
|
||||
hidden_states = hidden_states + residual
|
||||
|
||||
hidden_states = hidden_states / attn.rescale_output_factor
|
||||
|
||||
return hidden_states
|
||||
|
||||
|
||||
class PAIntAAttnProcessor:
|
||||
def __init__(self, transformer_block, mask, token_idx, do_classifier_free_guidance, scale_factors):
|
||||
self.transformer_block = transformer_block # Stores the parent transformer block.
|
||||
self.mask = mask
|
||||
self.scale_factors = scale_factors
|
||||
self.do_classifier_free_guidance = do_classifier_free_guidance
|
||||
self.token_idx = token_idx
|
||||
self.shape = mask.shape[2:]
|
||||
self.mask_resoltuion = mask.shape[-1] * mask.shape[-2] # 64 x 64
|
||||
self.default_processor = AttnProcessor()
|
||||
|
||||
def __call__(
|
||||
self,
|
||||
attn: Attention,
|
||||
hidden_states: torch.FloatTensor,
|
||||
encoder_hidden_states: Optional[torch.FloatTensor] = None,
|
||||
attention_mask: Optional[torch.FloatTensor] = None,
|
||||
temb: Optional[torch.FloatTensor] = None,
|
||||
scale: float = 1.0,
|
||||
) -> torch.Tensor:
|
||||
# Automatically recognize the resolution of the current attention layer and resize the masks accordingly
|
||||
downscale_factor = self.mask_resoltuion // hidden_states.shape[1]
|
||||
|
||||
mask = None
|
||||
for factor in self.scale_factors:
|
||||
if downscale_factor == factor**2:
|
||||
shape = (self.shape[0] // factor, self.shape[1] // factor)
|
||||
mask = F.interpolate(self.mask, shape, mode="bicubic") # B, 1, H, W
|
||||
break
|
||||
if mask is None:
|
||||
return self.default_processor(attn, hidden_states, encoder_hidden_states, attention_mask, temb, scale)
|
||||
|
||||
# STARTS HERE
|
||||
residual = hidden_states
|
||||
# Save the input hidden_states for later use
|
||||
input_hidden_states = hidden_states
|
||||
|
||||
# ================================================== #
|
||||
# =============== SELF ATTENTION 1 ================= #
|
||||
# ================================================== #
|
||||
|
||||
if attn.spatial_norm is not None:
|
||||
hidden_states = attn.spatial_norm(hidden_states, temb)
|
||||
|
||||
input_ndim = hidden_states.ndim
|
||||
|
||||
if input_ndim == 4:
|
||||
batch_size, channel, height, width = hidden_states.shape
|
||||
hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
|
||||
|
||||
batch_size, sequence_length, _ = (
|
||||
hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
|
||||
)
|
||||
attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
|
||||
|
||||
if attn.group_norm is not None:
|
||||
hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
|
||||
|
||||
query = attn.to_q(hidden_states)
|
||||
|
||||
if encoder_hidden_states is None:
|
||||
encoder_hidden_states = hidden_states
|
||||
elif attn.norm_cross:
|
||||
encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
|
||||
|
||||
key = attn.to_k(encoder_hidden_states)
|
||||
value = attn.to_v(encoder_hidden_states)
|
||||
|
||||
query = attn.head_to_batch_dim(query)
|
||||
key = attn.head_to_batch_dim(key)
|
||||
value = attn.head_to_batch_dim(value)
|
||||
|
||||
# self_attention_probs = attn.get_attention_scores(query, key, attention_mask) # We can't use post-softmax attention scores in this case
|
||||
self_attention_scores = get_attention_scores(
|
||||
attn, query, key, attention_mask
|
||||
) # The custom function returns pre-softmax probabilities
|
||||
self_attention_probs = self_attention_scores.softmax(
|
||||
dim=-1
|
||||
) # Manually compute the probabilities here, the scores will be reused in the second part of PAIntA
|
||||
self_attention_probs = self_attention_probs.to(query.dtype)
|
||||
|
||||
hidden_states = torch.bmm(self_attention_probs, value)
|
||||
hidden_states = attn.batch_to_head_dim(hidden_states)
|
||||
|
||||
# linear proj
|
||||
hidden_states = attn.to_out[0](hidden_states)
|
||||
# dropout
|
||||
hidden_states = attn.to_out[1](hidden_states)
|
||||
|
||||
# x = x + self.attn1(self.norm1(x))
|
||||
|
||||
if input_ndim == 4:
|
||||
hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
|
||||
|
||||
if attn.residual_connection: # So many residuals everywhere
|
||||
hidden_states = hidden_states + residual
|
||||
|
||||
self_attention_output_hidden_states = hidden_states / attn.rescale_output_factor
|
||||
|
||||
# ================================================== #
|
||||
# ============ BasicTransformerBlock =============== #
|
||||
# ================================================== #
|
||||
# We use a hack by running the code from the BasicTransformerBlock that is between Self and Cross attentions here
|
||||
# The other option would've been modifying the BasicTransformerBlock and adding this functionality here.
|
||||
# I assumed that changing the BasicTransformerBlock would have been a bigger deal and decided to use this hack instead.
|
||||
|
||||
# The SelfAttention block receives the normalized latents from the BasicTransformerBlock,
|
||||
# But the residual of the output is the non-normalized version.
|
||||
# Therefore we unnormalize the input hidden state here
|
||||
unnormalized_input_hidden_states = (
|
||||
input_hidden_states + self.transformer_block.norm1.bias
|
||||
) * self.transformer_block.norm1.weight
|
||||
|
||||
# TODO: return if necessary
|
||||
# if self.use_ada_layer_norm_zero:
|
||||
# attn_output = gate_msa.unsqueeze(1) * attn_output
|
||||
# elif self.use_ada_layer_norm_single:
|
||||
# attn_output = gate_msa * attn_output
|
||||
|
||||
transformer_hidden_states = self_attention_output_hidden_states + unnormalized_input_hidden_states
|
||||
if transformer_hidden_states.ndim == 4:
|
||||
transformer_hidden_states = transformer_hidden_states.squeeze(1)
|
||||
|
||||
# TODO: return if necessary
|
||||
# 2.5 GLIGEN Control
|
||||
# if gligen_kwargs is not None:
|
||||
# transformer_hidden_states = self.fuser(transformer_hidden_states, gligen_kwargs["objs"])
|
||||
# NOTE: we experimented with using GLIGEN and HDPainter together, the results were not that great
|
||||
|
||||
# 3. Cross-Attention
|
||||
if self.transformer_block.use_ada_layer_norm:
|
||||
# transformer_norm_hidden_states = self.transformer_block.norm2(transformer_hidden_states, timestep)
|
||||
raise NotImplementedError()
|
||||
elif self.transformer_block.use_ada_layer_norm_zero or self.transformer_block.use_layer_norm:
|
||||
transformer_norm_hidden_states = self.transformer_block.norm2(transformer_hidden_states)
|
||||
elif self.transformer_block.use_ada_layer_norm_single:
|
||||
# For PixArt norm2 isn't applied here:
|
||||
# https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L70C1-L76C103
|
||||
transformer_norm_hidden_states = transformer_hidden_states
|
||||
elif self.transformer_block.use_ada_layer_norm_continuous:
|
||||
# transformer_norm_hidden_states = self.transformer_block.norm2(transformer_hidden_states, added_cond_kwargs["pooled_text_emb"])
|
||||
raise NotImplementedError()
|
||||
else:
|
||||
raise ValueError("Incorrect norm")
|
||||
|
||||
if self.transformer_block.pos_embed is not None and self.transformer_block.use_ada_layer_norm_single is False:
|
||||
transformer_norm_hidden_states = self.transformer_block.pos_embed(transformer_norm_hidden_states)
|
||||
|
||||
# ================================================== #
|
||||
# ================= CROSS ATTENTION ================ #
|
||||
# ================================================== #
|
||||
|
||||
# We do an initial pass of the CrossAttention up to obtaining the similarity matrix here.
|
||||
# The similarity matrix is used to obtain scaling coefficients for the attention matrix of the self attention
|
||||
# We reuse the previously computed self-attention matrix, and only repeat the steps after the softmax
|
||||
|
||||
cross_attention_input_hidden_states = (
|
||||
transformer_norm_hidden_states # Renaming the variable for the sake of readability
|
||||
)
|
||||
|
||||
# TODO: check if classifier_free_guidance is being used before splitting here
|
||||
if self.do_classifier_free_guidance:
|
||||
# Our scaling coefficients depend only on the conditional part, so we split the inputs
|
||||
(
|
||||
_cross_attention_input_hidden_states_unconditional,
|
||||
cross_attention_input_hidden_states_conditional,
|
||||
) = cross_attention_input_hidden_states.chunk(2)
|
||||
|
||||
# Same split for the encoder_hidden_states i.e. the tokens
|
||||
# Since the SelfAttention processors don't get the encoder states as input, we inject them into the processor in the beginning.
|
||||
_encoder_hidden_states_unconditional, encoder_hidden_states_conditional = self.encoder_hidden_states.chunk(
|
||||
2
|
||||
)
|
||||
else:
|
||||
cross_attention_input_hidden_states_conditional = cross_attention_input_hidden_states
|
||||
encoder_hidden_states_conditional = self.encoder_hidden_states.chunk(2)
|
||||
|
||||
# Rename the variables for the sake of readability
|
||||
# The part below is the beginning of the __call__ function of the following CrossAttention layer
|
||||
cross_attention_hidden_states = cross_attention_input_hidden_states_conditional
|
||||
cross_attention_encoder_hidden_states = encoder_hidden_states_conditional
|
||||
|
||||
attn2 = self.transformer_block.attn2
|
||||
|
||||
if attn2.spatial_norm is not None:
|
||||
cross_attention_hidden_states = attn2.spatial_norm(cross_attention_hidden_states, temb)
|
||||
|
||||
input_ndim = cross_attention_hidden_states.ndim
|
||||
|
||||
if input_ndim == 4:
|
||||
batch_size, channel, height, width = cross_attention_hidden_states.shape
|
||||
cross_attention_hidden_states = cross_attention_hidden_states.view(
|
||||
batch_size, channel, height * width
|
||||
).transpose(1, 2)
|
||||
|
||||
(
|
||||
batch_size,
|
||||
sequence_length,
|
||||
_,
|
||||
) = cross_attention_hidden_states.shape # It is definitely a cross attention, so no need for an if block
|
||||
# TODO: change the attention_mask here
|
||||
attention_mask = attn2.prepare_attention_mask(
|
||||
None, sequence_length, batch_size
|
||||
) # I assume the attention mask is the same...
|
||||
|
||||
if attn2.group_norm is not None:
|
||||
cross_attention_hidden_states = attn2.group_norm(cross_attention_hidden_states.transpose(1, 2)).transpose(
|
||||
1, 2
|
||||
)
|
||||
|
||||
query2 = attn2.to_q(cross_attention_hidden_states)
|
||||
|
||||
if attn2.norm_cross:
|
||||
cross_attention_encoder_hidden_states = attn2.norm_encoder_hidden_states(
|
||||
cross_attention_encoder_hidden_states
|
||||
)
|
||||
|
||||
key2 = attn2.to_k(cross_attention_encoder_hidden_states)
|
||||
query2 = attn2.head_to_batch_dim(query2)
|
||||
key2 = attn2.head_to_batch_dim(key2)
|
||||
|
||||
cross_attention_probs = attn2.get_attention_scores(query2, key2, attention_mask)
|
||||
|
||||
# CrossAttention ends here, the remaining part is not used
|
||||
|
||||
# ================================================== #
|
||||
# ================ SELF ATTENTION 2 ================ #
|
||||
# ================================================== #
|
||||
# DEJA VU!
|
||||
|
||||
mask = (mask > 0.5).to(self_attention_output_hidden_states.dtype)
|
||||
m = mask.to(self_attention_output_hidden_states.device)
|
||||
# m = rearrange(m, 'b c h w -> b (h w) c').contiguous()
|
||||
m = m.permute(0, 2, 3, 1).reshape((m.shape[0], -1, m.shape[1])).contiguous() # B HW 1
|
||||
m = torch.matmul(m, m.permute(0, 2, 1)) + (1 - m)
|
||||
|
||||
# # Compute scaling coefficients for the similarity matrix
|
||||
# # Select the cross attention values for the correct tokens only!
|
||||
# cross_attention_probs = cross_attention_probs.mean(dim = 0)
|
||||
# cross_attention_probs = cross_attention_probs[:, self.token_idx].sum(dim=1)
|
||||
|
||||
# cross_attention_probs = cross_attention_probs.reshape(shape)
|
||||
# gaussian_smoothing = GaussianSmoothing(channels=1, kernel_size=3, sigma=0.5, dim=2).to(self_attention_output_hidden_states.device)
|
||||
# cross_attention_probs = gaussian_smoothing(cross_attention_probs.unsqueeze(0))[0] # optional smoothing
|
||||
# cross_attention_probs = cross_attention_probs.reshape(-1)
|
||||
# cross_attention_probs = ((cross_attention_probs - torch.median(cross_attention_probs.ravel())) / torch.max(cross_attention_probs.ravel())).clip(0, 1)
|
||||
|
||||
# c = (1 - m) * cross_attention_probs.reshape(1, 1, -1) + m # PAIntA scaling coefficients
|
||||
|
||||
# Compute scaling coefficients for the similarity matrix
|
||||
# Select the cross attention values for the correct tokens only!
|
||||
|
||||
batch_size, dims, channels = cross_attention_probs.shape
|
||||
batch_size = batch_size // attn.heads
|
||||
cross_attention_probs = cross_attention_probs.reshape((batch_size, attn.heads, dims, channels)) # B, D, HW, T
|
||||
|
||||
cross_attention_probs = cross_attention_probs.mean(dim=1) # B, HW, T
|
||||
cross_attention_probs = cross_attention_probs[..., self.token_idx].sum(dim=-1) # B, HW
|
||||
cross_attention_probs = cross_attention_probs.reshape((batch_size,) + shape) # , B, H, W
|
||||
|
||||
gaussian_smoothing = GaussianSmoothing(channels=1, kernel_size=3, sigma=0.5, dim=2).to(
|
||||
self_attention_output_hidden_states.device
|
||||
)
|
||||
cross_attention_probs = gaussian_smoothing(cross_attention_probs[:, None])[:, 0] # optional smoothing B, H, W
|
||||
|
||||
# Median normalization
|
||||
cross_attention_probs = cross_attention_probs.reshape(batch_size, -1) # B, HW
|
||||
cross_attention_probs = (
|
||||
cross_attention_probs - cross_attention_probs.median(dim=-1, keepdim=True).values
|
||||
) / cross_attention_probs.max(dim=-1, keepdim=True).values
|
||||
cross_attention_probs = cross_attention_probs.clip(0, 1)
|
||||
|
||||
c = (1 - m) * cross_attention_probs.reshape(batch_size, 1, -1) + m
|
||||
c = c.repeat_interleave(attn.heads, 0) # BD, HW
|
||||
if self.do_classifier_free_guidance:
|
||||
c = torch.cat([c, c]) # 2BD, HW
|
||||
|
||||
# Rescaling the original self-attention matrix
|
||||
self_attention_scores_rescaled = self_attention_scores * c
|
||||
self_attention_probs_rescaled = self_attention_scores_rescaled.softmax(dim=-1)
|
||||
|
||||
# Continuing the self attention normally using the new matrix
|
||||
hidden_states = torch.bmm(self_attention_probs_rescaled, value)
|
||||
hidden_states = attn.batch_to_head_dim(hidden_states)
|
||||
|
||||
# linear proj
|
||||
hidden_states = attn.to_out[0](hidden_states)
|
||||
# dropout
|
||||
hidden_states = attn.to_out[1](hidden_states)
|
||||
|
||||
if input_ndim == 4:
|
||||
hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
|
||||
|
||||
if attn.residual_connection:
|
||||
hidden_states = hidden_states + input_hidden_states
|
||||
|
||||
hidden_states = hidden_states / attn.rescale_output_factor
|
||||
|
||||
return hidden_states
|
||||
|
||||
|
||||
class StableDiffusionHDPainterPipeline(StableDiffusionInpaintPipeline):
|
||||
def get_tokenized_prompt(self, prompt):
|
||||
out = self.tokenizer(prompt)
|
||||
return [self.tokenizer.decode(x) for x in out["input_ids"]]
|
||||
|
||||
def init_attn_processors(
|
||||
self,
|
||||
mask,
|
||||
token_idx,
|
||||
use_painta=True,
|
||||
use_rasg=True,
|
||||
painta_scale_factors=[2, 4], # 64x64 -> [16x16, 32x32]
|
||||
rasg_scale_factor=4, # 64x64 -> 16x16
|
||||
self_attention_layer_name="attn1",
|
||||
cross_attention_layer_name="attn2",
|
||||
list_of_painta_layer_names=None,
|
||||
list_of_rasg_layer_names=None,
|
||||
):
|
||||
default_processor = AttnProcessor()
|
||||
width, height = mask.shape[-2:]
|
||||
width, height = width // self.vae_scale_factor, height // self.vae_scale_factor
|
||||
|
||||
painta_scale_factors = [x * self.vae_scale_factor for x in painta_scale_factors]
|
||||
rasg_scale_factor = self.vae_scale_factor * rasg_scale_factor
|
||||
|
||||
attn_processors = {}
|
||||
for x in self.unet.attn_processors:
|
||||
if (list_of_painta_layer_names is None and self_attention_layer_name in x) or (
|
||||
list_of_painta_layer_names is not None and x in list_of_painta_layer_names
|
||||
):
|
||||
if use_painta:
|
||||
transformer_block = self.unet.get_submodule(x.replace(".attn1.processor", ""))
|
||||
attn_processors[x] = PAIntAAttnProcessor(
|
||||
transformer_block, mask, token_idx, self.do_classifier_free_guidance, painta_scale_factors
|
||||
)
|
||||
else:
|
||||
attn_processors[x] = default_processor
|
||||
elif (list_of_rasg_layer_names is None and cross_attention_layer_name in x) or (
|
||||
list_of_rasg_layer_names is not None and x in list_of_rasg_layer_names
|
||||
):
|
||||
if use_rasg:
|
||||
attn_processors[x] = RASGAttnProcessor(mask, token_idx, rasg_scale_factor)
|
||||
else:
|
||||
attn_processors[x] = default_processor
|
||||
|
||||
self.unet.set_attn_processor(attn_processors)
|
||||
# import json
|
||||
# with open('/home/hayk.manukyan/repos/diffusers/debug.txt', 'a') as f:
|
||||
# json.dump({x:str(y) for x,y in self.unet.attn_processors.items()}, f, indent=4)
|
||||
|
||||
@torch.no_grad()
|
||||
def __call__(
|
||||
self,
|
||||
prompt: Union[str, List[str]] = None,
|
||||
image: PipelineImageInput = None,
|
||||
mask_image: PipelineImageInput = None,
|
||||
masked_image_latents: torch.FloatTensor = None,
|
||||
height: Optional[int] = None,
|
||||
width: Optional[int] = None,
|
||||
padding_mask_crop: Optional[int] = None,
|
||||
strength: float = 1.0,
|
||||
num_inference_steps: int = 50,
|
||||
timesteps: List[int] = None,
|
||||
guidance_scale: float = 7.5,
|
||||
positive_prompt: Optional[str] = "",
|
||||
negative_prompt: Optional[Union[str, List[str]]] = None,
|
||||
num_images_per_prompt: Optional[int] = 1,
|
||||
eta: float = 0.01,
|
||||
generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
|
||||
latents: Optional[torch.FloatTensor] = None,
|
||||
prompt_embeds: Optional[torch.FloatTensor] = None,
|
||||
negative_prompt_embeds: Optional[torch.FloatTensor] = None,
|
||||
ip_adapter_image: Optional[PipelineImageInput] = None,
|
||||
output_type: Optional[str] = "pil",
|
||||
return_dict: bool = True,
|
||||
cross_attention_kwargs: Optional[Dict[str, Any]] = None,
|
||||
clip_skip: int = None,
|
||||
callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
|
||||
callback_on_step_end_tensor_inputs: List[str] = ["latents"],
|
||||
use_painta=True,
|
||||
use_rasg=True,
|
||||
self_attention_layer_name=".attn1",
|
||||
cross_attention_layer_name=".attn2",
|
||||
painta_scale_factors=[2, 4], # 16 x 16 and 32 x 32
|
||||
rasg_scale_factor=4, # 16x16 by default
|
||||
list_of_painta_layer_names=None,
|
||||
list_of_rasg_layer_names=None,
|
||||
**kwargs,
|
||||
):
|
||||
callback = kwargs.pop("callback", None)
|
||||
callback_steps = kwargs.pop("callback_steps", None)
|
||||
|
||||
if callback is not None:
|
||||
deprecate(
|
||||
"callback",
|
||||
"1.0.0",
|
||||
"Passing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
|
||||
)
|
||||
if callback_steps is not None:
|
||||
deprecate(
|
||||
"callback_steps",
|
||||
"1.0.0",
|
||||
"Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
|
||||
)
|
||||
|
||||
# 0. Default height and width to unet
|
||||
height = height or self.unet.config.sample_size * self.vae_scale_factor
|
||||
width = width or self.unet.config.sample_size * self.vae_scale_factor
|
||||
|
||||
#
|
||||
prompt_no_positives = prompt
|
||||
if isinstance(prompt, list):
|
||||
prompt = [x + positive_prompt for x in prompt]
|
||||
else:
|
||||
prompt = prompt + positive_prompt
|
||||
|
||||
# 1. Check inputs
|
||||
self.check_inputs(
|
||||
prompt,
|
||||
image,
|
||||
mask_image,
|
||||
height,
|
||||
width,
|
||||
strength,
|
||||
callback_steps,
|
||||
negative_prompt,
|
||||
prompt_embeds,
|
||||
negative_prompt_embeds,
|
||||
callback_on_step_end_tensor_inputs,
|
||||
padding_mask_crop,
|
||||
)
|
||||
|
||||
self._guidance_scale = guidance_scale
|
||||
self._clip_skip = clip_skip
|
||||
self._cross_attention_kwargs = cross_attention_kwargs
|
||||
self._interrupt = False
|
||||
|
||||
# 2. Define call parameters
|
||||
if prompt is not None and isinstance(prompt, str):
|
||||
batch_size = 1
|
||||
elif prompt is not None and isinstance(prompt, list):
|
||||
batch_size = len(prompt)
|
||||
else:
|
||||
batch_size = prompt_embeds.shape[0]
|
||||
|
||||
# assert batch_size == 1, "Does not work with batch size > 1 currently"
|
||||
|
||||
device = self._execution_device
|
||||
|
||||
# 3. Encode input prompt
|
||||
text_encoder_lora_scale = (
|
||||
cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None
|
||||
)
|
||||
prompt_embeds, negative_prompt_embeds = self.encode_prompt(
|
||||
prompt,
|
||||
device,
|
||||
num_images_per_prompt,
|
||||
self.do_classifier_free_guidance,
|
||||
negative_prompt,
|
||||
prompt_embeds=prompt_embeds,
|
||||
negative_prompt_embeds=negative_prompt_embeds,
|
||||
lora_scale=text_encoder_lora_scale,
|
||||
clip_skip=self.clip_skip,
|
||||
)
|
||||
# For classifier free guidance, we need to do two forward passes.
|
||||
# Here we concatenate the unconditional and text embeddings into a single batch
|
||||
# to avoid doing two forward passes
|
||||
if self.do_classifier_free_guidance:
|
||||
prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
|
||||
|
||||
if ip_adapter_image is not None:
|
||||
output_hidden_state = False if isinstance(self.unet.encoder_hid_proj, ImageProjection) else True
|
||||
image_embeds, negative_image_embeds = self.encode_image(
|
||||
ip_adapter_image, device, num_images_per_prompt, output_hidden_state
|
||||
)
|
||||
if self.do_classifier_free_guidance:
|
||||
image_embeds = torch.cat([negative_image_embeds, image_embeds])
|
||||
|
||||
# 4. set timesteps
|
||||
timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
|
||||
timesteps, num_inference_steps = self.get_timesteps(
|
||||
num_inference_steps=num_inference_steps, strength=strength, device=device
|
||||
)
|
||||
# check that number of inference steps is not < 1 - as this doesn't make sense
|
||||
if num_inference_steps < 1:
|
||||
raise ValueError(
|
||||
f"After adjusting the num_inference_steps by strength parameter: {strength}, the number of pipeline"
|
||||
f"steps is {num_inference_steps} which is < 1 and not appropriate for this pipeline."
|
||||
)
|
||||
# at which timestep to set the initial noise (n.b. 50% if strength is 0.5)
|
||||
latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
|
||||
# create a boolean to check if the strength is set to 1. if so then initialise the latents with pure noise
|
||||
is_strength_max = strength == 1.0
|
||||
|
||||
# 5. Preprocess mask and image
|
||||
|
||||
if padding_mask_crop is not None:
|
||||
crops_coords = self.mask_processor.get_crop_region(mask_image, width, height, pad=padding_mask_crop)
|
||||
resize_mode = "fill"
|
||||
else:
|
||||
crops_coords = None
|
||||
resize_mode = "default"
|
||||
|
||||
original_image = image
|
||||
init_image = self.image_processor.preprocess(
|
||||
image, height=height, width=width, crops_coords=crops_coords, resize_mode=resize_mode
|
||||
)
|
||||
init_image = init_image.to(dtype=torch.float32)
|
||||
|
||||
# 6. Prepare latent variables
|
||||
num_channels_latents = self.vae.config.latent_channels
|
||||
num_channels_unet = self.unet.config.in_channels
|
||||
return_image_latents = num_channels_unet == 4
|
||||
|
||||
latents_outputs = self.prepare_latents(
|
||||
batch_size * num_images_per_prompt,
|
||||
num_channels_latents,
|
||||
height,
|
||||
width,
|
||||
prompt_embeds.dtype,
|
||||
device,
|
||||
generator,
|
||||
latents,
|
||||
image=init_image,
|
||||
timestep=latent_timestep,
|
||||
is_strength_max=is_strength_max,
|
||||
return_noise=True,
|
||||
return_image_latents=return_image_latents,
|
||||
)
|
||||
|
||||
if return_image_latents:
|
||||
latents, noise, image_latents = latents_outputs
|
||||
else:
|
||||
latents, noise = latents_outputs
|
||||
|
||||
# 7. Prepare mask latent variables
|
||||
mask_condition = self.mask_processor.preprocess(
|
||||
mask_image, height=height, width=width, resize_mode=resize_mode, crops_coords=crops_coords
|
||||
)
|
||||
|
||||
if masked_image_latents is None:
|
||||
masked_image = init_image * (mask_condition < 0.5)
|
||||
else:
|
||||
masked_image = masked_image_latents
|
||||
|
||||
mask, masked_image_latents = self.prepare_mask_latents(
|
||||
mask_condition,
|
||||
masked_image,
|
||||
batch_size * num_images_per_prompt,
|
||||
height,
|
||||
width,
|
||||
prompt_embeds.dtype,
|
||||
device,
|
||||
generator,
|
||||
self.do_classifier_free_guidance,
|
||||
)
|
||||
|
||||
# 7.5 Setting up HD-Painter
|
||||
|
||||
# Get the indices of the tokens to be modified by both RASG and PAIntA
|
||||
token_idx = list(range(1, self.get_tokenized_prompt(prompt_no_positives).index("<|endoftext|>"))) + [
|
||||
self.get_tokenized_prompt(prompt).index("<|endoftext|>")
|
||||
]
|
||||
|
||||
# Setting up the attention processors
|
||||
self.init_attn_processors(
|
||||
mask_condition,
|
||||
token_idx,
|
||||
use_painta,
|
||||
use_rasg,
|
||||
painta_scale_factors=painta_scale_factors,
|
||||
rasg_scale_factor=rasg_scale_factor,
|
||||
self_attention_layer_name=self_attention_layer_name,
|
||||
cross_attention_layer_name=cross_attention_layer_name,
|
||||
list_of_painta_layer_names=list_of_painta_layer_names,
|
||||
list_of_rasg_layer_names=list_of_rasg_layer_names,
|
||||
)
|
||||
|
||||
# 8. Check that sizes of mask, masked image and latents match
|
||||
if num_channels_unet == 9:
|
||||
# default case for runwayml/stable-diffusion-inpainting
|
||||
num_channels_mask = mask.shape[1]
|
||||
num_channels_masked_image = masked_image_latents.shape[1]
|
||||
if num_channels_latents + num_channels_mask + num_channels_masked_image != self.unet.config.in_channels:
|
||||
raise ValueError(
|
||||
f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects"
|
||||
f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +"
|
||||
f" `num_channels_mask`: {num_channels_mask} + `num_channels_masked_image`: {num_channels_masked_image}"
|
||||
f" = {num_channels_latents+num_channels_masked_image+num_channels_mask}. Please verify the config of"
|
||||
" `pipeline.unet` or your `mask_image` or `image` input."
|
||||
)
|
||||
elif num_channels_unet != 4:
|
||||
raise ValueError(
|
||||
f"The unet {self.unet.__class__} should have either 4 or 9 input channels, not {self.unet.config.in_channels}."
|
||||
)
|
||||
|
||||
# 9. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
|
||||
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
|
||||
|
||||
if use_rasg:
|
||||
extra_step_kwargs["generator"] = None
|
||||
|
||||
# 9.1 Add image embeds for IP-Adapter
|
||||
added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None
|
||||
|
||||
# 9.2 Optionally get Guidance Scale Embedding
|
||||
timestep_cond = None
|
||||
if self.unet.config.time_cond_proj_dim is not None:
|
||||
guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt)
|
||||
timestep_cond = self.get_guidance_scale_embedding(
|
||||
guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
|
||||
).to(device=device, dtype=latents.dtype)
|
||||
|
||||
# 10. Denoising loop
|
||||
num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
|
||||
self._num_timesteps = len(timesteps)
|
||||
painta_active = True
|
||||
|
||||
with self.progress_bar(total=num_inference_steps) as progress_bar:
|
||||
for i, t in enumerate(timesteps):
|
||||
if self.interrupt:
|
||||
continue
|
||||
|
||||
if t < 500 and painta_active:
|
||||
self.init_attn_processors(
|
||||
mask_condition,
|
||||
token_idx,
|
||||
False,
|
||||
use_rasg,
|
||||
painta_scale_factors=painta_scale_factors,
|
||||
rasg_scale_factor=rasg_scale_factor,
|
||||
self_attention_layer_name=self_attention_layer_name,
|
||||
cross_attention_layer_name=cross_attention_layer_name,
|
||||
list_of_painta_layer_names=list_of_painta_layer_names,
|
||||
list_of_rasg_layer_names=list_of_rasg_layer_names,
|
||||
)
|
||||
painta_active = False
|
||||
|
||||
with torch.enable_grad():
|
||||
self.unet.zero_grad()
|
||||
latents = latents.detach()
|
||||
latents.requires_grad = True
|
||||
|
||||
# expand the latents if we are doing classifier free guidance
|
||||
latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
|
||||
|
||||
# concat latents, mask, masked_image_latents in the channel dimension
|
||||
latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
|
||||
|
||||
if num_channels_unet == 9:
|
||||
latent_model_input = torch.cat([latent_model_input, mask, masked_image_latents], dim=1)
|
||||
|
||||
self.scheduler.latents = latents
|
||||
self.encoder_hidden_states = prompt_embeds
|
||||
for attn_processor in self.unet.attn_processors.values():
|
||||
attn_processor.encoder_hidden_states = prompt_embeds
|
||||
|
||||
# predict the noise residual
|
||||
noise_pred = self.unet(
|
||||
latent_model_input,
|
||||
t,
|
||||
encoder_hidden_states=prompt_embeds,
|
||||
timestep_cond=timestep_cond,
|
||||
cross_attention_kwargs=self.cross_attention_kwargs,
|
||||
added_cond_kwargs=added_cond_kwargs,
|
||||
return_dict=False,
|
||||
)[0]
|
||||
|
||||
# perform guidance
|
||||
if self.do_classifier_free_guidance:
|
||||
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
|
||||
noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
|
||||
|
||||
if use_rasg:
|
||||
# Perform RASG
|
||||
_, _, height, width = mask_condition.shape # 512 x 512
|
||||
scale_factor = self.vae_scale_factor * rasg_scale_factor # 8 * 4 = 32
|
||||
|
||||
# TODO: Fix for > 1 batch_size
|
||||
rasg_mask = F.interpolate(
|
||||
mask_condition, (height // scale_factor, width // scale_factor), mode="bicubic"
|
||||
                        )[0, 0]  # index the single batch and channel of the (B, C, H, W) mask -> (H, W)
|
||||
|
||||
# Aggregate the saved attention maps
|
||||
attn_map = []
|
||||
for processor in self.unet.attn_processors.values():
|
||||
if hasattr(processor, "attention_scores") and processor.attention_scores is not None:
|
||||
if self.do_classifier_free_guidance:
|
||||
attn_map.append(processor.attention_scores.chunk(2)[1]) # (B/2) x H, 256, 77
|
||||
else:
|
||||
attn_map.append(processor.attention_scores) # B x H, 256, 77 ?
|
||||
|
||||
attn_map = (
|
||||
torch.cat(attn_map)
|
||||
.mean(0)
|
||||
.permute(1, 0)
|
||||
.reshape((-1, height // scale_factor, width // scale_factor))
|
||||
) # 77, 16, 16
|
||||
|
||||
# Compute the attention score
|
||||
attn_score = -sum(
|
||||
[
|
||||
F.binary_cross_entropy_with_logits(x - 1.0, rasg_mask.to(device))
|
||||
for x in attn_map[token_idx]
|
||||
]
|
||||
)
|
||||
|
||||
# Backward the score and compute the gradients
|
||||
attn_score.backward()
|
||||
|
||||
                        # Normalize the gradients and compute the noise component
|
||||
variance_noise = latents.grad.detach()
|
||||
# print("VARIANCE SHAPE", variance_noise.shape)
|
||||
variance_noise -= torch.mean(variance_noise, [1, 2, 3], keepdim=True)
|
||||
variance_noise /= torch.std(variance_noise, [1, 2, 3], keepdim=True)
|
||||
else:
|
||||
variance_noise = None
|
||||
|
||||
# compute the previous noisy sample x_t -> x_t-1
|
||||
latents = self.scheduler.step(
|
||||
noise_pred, t, latents, **extra_step_kwargs, return_dict=False, variance_noise=variance_noise
|
||||
)[0]
|
||||
|
||||
if num_channels_unet == 4:
|
||||
init_latents_proper = image_latents
|
||||
if self.do_classifier_free_guidance:
|
||||
init_mask, _ = mask.chunk(2)
|
||||
else:
|
||||
init_mask = mask
|
||||
|
||||
if i < len(timesteps) - 1:
|
||||
noise_timestep = timesteps[i + 1]
|
||||
init_latents_proper = self.scheduler.add_noise(
|
||||
init_latents_proper, noise, torch.tensor([noise_timestep])
|
||||
)
|
||||
|
||||
latents = (1 - init_mask) * init_latents_proper + init_mask * latents
|
||||
|
||||
if callback_on_step_end is not None:
|
||||
callback_kwargs = {}
|
||||
for k in callback_on_step_end_tensor_inputs:
|
||||
callback_kwargs[k] = locals()[k]
|
||||
callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
|
||||
|
||||
latents = callback_outputs.pop("latents", latents)
|
||||
prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
|
||||
negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
|
||||
mask = callback_outputs.pop("mask", mask)
|
||||
masked_image_latents = callback_outputs.pop("masked_image_latents", masked_image_latents)
|
||||
|
||||
# call the callback, if provided
|
||||
if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
|
||||
progress_bar.update()
|
||||
if callback is not None and i % callback_steps == 0:
|
||||
step_idx = i // getattr(self.scheduler, "order", 1)
|
||||
callback(step_idx, t, latents)
|
||||
|
||||
if not output_type == "latent":
|
||||
condition_kwargs = {}
|
||||
if isinstance(self.vae, AsymmetricAutoencoderKL):
|
||||
init_image = init_image.to(device=device, dtype=masked_image_latents.dtype)
|
||||
init_image_condition = init_image.clone()
|
||||
init_image = self._encode_vae_image(init_image, generator=generator)
|
||||
mask_condition = mask_condition.to(device=device, dtype=masked_image_latents.dtype)
|
||||
condition_kwargs = {"image": init_image_condition, "mask": mask_condition}
|
||||
image = self.vae.decode(
|
||||
latents / self.vae.config.scaling_factor, return_dict=False, generator=generator, **condition_kwargs
|
||||
)[0]
|
||||
image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
|
||||
else:
|
||||
image = latents
|
||||
has_nsfw_concept = None
|
||||
|
||||
if has_nsfw_concept is None:
|
||||
do_denormalize = [True] * image.shape[0]
|
||||
else:
|
||||
do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
|
||||
|
||||
image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
|
||||
|
||||
if padding_mask_crop is not None:
|
||||
image = [self.image_processor.apply_overlay(mask_image, original_image, i, crops_coords) for i in image]
|
||||
|
||||
# Offload all models
|
||||
self.maybe_free_model_hooks()
|
||||
|
||||
if not return_dict:
|
||||
return (image, has_nsfw_concept)
|
||||
|
||||
return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
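# Illustrative sketch (standalone, not part of the pipeline above): how the RASG branch in the
# denoising loop turns the gradient of the attention score into the `variance_noise` handed to
# `scheduler.step`. The shape below is a placeholder for the latent gradient [B, 4, h, w].
import torch


def standardize_gradient_as_noise(grad: torch.Tensor) -> torch.Tensor:
    # zero-mean, unit-std per sample over the channel/spatial dims, matching the loop above
    noise = grad.detach().clone()
    noise = noise - noise.mean(dim=[1, 2, 3], keepdim=True)
    noise = noise / noise.std(dim=[1, 2, 3], keepdim=True)
    return noise


example_grad = torch.randn(1, 4, 64, 64)  # stand-in for `latents.grad`
example_noise = standardize_gradient_as_noise(example_grad)
print(example_noise.mean().item(), example_noise.std().item())  # ~0.0 and ~1.0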
# ============= Utility Functions ============== #
|
||||
|
||||
|
||||
class GaussianSmoothing(nn.Module):
    """
    Apply Gaussian smoothing on a 1d, 2d or 3d tensor. Filtering is performed separately for each channel
    in the input using a depthwise convolution.

    Arguments:
        channels (int, sequence): Number of channels of the input tensors. Output will
            have this number of channels as well.
        kernel_size (int, sequence): Size of the Gaussian kernel.
        sigma (float, sequence): Standard deviation of the Gaussian kernel.
        dim (int, optional): The number of dimensions of the data.
            Default value is 2 (spatial).
    """
|
||||
|
||||
def __init__(self, channels, kernel_size, sigma, dim=2):
|
||||
super(GaussianSmoothing, self).__init__()
|
||||
if isinstance(kernel_size, numbers.Number):
|
||||
kernel_size = [kernel_size] * dim
|
||||
if isinstance(sigma, numbers.Number):
|
||||
sigma = [sigma] * dim
|
||||
|
||||
# The gaussian kernel is the product of the
|
||||
# gaussian function of each dimension.
|
||||
kernel = 1
|
||||
meshgrids = torch.meshgrid([torch.arange(size, dtype=torch.float32) for size in kernel_size])
|
||||
for size, std, mgrid in zip(kernel_size, sigma, meshgrids):
|
||||
mean = (size - 1) / 2
|
||||
kernel *= 1 / (std * math.sqrt(2 * math.pi)) * torch.exp(-(((mgrid - mean) / (2 * std)) ** 2))
|
||||
|
||||
# Make sure sum of values in gaussian kernel equals 1.
|
||||
kernel = kernel / torch.sum(kernel)
|
||||
|
||||
# Reshape to depthwise convolutional weight
|
||||
kernel = kernel.view(1, 1, *kernel.size())
|
||||
kernel = kernel.repeat(channels, *[1] * (kernel.dim() - 1))
|
||||
|
||||
self.register_buffer("weight", kernel)
|
||||
self.groups = channels
|
||||
|
||||
if dim == 1:
|
||||
self.conv = F.conv1d
|
||||
elif dim == 2:
|
||||
self.conv = F.conv2d
|
||||
elif dim == 3:
|
||||
self.conv = F.conv3d
|
||||
else:
|
||||
raise RuntimeError("Only 1, 2 and 3 dimensions are supported. Received {}.".format(dim))
|
||||
|
||||
    def forward(self, input):
        """
        Apply gaussian filter to input.

        Arguments:
            input (torch.Tensor): Input to apply gaussian filter on.

        Returns:
            filtered (torch.Tensor): Filtered output.
        """
        return self.conv(input, weight=self.weight.to(input.dtype), groups=self.groups, padding="same")
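# Minimal usage sketch for the class above (assumes GaussianSmoothing is in scope): depthwise
# Gaussian blur of a single-channel 2D map; "same" padding keeps the spatial size unchanged.
import torch

smoother = GaussianSmoothing(channels=1, kernel_size=3, sigma=0.5, dim=2)
mask = torch.rand(1, 1, 64, 64)  # [B, C, H, W]
blurred = smoother(mask)
print(blurred.shape)  # torch.Size([1, 1, 64, 64])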
def get_attention_scores(
|
||||
self, query: torch.Tensor, key: torch.Tensor, attention_mask: torch.Tensor = None
|
||||
) -> torch.Tensor:
|
||||
r"""
|
||||
Compute the attention scores.
|
||||
|
||||
Args:
|
||||
query (`torch.Tensor`): The query tensor.
|
||||
key (`torch.Tensor`): The key tensor.
|
||||
attention_mask (`torch.Tensor`, *optional*): The attention mask to use. If `None`, no mask is applied.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`: The attention probabilities/scores.
|
||||
"""
|
||||
if self.upcast_attention:
|
||||
query = query.float()
|
||||
key = key.float()
|
||||
|
||||
if attention_mask is None:
|
||||
baddbmm_input = torch.empty(
|
||||
query.shape[0], query.shape[1], key.shape[1], dtype=query.dtype, device=query.device
|
||||
)
|
||||
beta = 0
|
||||
else:
|
||||
baddbmm_input = attention_mask
|
||||
beta = 1
|
||||
|
||||
attention_scores = torch.baddbmm(
|
||||
baddbmm_input,
|
||||
query,
|
||||
key.transpose(-1, -2),
|
||||
beta=beta,
|
||||
alpha=self.scale,
|
||||
)
|
||||
del baddbmm_input
|
||||
|
||||
if self.upcast_softmax:
|
||||
attention_scores = attention_scores.float()
|
||||
|
||||
return attention_scores
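# Quick standalone check of what the function above computes: torch.baddbmm(mask, q, kT, beta=1,
# alpha=scale) equals mask + scale * (q @ kT), i.e. the unnormalized attention logits (no softmax).
import torch

q = torch.randn(2, 8, 16)    # (batch * heads, query_len, head_dim)
k = torch.randn(2, 8, 16)    # (batch * heads, key_len, head_dim)
mask = torch.zeros(2, 8, 8)  # additive attention mask; zeros mean "no masking"
scale = 16 ** -0.5

scores = torch.baddbmm(mask, q, k.transpose(-1, -2), beta=1, alpha=scale)
assert torch.allclose(scores, mask + scale * (q @ k.transpose(-1, -2)), atol=1e-5)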
@@ -1,7 +1,8 @@
|
||||
"""
|
||||
modeled after the textual_inversion.py / train_dreambooth.py and the work
|
||||
of justinpinkney here: https://github.com/justinpinkney/stable-diffusion/blob/main/notebooks/imagic.ipynb
|
||||
modeled after the textual_inversion.py / train_dreambooth.py and the work
|
||||
of justinpinkney here: https://github.com/justinpinkney/stable-diffusion/blob/main/notebooks/imagic.ipynb
|
||||
"""
|
||||
|
||||
import inspect
|
||||
import warnings
|
||||
from typing import List, Optional, Union
|
||||
|
||||
@@ -468,7 +468,12 @@ class InstaFlowPipeline(
|
||||
)
|
||||
|
||||
def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
|
||||
shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
|
||||
shape = (
|
||||
batch_size,
|
||||
num_channels_latents,
|
||||
int(height) // self.vae_scale_factor,
|
||||
int(width) // self.vae_scale_factor,
|
||||
)
|
||||
if isinstance(generator, list) and len(generator) != batch_size:
|
||||
raise ValueError(
|
||||
f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
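# Standalone illustration of the int() cast introduced in the shape tuples above: a float height
# or width (e.g. 512.0) survives the floor division as a float and breaks tensor creation, while
# casting to int first keeps every dimension an integer.
import torch

vae_scale_factor = 8
height = 512.0                             # hypothetical float input
print(height // vae_scale_factor)          # 64.0 (float)
print(int(height) // vae_scale_factor)     # 64 (int)
shape = (1, 4, int(height) // vae_scale_factor, int(height) // vae_scale_factor)
latents = torch.randn(shape)               # succeeds because all dims are ints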
@@ -26,7 +26,14 @@ from diffusers.configuration_utils import FrozenDict
|
||||
from diffusers.image_processor import VaeImageProcessor
|
||||
from diffusers.loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
|
||||
from diffusers.models import AutoencoderKL, UNet2DConditionModel
|
||||
from diffusers.models.lora import LoRALinearLayer, adjust_lora_scale_text_encoder
|
||||
from diffusers.models.attention_processor import (
|
||||
AttnProcessor,
|
||||
AttnProcessor2_0,
|
||||
IPAdapterAttnProcessor,
|
||||
IPAdapterAttnProcessor2_0,
|
||||
)
|
||||
from diffusers.models.embeddings import MultiIPAdapterImageProjection
|
||||
from diffusers.models.lora import adjust_lora_scale_text_encoder
|
||||
from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
|
||||
from diffusers.pipelines.stable_diffusion.pipeline_output import StableDiffusionPipelineOutput
|
||||
from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
|
||||
@@ -45,300 +52,6 @@ from diffusers.utils.torch_utils import randn_tensor
|
||||
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
|
||||
|
||||
|
||||
class LoRAIPAdapterAttnProcessor(nn.Module):
|
||||
r"""
|
||||
    Attention processor for IP-Adapter.
|
||||
Args:
|
||||
hidden_size (`int`):
|
||||
The hidden size of the attention layer.
|
||||
cross_attention_dim (`int`):
|
||||
The number of channels in the `encoder_hidden_states`.
|
||||
rank (`int`, defaults to 4):
|
||||
The dimension of the LoRA update matrices.
|
||||
network_alpha (`int`, *optional*):
|
||||
            Equivalent to `alpha` but its usage is specific to Kohya (A1111) style LoRAs.
|
||||
lora_scale (`float`, defaults to 1.0):
|
||||
the weight scale of LoRA.
|
||||
scale (`float`, defaults to 1.0):
|
||||
the weight scale of image prompt.
|
||||
        num_tokens (`int`, defaults to 4; when using ip_adapter_plus it should be 16):
|
||||
The context length of the image features.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
hidden_size,
|
||||
cross_attention_dim=None,
|
||||
rank=4,
|
||||
network_alpha=None,
|
||||
lora_scale=1.0,
|
||||
scale=1.0,
|
||||
num_tokens=4,
|
||||
):
|
||||
super().__init__()
|
||||
|
||||
self.rank = rank
|
||||
self.lora_scale = lora_scale
|
||||
|
||||
self.to_q_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha)
|
||||
self.to_k_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha)
|
||||
self.to_v_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha)
|
||||
self.to_out_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha)
|
||||
|
||||
self.hidden_size = hidden_size
|
||||
self.cross_attention_dim = cross_attention_dim
|
||||
self.scale = scale
|
||||
self.num_tokens = num_tokens
|
||||
|
||||
self.to_k_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False)
|
||||
self.to_v_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False)
|
||||
|
||||
def __call__(
|
||||
self,
|
||||
attn,
|
||||
hidden_states,
|
||||
encoder_hidden_states=None,
|
||||
attention_mask=None,
|
||||
temb=None,
|
||||
):
|
||||
residual = hidden_states
|
||||
|
||||
# separate ip_hidden_states from encoder_hidden_states
|
||||
if encoder_hidden_states is not None:
|
||||
if isinstance(encoder_hidden_states, tuple):
|
||||
encoder_hidden_states, ip_hidden_states = encoder_hidden_states
|
||||
else:
|
||||
                deprecation_message = (
                    "You have passed a tensor as `encoder_hidden_states`. This is deprecated and will be removed in a future release."
                    " Please make sure to update your script to pass `encoder_hidden_states` as a tuple to suppress this warning."
|
||||
)
|
||||
deprecate("encoder_hidden_states not a tuple", "1.0.0", deprecation_message, standard_warn=False)
|
||||
end_pos = encoder_hidden_states.shape[1] - self.num_tokens[0]
|
||||
encoder_hidden_states, ip_hidden_states = (
|
||||
encoder_hidden_states[:, :end_pos, :],
|
||||
[encoder_hidden_states[:, end_pos:, :]],
|
||||
)
|
||||
|
||||
if attn.spatial_norm is not None:
|
||||
hidden_states = attn.spatial_norm(hidden_states, temb)
|
||||
|
||||
input_ndim = hidden_states.ndim
|
||||
|
||||
if input_ndim == 4:
|
||||
batch_size, channel, height, width = hidden_states.shape
|
||||
hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
|
||||
|
||||
batch_size, sequence_length, _ = (
|
||||
hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
|
||||
)
|
||||
attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
|
||||
|
||||
if attn.group_norm is not None:
|
||||
hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
|
||||
|
||||
query = attn.to_q(hidden_states) + self.lora_scale * self.to_q_lora(hidden_states)
|
||||
|
||||
if encoder_hidden_states is None:
|
||||
encoder_hidden_states = hidden_states
|
||||
elif attn.norm_cross:
|
||||
encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
|
||||
|
||||
key = attn.to_k(encoder_hidden_states) + self.lora_scale * self.to_k_lora(encoder_hidden_states)
|
||||
value = attn.to_v(encoder_hidden_states) + self.lora_scale * self.to_v_lora(encoder_hidden_states)
|
||||
|
||||
query = attn.head_to_batch_dim(query)
|
||||
key = attn.head_to_batch_dim(key)
|
||||
value = attn.head_to_batch_dim(value)
|
||||
|
||||
attention_probs = attn.get_attention_scores(query, key, attention_mask)
|
||||
hidden_states = torch.bmm(attention_probs, value)
|
||||
hidden_states = attn.batch_to_head_dim(hidden_states)
|
||||
|
||||
# for ip-adapter
|
||||
ip_key = self.to_k_ip(ip_hidden_states)
|
||||
ip_value = self.to_v_ip(ip_hidden_states)
|
||||
|
||||
ip_key = attn.head_to_batch_dim(ip_key)
|
||||
ip_value = attn.head_to_batch_dim(ip_value)
|
||||
|
||||
ip_attention_probs = attn.get_attention_scores(query, ip_key, None)
|
||||
ip_hidden_states = torch.bmm(ip_attention_probs, ip_value)
|
||||
ip_hidden_states = attn.batch_to_head_dim(ip_hidden_states)
|
||||
|
||||
hidden_states = hidden_states + self.scale * ip_hidden_states
|
||||
|
||||
# linear proj
|
||||
hidden_states = attn.to_out[0](hidden_states) + self.lora_scale * self.to_out_lora(hidden_states)
|
||||
# dropout
|
||||
hidden_states = attn.to_out[1](hidden_states)
|
||||
|
||||
if input_ndim == 4:
|
||||
hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
|
||||
|
||||
if attn.residual_connection:
|
||||
hidden_states = hidden_states + residual
|
||||
|
||||
hidden_states = hidden_states / attn.rescale_output_factor
|
||||
|
||||
return hidden_states
|
||||
|
||||
|
||||
class LoRAIPAdapterAttnProcessor2_0(nn.Module):
|
||||
r"""
|
||||
    Attention processor for IP-Adapter for PyTorch 2.0.
|
||||
Args:
|
||||
hidden_size (`int`):
|
||||
The hidden size of the attention layer.
|
||||
cross_attention_dim (`int`):
|
||||
The number of channels in the `encoder_hidden_states`.
|
||||
rank (`int`, defaults to 4):
|
||||
The dimension of the LoRA update matrices.
|
||||
network_alpha (`int`, *optional*):
|
||||
            Equivalent to `alpha` but its usage is specific to Kohya (A1111) style LoRAs.
|
||||
lora_scale (`float`, defaults to 1.0):
|
||||
the weight scale of LoRA.
|
||||
scale (`float`, defaults to 1.0):
|
||||
the weight scale of image prompt.
|
||||
        num_tokens (`int`, defaults to 4; when using ip_adapter_plus it should be 16):
|
||||
The context length of the image features.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
hidden_size,
|
||||
cross_attention_dim=None,
|
||||
rank=4,
|
||||
network_alpha=None,
|
||||
lora_scale=1.0,
|
||||
scale=1.0,
|
||||
num_tokens=4,
|
||||
):
|
||||
super().__init__()
|
||||
|
||||
self.rank = rank
|
||||
self.lora_scale = lora_scale
|
||||
|
||||
self.to_q_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha)
|
||||
self.to_k_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha)
|
||||
self.to_v_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha)
|
||||
self.to_out_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha)
|
||||
|
||||
self.hidden_size = hidden_size
|
||||
self.cross_attention_dim = cross_attention_dim
|
||||
self.scale = scale
|
||||
self.num_tokens = num_tokens
|
||||
|
||||
self.to_k_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False)
|
||||
self.to_v_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False)
|
||||
|
||||
def __call__(
|
||||
self,
|
||||
attn,
|
||||
hidden_states,
|
||||
encoder_hidden_states=None,
|
||||
attention_mask=None,
|
||||
temb=None,
|
||||
):
|
||||
residual = hidden_states
|
||||
|
||||
# separate ip_hidden_states from encoder_hidden_states
|
||||
if encoder_hidden_states is not None:
|
||||
if isinstance(encoder_hidden_states, tuple):
|
||||
encoder_hidden_states, ip_hidden_states = encoder_hidden_states
|
||||
else:
|
||||
                deprecation_message = (
                    "You have passed a tensor as `encoder_hidden_states`. This is deprecated and will be removed in a future release."
                    " Please make sure to update your script to pass `encoder_hidden_states` as a tuple to suppress this warning."
|
||||
)
|
||||
deprecate("encoder_hidden_states not a tuple", "1.0.0", deprecation_message, standard_warn=False)
|
||||
end_pos = encoder_hidden_states.shape[1] - self.num_tokens[0]
|
||||
encoder_hidden_states, ip_hidden_states = (
|
||||
encoder_hidden_states[:, :end_pos, :],
|
||||
[encoder_hidden_states[:, end_pos:, :]],
|
||||
)
|
||||
|
||||
if attn.spatial_norm is not None:
|
||||
hidden_states = attn.spatial_norm(hidden_states, temb)
|
||||
|
||||
input_ndim = hidden_states.ndim
|
||||
|
||||
if input_ndim == 4:
|
||||
batch_size, channel, height, width = hidden_states.shape
|
||||
hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
|
||||
|
||||
batch_size, sequence_length, _ = (
|
||||
hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
|
||||
)
|
||||
|
||||
if attention_mask is not None:
|
||||
attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
|
||||
# scaled_dot_product_attention expects attention_mask shape to be
|
||||
# (batch, heads, source_length, target_length)
|
||||
attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
|
||||
|
||||
if attn.group_norm is not None:
|
||||
hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
|
||||
|
||||
query = attn.to_q(hidden_states) + self.lora_scale * self.to_q_lora(hidden_states)
|
||||
|
||||
if encoder_hidden_states is None:
|
||||
encoder_hidden_states = hidden_states
|
||||
elif attn.norm_cross:
|
||||
encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
|
||||
|
||||
key = attn.to_k(encoder_hidden_states) + self.lora_scale * self.to_k_lora(encoder_hidden_states)
|
||||
value = attn.to_v(encoder_hidden_states) + self.lora_scale * self.to_v_lora(encoder_hidden_states)
|
||||
|
||||
inner_dim = key.shape[-1]
|
||||
head_dim = inner_dim // attn.heads
|
||||
|
||||
query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
|
||||
key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
|
||||
value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
|
||||
|
||||
# the output of sdp = (batch, num_heads, seq_len, head_dim)
|
||||
# TODO: add support for attn.scale when we move to Torch 2.1
|
||||
hidden_states = F.scaled_dot_product_attention(
|
||||
query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
|
||||
)
|
||||
|
||||
hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
|
||||
hidden_states = hidden_states.to(query.dtype)
|
||||
|
||||
# for ip-adapter
|
||||
ip_key = self.to_k_ip(ip_hidden_states)
|
||||
ip_value = self.to_v_ip(ip_hidden_states)
|
||||
|
||||
ip_key = ip_key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
|
||||
ip_value = ip_value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
|
||||
|
||||
# the output of sdp = (batch, num_heads, seq_len, head_dim)
|
||||
# TODO: add support for attn.scale when we move to Torch 2.1
|
||||
ip_hidden_states = F.scaled_dot_product_attention(
|
||||
query, ip_key, ip_value, attn_mask=None, dropout_p=0.0, is_causal=False
|
||||
)
|
||||
|
||||
ip_hidden_states = ip_hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
|
||||
ip_hidden_states = ip_hidden_states.to(query.dtype)
|
||||
|
||||
hidden_states = hidden_states + self.scale * ip_hidden_states
|
||||
|
||||
# linear proj
|
||||
hidden_states = attn.to_out[0](hidden_states) + self.lora_scale * self.to_out_lora(hidden_states)
|
||||
# dropout
|
||||
hidden_states = attn.to_out[1](hidden_states)
|
||||
|
||||
if input_ndim == 4:
|
||||
hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
|
||||
|
||||
if attn.residual_connection:
|
||||
hidden_states = hidden_states + residual
|
||||
|
||||
hidden_states = hidden_states / attn.rescale_output_factor
|
||||
|
||||
return hidden_states
|
||||
|
||||
|
||||
class IPAdapterFullImageProjection(nn.Module):
|
||||
def __init__(self, image_embed_dim=1024, cross_attention_dim=1024, mult=1, num_tokens=1):
|
||||
super().__init__()
|
||||
@@ -615,17 +328,13 @@ class IPAdapterFaceIDStableDiffusionPipeline(
|
||||
return image_projection
|
||||
|
||||
def _load_ip_adapter_weights(self, state_dict):
|
||||
from diffusers.models.attention_processor import (
|
||||
AttnProcessor,
|
||||
AttnProcessor2_0,
|
||||
)
|
||||
|
||||
num_image_text_embeds = 4
|
||||
|
||||
self.unet.encoder_hid_proj = None
|
||||
|
||||
# set ip-adapter cross-attention processors & load state_dict
|
||||
attn_procs = {}
|
||||
lora_dict = {}
|
||||
key_id = 0
|
||||
for name in self.unet.attn_processors.keys():
|
||||
cross_attention_dim = None if name.endswith("attn1.processor") else self.unet.config.cross_attention_dim
|
||||
@@ -642,94 +351,99 @@ class IPAdapterFaceIDStableDiffusionPipeline(
|
||||
AttnProcessor2_0 if hasattr(F, "scaled_dot_product_attention") else AttnProcessor
|
||||
)
|
||||
attn_procs[name] = attn_processor_class()
|
||||
rank = state_dict["ip_adapter"][f"{key_id}.to_q_lora.down.weight"].shape[0]
|
||||
attn_module = self.unet
|
||||
for n in name.split(".")[:-1]:
|
||||
attn_module = getattr(attn_module, n)
|
||||
# Set the `lora_layer` attribute of the attention-related matrices.
|
||||
attn_module.to_q.set_lora_layer(
|
||||
LoRALinearLayer(
|
||||
in_features=attn_module.to_q.in_features,
|
||||
out_features=attn_module.to_q.out_features,
|
||||
rank=rank,
|
||||
)
|
||||
)
|
||||
attn_module.to_k.set_lora_layer(
|
||||
LoRALinearLayer(
|
||||
in_features=attn_module.to_k.in_features,
|
||||
out_features=attn_module.to_k.out_features,
|
||||
rank=rank,
|
||||
)
|
||||
)
|
||||
attn_module.to_v.set_lora_layer(
|
||||
LoRALinearLayer(
|
||||
in_features=attn_module.to_v.in_features,
|
||||
out_features=attn_module.to_v.out_features,
|
||||
rank=rank,
|
||||
)
|
||||
)
|
||||
attn_module.to_out[0].set_lora_layer(
|
||||
LoRALinearLayer(
|
||||
in_features=attn_module.to_out[0].in_features,
|
||||
out_features=attn_module.to_out[0].out_features,
|
||||
rank=rank,
|
||||
)
|
||||
)
|
||||
|
||||
value_dict = {}
|
||||
for k, module in attn_module.named_children():
|
||||
index = "."
|
||||
if not hasattr(module, "set_lora_layer"):
|
||||
index = ".0."
|
||||
module = module[0]
|
||||
lora_layer = getattr(module, "lora_layer")
|
||||
for lora_name, w in lora_layer.state_dict().items():
|
||||
value_dict.update(
|
||||
{
|
||||
f"{k}{index}lora_layer.{lora_name}": state_dict["ip_adapter"][
|
||||
f"{key_id}.{k}_lora.{lora_name}"
|
||||
]
|
||||
}
|
||||
)
|
||||
|
||||
attn_module.load_state_dict(value_dict, strict=False)
|
||||
attn_module.to(dtype=self.dtype, device=self.device)
|
||||
lora_dict.update(
|
||||
{f"unet.{name}.to_k_lora.down.weight": state_dict["ip_adapter"][f"{key_id}.to_k_lora.down.weight"]}
|
||||
)
|
||||
lora_dict.update(
|
||||
{f"unet.{name}.to_q_lora.down.weight": state_dict["ip_adapter"][f"{key_id}.to_q_lora.down.weight"]}
|
||||
)
|
||||
lora_dict.update(
|
||||
{f"unet.{name}.to_v_lora.down.weight": state_dict["ip_adapter"][f"{key_id}.to_v_lora.down.weight"]}
|
||||
)
|
||||
lora_dict.update(
|
||||
{
|
||||
f"unet.{name}.to_out_lora.down.weight": state_dict["ip_adapter"][
|
||||
f"{key_id}.to_out_lora.down.weight"
|
||||
]
|
||||
}
|
||||
)
|
||||
lora_dict.update(
|
||||
{f"unet.{name}.to_k_lora.up.weight": state_dict["ip_adapter"][f"{key_id}.to_k_lora.up.weight"]}
|
||||
)
|
||||
lora_dict.update(
|
||||
{f"unet.{name}.to_q_lora.up.weight": state_dict["ip_adapter"][f"{key_id}.to_q_lora.up.weight"]}
|
||||
)
|
||||
lora_dict.update(
|
||||
{f"unet.{name}.to_v_lora.up.weight": state_dict["ip_adapter"][f"{key_id}.to_v_lora.up.weight"]}
|
||||
)
|
||||
lora_dict.update(
|
||||
{f"unet.{name}.to_out_lora.up.weight": state_dict["ip_adapter"][f"{key_id}.to_out_lora.up.weight"]}
|
||||
)
|
||||
key_id += 1
|
||||
else:
|
||||
rank = state_dict["ip_adapter"][f"{key_id}.to_q_lora.down.weight"].shape[0]
|
||||
attn_processor_class = (
|
||||
LoRAIPAdapterAttnProcessor2_0
|
||||
if hasattr(F, "scaled_dot_product_attention")
|
||||
else LoRAIPAdapterAttnProcessor
|
||||
IPAdapterAttnProcessor2_0 if hasattr(F, "scaled_dot_product_attention") else IPAdapterAttnProcessor
|
||||
)
|
||||
attn_procs[name] = attn_processor_class(
|
||||
hidden_size=hidden_size,
|
||||
cross_attention_dim=cross_attention_dim,
|
||||
scale=1.0,
|
||||
rank=rank,
|
||||
num_tokens=num_image_text_embeds,
|
||||
).to(dtype=self.dtype, device=self.device)
|
||||
|
||||
value_dict = {}
|
||||
for k, w in attn_procs[name].state_dict().items():
|
||||
value_dict.update({f"{k}": state_dict["ip_adapter"][f"{key_id}.{k}"]})
|
||||
lora_dict.update(
|
||||
{f"unet.{name}.to_k_lora.down.weight": state_dict["ip_adapter"][f"{key_id}.to_k_lora.down.weight"]}
|
||||
)
|
||||
lora_dict.update(
|
||||
{f"unet.{name}.to_q_lora.down.weight": state_dict["ip_adapter"][f"{key_id}.to_q_lora.down.weight"]}
|
||||
)
|
||||
lora_dict.update(
|
||||
{f"unet.{name}.to_v_lora.down.weight": state_dict["ip_adapter"][f"{key_id}.to_v_lora.down.weight"]}
|
||||
)
|
||||
lora_dict.update(
|
||||
{
|
||||
f"unet.{name}.to_out_lora.down.weight": state_dict["ip_adapter"][
|
||||
f"{key_id}.to_out_lora.down.weight"
|
||||
]
|
||||
}
|
||||
)
|
||||
lora_dict.update(
|
||||
{f"unet.{name}.to_k_lora.up.weight": state_dict["ip_adapter"][f"{key_id}.to_k_lora.up.weight"]}
|
||||
)
|
||||
lora_dict.update(
|
||||
{f"unet.{name}.to_q_lora.up.weight": state_dict["ip_adapter"][f"{key_id}.to_q_lora.up.weight"]}
|
||||
)
|
||||
lora_dict.update(
|
||||
{f"unet.{name}.to_v_lora.up.weight": state_dict["ip_adapter"][f"{key_id}.to_v_lora.up.weight"]}
|
||||
)
|
||||
lora_dict.update(
|
||||
{f"unet.{name}.to_out_lora.up.weight": state_dict["ip_adapter"][f"{key_id}.to_out_lora.up.weight"]}
|
||||
)
|
||||
|
||||
value_dict = {}
|
||||
value_dict.update({"to_k_ip.0.weight": state_dict["ip_adapter"][f"{key_id}.to_k_ip.weight"]})
|
||||
value_dict.update({"to_v_ip.0.weight": state_dict["ip_adapter"][f"{key_id}.to_v_ip.weight"]})
|
||||
attn_procs[name].load_state_dict(value_dict)
|
||||
key_id += 1
|
||||
|
||||
self.unet.set_attn_processor(attn_procs)
|
||||
|
||||
self.load_lora_weights(lora_dict, adapter_name="faceid")
|
||||
self.set_adapters(["faceid"], adapter_weights=[1.0])
|
||||
|
||||
# convert IP-Adapter Image Projection layers to diffusers
|
||||
image_projection = self.convert_ip_adapter_image_proj_to_diffusers(state_dict["image_proj"])
|
||||
image_projection_layers = [image_projection.to(device=self.device, dtype=self.dtype)]
|
||||
|
||||
self.unet.encoder_hid_proj = image_projection.to(device=self.device, dtype=self.dtype)
|
||||
self.unet.encoder_hid_proj = MultiIPAdapterImageProjection(image_projection_layers)
|
||||
self.unet.config.encoder_hid_dim_type = "ip_image_proj"
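# Illustrative sketch (standalone, not part of the class): the nesting `_load_ip_adapter_weights`
# expects, inferred from the key accesses above. Tensor shapes are placeholders, not real FaceID sizes.
import torch

example_state_dict = {
    "image_proj": {},  # image-projection weights, converted via convert_ip_adapter_image_proj_to_diffusers
    "ip_adapter": {
        # one numeric prefix per UNet attention processor, in `unet.attn_processors` order
        "0.to_q_lora.down.weight": torch.zeros(4, 320),
        "0.to_q_lora.up.weight": torch.zeros(320, 4),
        # ...to_k_lora / to_v_lora / to_out_lora follow the same down/up pattern...
        "1.to_k_ip.weight": torch.zeros(320, 768),  # IP projections appear only on cross-attention layers
        "1.to_v_ip.weight": torch.zeros(320, 768),
    },
}
print(sorted(example_state_dict["ip_adapter"]))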
|
||||
|
||||
def set_ip_adapter_scale(self, scale):
|
||||
unet = getattr(self, self.unet_name) if not hasattr(self, "unet") else self.unet
|
||||
for attn_processor in unet.attn_processors.values():
|
||||
if isinstance(attn_processor, (LoRAIPAdapterAttnProcessor, LoRAIPAdapterAttnProcessor2_0)):
|
||||
attn_processor.scale = scale
|
||||
if isinstance(attn_processor, (IPAdapterAttnProcessor, IPAdapterAttnProcessor2_0)):
|
||||
attn_processor.scale = [scale]
|
||||
|
||||
def _encode_prompt(
|
||||
self,
|
||||
@@ -1039,7 +753,12 @@ class IPAdapterFaceIDStableDiffusionPipeline(
|
||||
)
|
||||
|
||||
def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
|
||||
shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
|
||||
shape = (
|
||||
batch_size,
|
||||
num_channels_latents,
|
||||
int(height) // self.vae_scale_factor,
|
||||
int(width) // self.vae_scale_factor,
|
||||
)
|
||||
if isinstance(generator, list) and len(generator) != batch_size:
|
||||
raise ValueError(
|
||||
f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
|
||||
@@ -1298,7 +1017,7 @@ class IPAdapterFaceIDStableDiffusionPipeline(
|
||||
negative_image_embeds = torch.zeros_like(image_embeds)
|
||||
if self.do_classifier_free_guidance:
|
||||
image_embeds = torch.cat([negative_image_embeds, image_embeds])
|
||||
|
||||
image_embeds = [image_embeds]
|
||||
# 4. Prepare timesteps
|
||||
timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
|
||||
|
||||
@@ -1319,7 +1038,7 @@ class IPAdapterFaceIDStableDiffusionPipeline(
|
||||
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
|
||||
|
||||
# 6.1 Add image embeds for IP-Adapter
|
||||
added_cond_kwargs = {"image_embeds": image_embeds} if image_embeds is not None else None
|
||||
added_cond_kwargs = {"image_embeds": image_embeds} if image_embeds is not None else {}
|
||||
|
||||
# 6.2 Optionally get Guidance Scale Embedding
|
||||
timestep_cond = None
|
||||
|
||||
@@ -177,7 +177,12 @@ class LatentConsistencyModelImg2ImgPipeline(DiffusionPipeline):
|
||||
latents=None,
|
||||
generator=None,
|
||||
):
|
||||
shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
|
||||
shape = (
|
||||
batch_size,
|
||||
num_channels_latents,
|
||||
int(height) // self.vae_scale_factor,
|
||||
int(width) // self.vae_scale_factor,
|
||||
)
|
||||
|
||||
if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)):
|
||||
raise ValueError(
|
||||
@@ -330,17 +335,18 @@ class LatentConsistencyModelImg2ImgPipeline(DiffusionPipeline):
|
||||
|
||||
# 5. Prepare latent variable
|
||||
num_channels_latents = self.unet.config.in_channels
|
||||
latents = self.prepare_latents(
|
||||
image,
|
||||
latent_timestep,
|
||||
batch_size * num_images_per_prompt,
|
||||
num_channels_latents,
|
||||
height,
|
||||
width,
|
||||
prompt_embeds.dtype,
|
||||
device,
|
||||
latents,
|
||||
)
|
||||
if latents is None:
|
||||
latents = self.prepare_latents(
|
||||
image,
|
||||
latent_timestep,
|
||||
batch_size * num_images_per_prompt,
|
||||
num_channels_latents,
|
||||
height,
|
||||
width,
|
||||
prompt_embeds.dtype,
|
||||
device,
|
||||
latents,
|
||||
)
|
||||
bs = batch_size * num_images_per_prompt
|
||||
|
||||
# 6. Get Guidance Scale Embedding
|
||||
@@ -440,7 +446,7 @@ def betas_for_alpha_bar(
|
||||
return math.exp(t * -12.0)
|
||||
|
||||
else:
|
||||
raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}")
|
||||
raise ValueError(f"Unsupported alpha_transform_type: {alpha_transform_type}")
|
||||
|
||||
betas = []
|
||||
for i in range(num_diffusion_timesteps):
|
||||
|
||||
@@ -472,7 +472,12 @@ class LatentConsistencyModelWalkPipeline(
|
||||
|
||||
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
|
||||
def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
|
||||
shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
|
||||
shape = (
|
||||
batch_size,
|
||||
num_channels_latents,
|
||||
int(height) // self.vae_scale_factor,
|
||||
int(width) // self.vae_scale_factor,
|
||||
)
|
||||
if isinstance(generator, list) and len(generator) != batch_size:
|
||||
raise ValueError(
|
||||
f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
|
||||
@@ -726,7 +731,7 @@ class LatentConsistencyModelWalkPipeline(
|
||||
callback_on_step_end_tensor_inputs (`List`, *optional*):
|
||||
The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
|
||||
will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
|
||||
`._callback_tensor_inputs` attribute of your pipeine class.
|
||||
`._callback_tensor_inputs` attribute of your pipeline class.
|
||||
embedding_interpolation_type (`str`, *optional*, defaults to `"lerp"`):
|
||||
The type of interpolation to use for interpolating between text embeddings. Choose between `"lerp"` and `"slerp"`.
|
||||
latent_interpolation_type (`str`, *optional*, defaults to `"slerp"`):
|
||||
@@ -779,7 +784,7 @@ class LatentConsistencyModelWalkPipeline(
|
||||
else:
|
||||
batch_size = prompt_embeds.shape[0]
|
||||
if batch_size < 2:
|
||||
raise ValueError(f"`prompt` must have length of atleast 2 but found {batch_size}")
|
||||
raise ValueError(f"`prompt` must have length of at least 2 but found {batch_size}")
|
||||
if num_images_per_prompt != 1:
|
||||
raise ValueError("`num_images_per_prompt` must be `1` as no other value is supported yet")
|
||||
if prompt_embeds is not None:
|
||||
@@ -883,7 +888,7 @@ class LatentConsistencyModelWalkPipeline(
|
||||
) as batch_progress_bar:
|
||||
for batch_index in range(0, bs, process_batch_size):
|
||||
batch_inference_latents = inference_latents[batch_index : batch_index + process_batch_size]
|
||||
batch_inference_embedddings = inference_embeddings[
|
||||
batch_inference_embeddings = inference_embeddings[
|
||||
batch_index : batch_index + process_batch_size
|
||||
]
|
||||
|
||||
@@ -892,7 +897,7 @@ class LatentConsistencyModelWalkPipeline(
|
||||
)
|
||||
timesteps = self.scheduler.timesteps
|
||||
|
||||
current_bs = batch_inference_embedddings.shape[0]
|
||||
current_bs = batch_inference_embeddings.shape[0]
|
||||
w = torch.tensor(self.guidance_scale - 1).repeat(current_bs)
|
||||
w_embedding = self.get_guidance_scale_embedding(
|
||||
w, embedding_dim=self.unet.config.time_cond_proj_dim
|
||||
@@ -901,14 +906,14 @@ class LatentConsistencyModelWalkPipeline(
|
||||
# 10. Perform inference for current batch
|
||||
with self.progress_bar(total=num_inference_steps) as progress_bar:
|
||||
for index, t in enumerate(timesteps):
|
||||
batch_inference_latents = batch_inference_latents.to(batch_inference_embedddings.dtype)
|
||||
batch_inference_latents = batch_inference_latents.to(batch_inference_embeddings.dtype)
|
||||
|
||||
# model prediction (v-prediction, eps, x)
|
||||
model_pred = self.unet(
|
||||
batch_inference_latents,
|
||||
t,
|
||||
timestep_cond=w_embedding,
|
||||
encoder_hidden_states=batch_inference_embedddings,
|
||||
encoder_hidden_states=batch_inference_embeddings,
|
||||
cross_attention_kwargs=self.cross_attention_kwargs,
|
||||
return_dict=False,
|
||||
)[0]
|
||||
@@ -924,8 +929,8 @@ class LatentConsistencyModelWalkPipeline(
|
||||
callback_outputs = callback_on_step_end(self, index, t, callback_kwargs)
|
||||
|
||||
batch_inference_latents = callback_outputs.pop("latents", batch_inference_latents)
|
||||
batch_inference_embedddings = callback_outputs.pop(
|
||||
"prompt_embeds", batch_inference_embedddings
|
||||
batch_inference_embeddings = callback_outputs.pop(
|
||||
"prompt_embeds", batch_inference_embeddings
|
||||
)
|
||||
w_embedding = callback_outputs.pop("w_embedding", w_embedding)
|
||||
denoised = callback_outputs.pop("denoised", denoised)
|
||||
@@ -939,7 +944,7 @@ class LatentConsistencyModelWalkPipeline(
|
||||
step_idx = index // getattr(self.scheduler, "order", 1)
|
||||
callback(step_idx, t, batch_inference_latents)
|
||||
|
||||
denoised = denoised.to(batch_inference_embedddings.dtype)
|
||||
denoised = denoised.to(batch_inference_embeddings.dtype)
|
||||
|
||||
# Note: This is not supported because you would get black images in your latent walk if
|
||||
# NSFW concept is detected
|
||||
|
||||
@@ -163,7 +163,12 @@ class LatentConsistencyModelPipeline(DiffusionPipeline):
|
||||
return image, has_nsfw_concept
|
||||
|
||||
def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, latents=None):
|
||||
shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
|
||||
shape = (
|
||||
batch_size,
|
||||
num_channels_latents,
|
||||
int(height) // self.vae_scale_factor,
|
||||
int(width) // self.vae_scale_factor,
|
||||
)
|
||||
if latents is None:
|
||||
latents = torch.randn(shape, dtype=dtype).to(device)
|
||||
else:
|
||||
@@ -348,7 +353,7 @@ def betas_for_alpha_bar(
|
||||
return math.exp(t * -12.0)
|
||||
|
||||
else:
|
||||
raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}")
|
||||
raise ValueError(f"Unsupported alpha_transform_type: {alpha_transform_type}")
|
||||
|
||||
betas = []
|
||||
for i in range(num_diffusion_timesteps):
|
||||
|
||||
@@ -530,7 +530,7 @@ class LLMGroundedDiffusionPipeline(
|
||||
)
|
||||
|
||||
if len(phrases) != len(boxes):
|
||||
ValueError(
|
||||
raise ValueError(
|
||||
"length of `phrases` and `boxes` has to be same, but"
|
||||
f" got: `phrases` {len(phrases)} != `boxes` {len(boxes)}"
|
||||
)
|
||||
|
||||
@@ -439,7 +439,9 @@ class StableDiffusionLongPromptWeightingPipeline(
|
||||
Model that extracts features from generated images to be used as inputs for the `safety_checker`.
|
||||
"""
|
||||
|
||||
model_cpu_offload_seq = "text_encoder-->unet->vae"
|
||||
_optional_components = ["safety_checker", "feature_extractor"]
|
||||
_exclude_from_cpu_offload = ["safety_checker"]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -724,7 +726,12 @@ class StableDiffusionLongPromptWeightingPipeline(
|
||||
):
|
||||
if image is None:
|
||||
batch_size = batch_size * num_images_per_prompt
|
||||
shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
|
||||
shape = (
|
||||
batch_size,
|
||||
num_channels_latents,
|
||||
int(height) // self.vae_scale_factor,
|
||||
int(width) // self.vae_scale_factor,
|
||||
)
|
||||
if isinstance(generator, list) and len(generator) != batch_size:
|
||||
raise ValueError(
|
||||
f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
|
||||
|
||||
@@ -164,7 +164,7 @@ def get_prompts_tokens_with_weights(clip_tokenizer: CLIPTokenizer, prompt: str):
|
||||
text_tokens (list)
|
||||
A list contains token ids
|
||||
text_weight (list)
|
||||
A list contains the correspodent weight of token ids
|
||||
A list contains the correspondent weight of token ids
|
||||
|
||||
Example:
|
||||
import torch
|
||||
@@ -1028,7 +1028,7 @@ class SDXLLongPromptWeightingPipeline(
|
||||
# because `num_inference_steps` might be even given that every timestep
|
||||
# (except the highest one) is duplicated. If `num_inference_steps` is even it would
|
||||
# mean that we cut the timesteps in the middle of the denoising step
|
||||
# (between 1st and 2nd devirative) which leads to incorrect results. By adding 1
|
||||
# (between 1st and 2nd derivative) which leads to incorrect results. By adding 1
|
||||
# we ensure that the denoising process always ends after the 2nd derivate step of the scheduler
|
||||
num_inference_steps = num_inference_steps + 1
|
||||
|
||||
@@ -1060,7 +1060,12 @@ class SDXLLongPromptWeightingPipeline(
|
||||
batch_size *= num_images_per_prompt
|
||||
|
||||
if image is None:
|
||||
shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
|
||||
shape = (
|
||||
batch_size,
|
||||
num_channels_latents,
|
||||
int(height) // self.vae_scale_factor,
|
||||
int(width) // self.vae_scale_factor,
|
||||
)
|
||||
if isinstance(generator, list) and len(generator) != batch_size:
|
||||
raise ValueError(
|
||||
f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
|
||||
@@ -1140,7 +1145,12 @@ class SDXLLongPromptWeightingPipeline(
|
||||
return latents
|
||||
|
||||
else:
|
||||
shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
|
||||
shape = (
|
||||
batch_size,
|
||||
num_channels_latents,
|
||||
int(height) // self.vae_scale_factor,
|
||||
int(width) // self.vae_scale_factor,
|
||||
)
|
||||
if isinstance(generator, list) and len(generator) != batch_size:
|
||||
raise ValueError(
|
||||
f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
|
||||
@@ -1531,7 +1541,7 @@ class SDXLLongPromptWeightingPipeline(
|
||||
callback_on_step_end_tensor_inputs (`List`, *optional*):
|
||||
The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
|
||||
will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
|
||||
`._callback_tensor_inputs` attribute of your pipeine class.
|
||||
`._callback_tensor_inputs` attribute of your pipeline class.
|
||||
|
||||
Examples:
|
||||
|
||||
@@ -2131,7 +2141,7 @@ class SDXLLongPromptWeightingPipeline(
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
# Overrride to properly handle the loading and unloading of the additional text encoder.
|
||||
# Override to properly handle the loading and unloading of the additional text encoder.
|
||||
def load_lora_weights(self, pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]], **kwargs):
|
||||
# We could have accessed the unet config from `lora_state_dict()` too. We pass
|
||||
# it here explicitly to be able to tell that it's coming from an SDXL
|
||||
|
||||
@@ -18,6 +18,7 @@
|
||||
# --------------------------------------------------------------------------
|
||||
|
||||
|
||||
import logging
|
||||
import math
|
||||
from typing import Dict, Union
|
||||
|
||||
@@ -25,6 +26,7 @@ import matplotlib
|
||||
import numpy as np
|
||||
import torch
|
||||
from PIL import Image
|
||||
from PIL.Image import Resampling
|
||||
from scipy.optimize import minimize
|
||||
from torch.utils.data import DataLoader, TensorDataset
|
||||
from tqdm.auto import tqdm
|
||||
@@ -34,13 +36,14 @@ from diffusers import (
|
||||
AutoencoderKL,
|
||||
DDIMScheduler,
|
||||
DiffusionPipeline,
|
||||
LCMScheduler,
|
||||
UNet2DConditionModel,
|
||||
)
|
||||
from diffusers.utils import BaseOutput, check_min_version
|
||||
|
||||
|
||||
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
|
||||
check_min_version("0.27.0.dev0")
|
||||
check_min_version("0.25.0")
|
||||
|
||||
|
||||
class MarigoldDepthOutput(BaseOutput):
|
||||
@@ -61,6 +64,19 @@ class MarigoldDepthOutput(BaseOutput):
|
||||
uncertainty: Union[None, np.ndarray]
|
||||
|
||||
|
||||
def get_pil_resample_method(method_str: str) -> Resampling:
    resample_method_dic = {
        "bilinear": Resampling.BILINEAR,
        "bicubic": Resampling.BICUBIC,
        "nearest": Resampling.NEAREST,
    }
    resample_method = resample_method_dic.get(method_str, None)
    if resample_method is None:
        raise ValueError(f"Unknown resampling method: {method_str}")
    else:
        return resample_method
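# Illustrative usage of the helper above (assumes get_pil_resample_method is in scope):
from PIL import Image

img = Image.new("RGB", (768, 512))
method = get_pil_resample_method("bilinear")  # -> Resampling.BILINEAR
resized = img.resize((384, 256), resample=method)
print(resized.size)  # (384, 256)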
|
||||
|
||||
|
||||
class MarigoldPipeline(DiffusionPipeline):
|
||||
"""
|
||||
Pipeline for monocular depth estimation using Marigold: https://marigoldmonodepth.github.io.
|
||||
@@ -113,7 +129,9 @@ class MarigoldPipeline(DiffusionPipeline):
|
||||
ensemble_size: int = 10,
|
||||
processing_res: int = 768,
|
||||
match_input_res: bool = True,
|
||||
resample_method: str = "bilinear",
|
||||
batch_size: int = 0,
|
||||
seed: Union[int, None] = None,
|
||||
color_map: str = "Spectral",
|
||||
show_progress_bar: bool = True,
|
||||
ensemble_kwargs: Dict = None,
|
||||
@@ -129,7 +147,9 @@ class MarigoldPipeline(DiffusionPipeline):
|
||||
If set to 0: will not resize at all.
|
||||
match_input_res (`bool`, *optional*, defaults to `True`):
|
||||
Resize depth prediction to match input resolution.
|
||||
Only valid if `limit_input_res` is not None.
|
||||
Only valid if `processing_res` > 0.
|
||||
resample_method: (`str`, *optional*, defaults to `bilinear`):
|
||||
Resampling method used to resize images and depth predictions. This can be one of `bilinear`, `bicubic` or `nearest`, defaults to: `bilinear`.
|
||||
denoising_steps (`int`, *optional*, defaults to `10`):
|
||||
Number of diffusion denoising steps (DDIM) during inference.
|
||||
ensemble_size (`int`, *optional*, defaults to `10`):
|
||||
@@ -137,6 +157,8 @@ class MarigoldPipeline(DiffusionPipeline):
|
||||
batch_size (`int`, *optional*, defaults to `0`):
|
||||
Inference batch size, no bigger than `num_ensemble`.
|
||||
If set to 0, the script will automatically decide the proper batch size.
|
||||
seed (`int`, *optional*, defaults to `None`)
|
||||
Reproducibility seed.
|
||||
show_progress_bar (`bool`, *optional*, defaults to `True`):
|
||||
Display a progress bar of diffusion denoising.
|
||||
color_map (`str`, *optional*, defaults to `"Spectral"`, pass `None` to skip colorized depth map generation):
|
||||
@@ -146,8 +168,7 @@ class MarigoldPipeline(DiffusionPipeline):
|
||||
Returns:
|
||||
`MarigoldDepthOutput`: Output class for Marigold monocular depth prediction pipeline, including:
|
||||
- **depth_np** (`np.ndarray`) Predicted depth map, with depth values in the range of [0, 1]
|
||||
- **depth_colored** (`None` or `PIL.Image.Image`) Colorized depth map, with the shape of [3, H, W] and
|
||||
values in [0, 1]. None if `color_map` is `None`
|
||||
- **depth_colored** (`PIL.Image.Image`) Colorized depth map, with the shape of [3, H, W] and values in [0, 1], None if `color_map` is `None`
|
||||
- **uncertainty** (`None` or `np.ndarray`) Uncalibrated uncertainty(MAD, median absolute deviation)
|
||||
coming from ensembling. None if `ensemble_size = 1`
|
||||
"""
|
||||
@@ -158,13 +179,21 @@ class MarigoldPipeline(DiffusionPipeline):
|
||||
if not match_input_res:
|
||||
assert processing_res is not None, "Value error: `resize_output_back` is only valid with "
|
||||
assert processing_res >= 0
|
||||
assert denoising_steps >= 1
|
||||
assert ensemble_size >= 1
|
||||
|
||||
# Check if denoising step is reasonable
|
||||
self._check_inference_step(denoising_steps)
|
||||
|
||||
resample_method: Resampling = get_pil_resample_method(resample_method)
|
||||
|
||||
# ----------------- Image Preprocess -----------------
|
||||
# Resize image
|
||||
if processing_res > 0:
|
||||
input_image = self.resize_max_res(input_image, max_edge_resolution=processing_res)
|
||||
input_image = self.resize_max_res(
|
||||
input_image,
|
||||
max_edge_resolution=processing_res,
|
||||
resample_method=resample_method,
|
||||
)
|
||||
        # Convert the image to RGB, to 1. remove the alpha channel and 2. convert B&W to 3-channel
|
||||
input_image = input_image.convert("RGB")
|
||||
image = np.asarray(input_image)
|
||||
@@ -203,9 +232,10 @@ class MarigoldPipeline(DiffusionPipeline):
|
||||
rgb_in=batched_img,
|
||||
num_inference_steps=denoising_steps,
|
||||
show_pbar=show_progress_bar,
|
||||
seed=seed,
|
||||
)
|
||||
depth_pred_ls.append(depth_pred_raw.detach().clone())
|
||||
depth_preds = torch.concat(depth_pred_ls, axis=0).squeeze()
|
||||
depth_pred_ls.append(depth_pred_raw.detach())
|
||||
depth_preds = torch.concat(depth_pred_ls, dim=0).squeeze()
|
||||
torch.cuda.empty_cache() # clear vram cache for ensembling
|
||||
|
||||
# ----------------- Test-time ensembling -----------------
|
||||
@@ -227,7 +257,7 @@ class MarigoldPipeline(DiffusionPipeline):
|
||||
# Resize back to original resolution
|
||||
if match_input_res:
|
||||
pred_img = Image.fromarray(depth_pred)
|
||||
pred_img = pred_img.resize(input_size)
|
||||
pred_img = pred_img.resize(input_size, resample=resample_method)
|
||||
depth_pred = np.asarray(pred_img)
|
||||
|
||||
# Clip output range
|
||||
@@ -243,12 +273,32 @@ class MarigoldPipeline(DiffusionPipeline):
depth_colored_img = Image.fromarray(depth_colored_hwc)
else:
depth_colored_img = None

return MarigoldDepthOutput(
depth_np=depth_pred,
depth_colored=depth_colored_img,
uncertainty=pred_uncert,
)

def _check_inference_step(self, n_step: int):
"""
Check if denoising step is reasonable
Args:
n_step (`int`): denoising steps
"""
assert n_step >= 1

if isinstance(self.scheduler, DDIMScheduler):
if n_step < 10:
logging.warning(
f"Too few denoising steps: {n_step}. Recommended to use the LCM checkpoint for few-step inference."
)
elif isinstance(self.scheduler, LCMScheduler):
if not 1 <= n_step <= 4:
logging.warning(f"Non-optimal setting of denoising steps: {n_step}. Recommended setting is 1-4 steps.")
else:
raise RuntimeError(f"Unsupported scheduler type: {type(self.scheduler)}")

def _encode_empty_text(self):
"""
Encode text embedding for empty prompt.
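Caching an empty-prompt embedding, as `_encode_empty_text` does, typically amounts to tokenizing an empty string once and storing the encoder output. A minimal sketch under that assumption (the CLIP checkpoint below is illustrative, not the pipeline's configured one):

import torch
from transformers import CLIPTextModel, CLIPTokenizer

tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
text_encoder = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14")

text_inputs = tokenizer(
    "",                                    # empty prompt
    padding="max_length",
    max_length=tokenizer.model_max_length,
    truncation=True,
    return_tensors="pt",
)
with torch.no_grad():
    empty_text_embed = text_encoder(text_inputs.input_ids)[0]  # cached and re-batched at inference time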
@@ -265,7 +315,13 @@ class MarigoldPipeline(DiffusionPipeline):
self.empty_text_embed = self.text_encoder(text_input_ids)[0].to(self.dtype)

@torch.no_grad()
def single_infer(self, rgb_in: torch.Tensor, num_inference_steps: int, show_pbar: bool) -> torch.Tensor:
def single_infer(
self,
rgb_in: torch.Tensor,
num_inference_steps: int,
seed: Union[int, None],
show_pbar: bool,
) -> torch.Tensor:
"""
Perform an individual depth prediction without ensembling.

@@ -286,10 +342,20 @@ class MarigoldPipeline(DiffusionPipeline):
timesteps = self.scheduler.timesteps # [T]

# Encode image
rgb_latent = self._encode_rgb(rgb_in)
rgb_latent = self.encode_rgb(rgb_in)

# Initial depth map (noise)
depth_latent = torch.randn(rgb_latent.shape, device=device, dtype=self.dtype) # [B, 4, h, w]
if seed is None:
rand_num_generator = None
else:
rand_num_generator = torch.Generator(device=device)
rand_num_generator.manual_seed(seed)
depth_latent = torch.randn(
rgb_latent.shape,
device=device,
dtype=self.dtype,
generator=rand_num_generator,
) # [B, 4, h, w]

# Batched empty text embedding
if self.empty_text_embed is None:
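The seeded `torch.Generator` above is what makes the initial depth latent reproducible. A small self-contained check of that pattern:

import torch

def make_initial_latent(shape, seed=None, device="cpu", dtype=torch.float32):
    # Same pattern as above: no seed -> fresh noise, fixed seed -> identical noise.
    generator = None
    if seed is not None:
        generator = torch.Generator(device=device)
        generator.manual_seed(seed)
    return torch.randn(shape, device=device, dtype=dtype, generator=generator)

a = make_initial_latent((1, 4, 64, 64), seed=2024)
b = make_initial_latent((1, 4, 64, 64), seed=2024)
assert torch.equal(a, b)  # same seed -> identical initial depth latent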
@@ -314,9 +380,9 @@ class MarigoldPipeline(DiffusionPipeline):
noise_pred = self.unet(unet_input, t, encoder_hidden_states=batch_empty_text_embed).sample # [B, 4, h, w]

# compute the previous noisy sample x_t -> x_t-1
depth_latent = self.scheduler.step(noise_pred, t, depth_latent).prev_sample
torch.cuda.empty_cache()
depth = self._decode_depth(depth_latent)
depth_latent = self.scheduler.step(noise_pred, t, depth_latent, generator=rand_num_generator).prev_sample

depth = self.decode_depth(depth_latent)

# clip prediction
depth = torch.clip(depth, -1.0, 1.0)
@@ -325,7 +391,7 @@ class MarigoldPipeline(DiffusionPipeline):

return depth

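Passing the same generator to `scheduler.step` keeps any stochasticity inside the scheduler reproducible as well. A scheduler-only sketch with a dummy model output standing in for the UNet (a default `DDIMScheduler` configuration is assumed here):

import torch
from diffusers import DDIMScheduler

scheduler = DDIMScheduler()  # default config, for illustration only
scheduler.set_timesteps(10)

generator = torch.Generator().manual_seed(2024)
sample = torch.randn((1, 4, 64, 64), generator=generator)

for t in scheduler.timesteps:
    noise_pred = torch.zeros_like(sample)  # dummy model output in place of the UNet
    sample = scheduler.step(noise_pred, t, sample, generator=generator).prev_sample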
def _encode_rgb(self, rgb_in: torch.Tensor) -> torch.Tensor:
def encode_rgb(self, rgb_in: torch.Tensor) -> torch.Tensor:
"""
Encode RGB image into latent.

@@ -344,7 +410,7 @@ class MarigoldPipeline(DiffusionPipeline):
rgb_latent = mean * self.rgb_latent_scale_factor
return rgb_latent

def _decode_depth(self, depth_latent: torch.Tensor) -> torch.Tensor:
def decode_depth(self, depth_latent: torch.Tensor) -> torch.Tensor:
"""
Decode depth latent into depth map.

@@ -365,7 +431,7 @@ class MarigoldPipeline(DiffusionPipeline):
return depth_mean

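`encode_rgb` and `decode_depth` wrap the VAE. A generic sketch of encoding to and decoding from Stable Diffusion latent space with `AutoencoderKL`; the checkpoint and the 0.18215 scaling factor are the usual SD values, assumed here rather than taken from this diff:

import torch
from diffusers import AutoencoderKL

vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse")  # illustrative checkpoint
scale = 0.18215  # typical SD latent scaling factor

rgb = torch.rand(1, 3, 512, 512) * 2 - 1  # [-1, 1] range expected by the VAE
with torch.no_grad():
    latent = vae.encode(rgb).latent_dist.mean * scale  # [1, 4, 64, 64]
    recon = vae.decode(latent / scale).sample          # back to [1, 3, 512, 512]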
@staticmethod
def resize_max_res(img: Image.Image, max_edge_resolution: int) -> Image.Image:
def resize_max_res(img: Image.Image, max_edge_resolution: int, resample_method=Resampling.BILINEAR) -> Image.Image:
"""
Resize image to limit maximum edge length while keeping aspect ratio.

@@ -374,6 +440,8 @@ class MarigoldPipeline(DiffusionPipeline):
Image to be resized.
max_edge_resolution (`int`):
Maximum edge length (pixel).
resample_method (`PIL.Image.Resampling`):
Resampling method used to resize images.

Returns:
`Image.Image`: Resized image.
@@ -384,7 +452,7 @@ class MarigoldPipeline(DiffusionPipeline):
new_width = int(original_width * downscale_factor)
new_height = int(original_height * downscale_factor)

resized_img = img.resize((new_width, new_height))
resized_img = img.resize((new_width, new_height), resample=resample_method)
return resized_img

@staticmethod

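The resize keeps aspect ratio by applying one downscale factor to both edges. A usage sketch of that arithmetic, assuming the helper derives a single min-based factor from the longer edge (the values below are examples):

from PIL import Image
from PIL.Image import Resampling

img = Image.new("RGB", (2000, 1500))
max_edge_resolution = 768

# One factor for both edges preserves the aspect ratio while capping the longer edge.
downscale_factor = min(max_edge_resolution / img.width, max_edge_resolution / img.height)
new_size = (int(img.width * downscale_factor), int(img.height * downscale_factor))  # (768, 576)
resized = img.resize(new_size, resample=Resampling.BILINEAR)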
@@ -196,7 +196,7 @@ class StableDiffusionTilingPipeline(DiffusionPipeline, StableDiffusionExtrasMixi
guidance_scale_tiles: specific weights for classifier-free guidance in each tile.
guidance_scale_tiles: specific weights for classifier-free guidance in each tile. If None, the value provided in guidance_scale will be used.
seed_tiles: specific seeds for the initialization latents in each tile. These will override the latents generated for the whole canvas using the standard seed parameter.
seed_tiles_mode: either "full" "exclusive". If "full", all the latents affected by the tile be overriden. If "exclusive", only the latents that are affected exclusively by this tile (and no other tiles) will be overrriden.
seed_tiles_mode: either "full" "exclusive". If "full", all the latents affected by the tile be overriden. If "exclusive", only the latents that are affected exclusively by this tile (and no other tiles) will be overriden.
seed_reroll_regions: a list of tuples in the form (start row, end row, start column, end column, seed) defining regions in pixel space for which the latents will be overriden using the given seed. Takes priority over seed_tiles.
cpu_vae: the decoder from latent space to pixel space can require too mucho GPU RAM for large images. If you find out of memory errors at the end of the generation process, try setting this parameter to True to run the decoder in CPU. Slower, but should run without memory issues.

@@ -325,7 +325,7 @@ class StableDiffusionTilingPipeline(DiffusionPipeline, StableDiffusionExtrasMixi
if accepts_eta:
extra_step_kwargs["eta"] = eta

# Mask for tile weights strenght
# Mask for tile weights strength
tile_weights = self._gaussian_weights(tile_width, tile_height, batch_size)

# Diffusion timesteps

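`_gaussian_weights` builds the per-tile blending mask mentioned in the corrected comment. A sketch of the idea, a separable 2D Gaussian that down-weights tile borders so overlapping tiles blend smoothly; the spread below is an arbitrary illustrative choice, not the pipeline's exact constants:

import numpy as np

def gaussian_tile_weights(tile_width: int, tile_height: int) -> np.ndarray:
    # Separable Gaussian bump centered on the tile; borders get low weight so overlaps blend.
    def axis_weights(n: int) -> np.ndarray:
        x = np.arange(n)
        midpoint = (n - 1) / 2
        std = n / 4  # illustrative spread
        return np.exp(-((x - midpoint) ** 2) / (2 * std**2))

    return np.outer(axis_weights(tile_height), axis_weights(tile_width))  # [tile_height, tile_width]

weights = gaussian_tile_weights(640, 512)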
@@ -373,18 +373,29 @@ class AnimateDiffControlNetPipeline(
return prompt_embeds, negative_prompt_embeds

# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
def encode_image(self, image, device, num_images_per_prompt):
def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
dtype = next(self.image_encoder.parameters()).dtype

if not isinstance(image, torch.Tensor):
image = self.feature_extractor(image, return_tensors="pt").pixel_values

image = image.to(device=device, dtype=dtype)
image_embeds = self.image_encoder(image).image_embeds
image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
if output_hidden_states:
image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
uncond_image_enc_hidden_states = self.image_encoder(
torch.zeros_like(image), output_hidden_states=True
).hidden_states[-2]
uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(
num_images_per_prompt, dim=0
)
return image_enc_hidden_states, uncond_image_enc_hidden_states
else:
image_embeds = self.image_encoder(image).image_embeds
image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
uncond_image_embeds = torch.zeros_like(image_embeds)

uncond_image_embeds = torch.zeros_like(image_embeds)
return image_embeds, uncond_image_embeds
return image_embeds, uncond_image_embeds

# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds
def prepare_ip_adapter_image_embeds(
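The `output_hidden_states` branch added above returns penultimate hidden states (as used by IP-Adapter-Plus-style image projections) instead of the pooled `image_embeds`. A usage sketch against a CLIP vision tower; the checkpoint is an illustrative assumption:

import torch
from PIL import Image
from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection

image_encoder = CLIPVisionModelWithProjection.from_pretrained("openai/clip-vit-base-patch32")
feature_extractor = CLIPImageProcessor.from_pretrained("openai/clip-vit-base-patch32")

pixel_values = feature_extractor(Image.new("RGB", (224, 224)), return_tensors="pt").pixel_values
with torch.no_grad():
    pooled = image_encoder(pixel_values).image_embeds  # pooled, projected embedding
    hidden = image_encoder(pixel_values, output_hidden_states=True).hidden_states[-2]  # penultimate layer tokens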
@@ -832,7 +843,7 @@ class AnimateDiffControlNetPipeline(
clip_skip (`int`, *optional*):
Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
the output of the pre-final layer will be used for computing the prompt embeddings.
allback_on_step_end (`Callable`, *optional*):
callback_on_step_end (`Callable`, *optional*):
A function that calls at the end of each denoising steps during the inference. The function is called
with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
@@ -840,7 +851,7 @@ class AnimateDiffControlNetPipeline(
callback_on_step_end_tensor_inputs (`List`, *optional*):
The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
`._callback_tensor_inputs` attribute of your pipeine class.
`._callback_tensor_inputs` attribute of your pipeline class.

Examples:

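A sketch of a `callback_on_step_end` function following the documented signature; it may only read or write the tensors registered via `callback_on_step_end_tensor_inputs`, and the late guidance tweak shown here is purely illustrative, not something this diff prescribes:

def callback_on_step_end(pipeline, step, timestep, callback_kwargs):
    # Inspect or modify tensors registered via callback_on_step_end_tensor_inputs.
    latents = callback_kwargs["latents"]
    if step == 10:
        pipeline._guidance_scale = 1.0  # e.g. switch off classifier-free guidance late (illustrative)
    callback_kwargs["latents"] = latents
    return callback_kwargs

# passed to the pipeline call, e.g.:
# pipe(..., callback_on_step_end=callback_on_step_end,
#      callback_on_step_end_tensor_inputs=["latents"])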
@@ -477,7 +477,12 @@ class DemoFusionSDXLPipeline(

# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
shape = (
batch_size,
num_channels_latents,
int(height) // self.vae_scale_factor,
int(width) // self.vae_scale_factor,
)
if isinstance(generator, list) and len(generator) != batch_size:
raise ValueError(
f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
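The `int(...)` casts added above guard against `height`/`width` arriving as floats, which would otherwise make the latent shape a tuple of floats and break tensor creation. A small sketch of the shape arithmetic together with diffusers' `randn_tensor` helper:

import torch
from diffusers.utils.torch_utils import randn_tensor

height, width = 1024.0, 1024.0  # e.g. computed upstream as floats; 1024.0 // 8 would stay a float
vae_scale_factor = 8

shape = (1, 4, int(height) // vae_scale_factor, int(width) // vae_scale_factor)  # (1, 4, 128, 128)
generator = torch.Generator().manual_seed(0)
latents = randn_tensor(shape, generator=generator, device=torch.device("cpu"), dtype=torch.float32)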
@@ -1280,7 +1285,7 @@ class DemoFusionSDXLPipeline(

return output_images

# Overrride to properly handle the loading and unloading of the additional text encoder.
# Override to properly handle the loading and unloading of the additional text encoder.
def load_lora_weights(self, pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]], **kwargs):
# We could have accessed the unet config from `lora_state_dict()` too. We pass
# it here explicitly to be able to tell that it's coming from an SDXL

@@ -151,7 +151,7 @@ def concat_first(feat: torch.Tensor, dim: int = 2, scale: float = 1.0) -> torch.
return torch.cat((feat, feat_style), dim=dim)


def calc_mean_std(feat: torch.Tensor, eps: float = 1e-5) -> tuple[torch.Tensor, torch.Tensor]:
def calc_mean_std(feat: torch.Tensor, eps: float = 1e-5) -> Tuple[torch.Tensor, torch.Tensor]:
feat_std = (feat.var(dim=-2, keepdims=True) + eps).sqrt()
feat_mean = feat.mean(dim=-2, keepdims=True)
return feat_mean, feat_std
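`calc_mean_std` returns per-location statistics over `dim=-2`, which is what AdaIN-style feature alignment in shared-attention blocks needs. A minimal, self-contained sketch of using such statistics for AdaIN; the normalization form below is the standard one, and whether the pipeline applies exactly this is not shown in this hunk:

import torch
from typing import Tuple

def calc_mean_std(feat: torch.Tensor, eps: float = 1e-5) -> Tuple[torch.Tensor, torch.Tensor]:
    # Mirrors the helper above (keepdim spelled with torch's canonical kwarg).
    feat_std = (feat.var(dim=-2, keepdim=True) + eps).sqrt()
    feat_mean = feat.mean(dim=-2, keepdim=True)
    return feat_mean, feat_std

def adain(content: torch.Tensor, style: torch.Tensor) -> torch.Tensor:
    # Standard AdaIN: renormalize content features to the style features' statistics.
    c_mean, c_std = calc_mean_std(content)
    s_mean, s_std = calc_mean_std(style)
    return (content - c_mean) / c_std * s_std + s_mean

content = torch.rand(2, 77, 640)
style = torch.rand(2, 77, 640)
aligned = adain(content, style)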
@@ -887,7 +887,7 @@ class StyleAlignedSDXLPipeline(
# because `num_inference_steps` might be even given that every timestep
# (except the highest one) is duplicated. If `num_inference_steps` is even it would
# mean that we cut the timesteps in the middle of the denoising step
# (between 1st and 2nd devirative) which leads to incorrect results. By adding 1
# (between 1st and 2nd derivative) which leads to incorrect results. By adding 1
# we ensure that the denoising process always ends after the 2nd derivate step of the scheduler
num_inference_steps = num_inference_steps + 1

@@ -919,7 +919,12 @@ class StyleAlignedSDXLPipeline(
batch_size *= num_images_per_prompt

if image is None:
shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
shape = (
batch_size,
num_channels_latents,
int(height) // self.vae_scale_factor,
int(width) // self.vae_scale_factor,
)
if isinstance(generator, list) and len(generator) != batch_size:
raise ValueError(
f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
@@ -999,7 +1004,12 @@ class StyleAlignedSDXLPipeline(
return latents

else:
shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
shape = (
batch_size,
num_channels_latents,
int(height) // self.vae_scale_factor,
int(width) // self.vae_scale_factor,
)
if isinstance(generator, list) and len(generator) != batch_size:
raise ValueError(
f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"

examples/community/pipeline_stable_diffusion_pag.py (new file, 1482 lines): file diff suppressed because it is too large.
@@ -26,7 +26,7 @@ from diffusers.loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInver
from diffusers.models import AutoencoderKL, UNet2DConditionModel
from diffusers.models.lora import adjust_lora_scale_text_encoder
from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker
from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_ldm3d import LDM3DPipelineOutput
from diffusers.pipelines.stable_diffusion_ldm3d.pipeline_stable_diffusion_ldm3d import LDM3DPipelineOutput
from diffusers.schedulers import DDPMScheduler, KarrasDiffusionSchedulers
from diffusers.utils import (
USE_PEFT_BACKEND,

Some files were not shown because too many files have changed in this diff.