mirror of
https://github.com/huggingface/diffusers.git
synced 2025-12-06 20:44:33 +08:00
Compare commits
398 Commits
rename-att
...
fix-tests
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
ed4cc212cc | ||
|
|
ee695ccd50 | ||
|
|
adc5d11d5c | ||
|
|
0aeefa051c | ||
|
|
e4f8dca9a0 | ||
|
|
0267c5233a | ||
|
|
be4afa0bb4 | ||
|
|
04f4bd54ea | ||
|
|
82be58c512 | ||
|
|
6695635696 | ||
|
|
b934215d4c | ||
|
|
5ed3abd371 | ||
|
|
1087a510b5 | ||
|
|
305f2b4498 | ||
|
|
cb0f3b49cb | ||
|
|
caf9e985df | ||
|
|
c1c42698c9 | ||
|
|
75aab34675 | ||
|
|
35358a2dec | ||
|
|
818f760732 | ||
|
|
f29b93488d | ||
|
|
d50baf0c63 | ||
|
|
c2217142bd | ||
|
|
8edaf3b79c | ||
|
|
23e091564f | ||
|
|
0d23645bd1 | ||
|
|
7fa3e5b0f6 | ||
|
|
49b959b540 | ||
|
|
58237364b1 | ||
|
|
3e35628873 | ||
|
|
6a479588db | ||
|
|
fa489eaed6 | ||
|
|
0d7c479023 | ||
|
|
ce97d7e19b | ||
|
|
44ba90caff | ||
|
|
3c85a57297 | ||
|
|
03ca11318e | ||
|
|
3ffa7b46e5 | ||
|
|
c1b2a89e34 | ||
|
|
435d37ce5a | ||
|
|
5915c2985d | ||
|
|
21a7ff12a7 | ||
|
|
8909ab4b19 | ||
|
|
c1edb03c37 | ||
|
|
0d08370263 | ||
|
|
b8ccb46259 | ||
|
|
725ead2f5e | ||
|
|
26a7851e1e | ||
|
|
3fd31eef51 | ||
|
|
b02e2113ff | ||
|
|
21f023ec1a | ||
|
|
31d9f9ea77 | ||
|
|
f53352f750 | ||
|
|
83ae24ce2d | ||
|
|
8af793b2d4 | ||
|
|
eb96ff0d59 | ||
|
|
a38dd79512 | ||
|
|
b1c5817a89 | ||
|
|
235d34cf56 | ||
|
|
5029673987 | ||
|
|
56bd7e67c2 | ||
|
|
9d16daaf64 | ||
|
|
8e4ca1b6b2 | ||
|
|
0d2d424fbe | ||
|
|
e24e54fdfa | ||
|
|
ebc99a77aa | ||
|
|
fa750a15bd | ||
|
|
181688012a | ||
|
|
142f353e1c | ||
|
|
b833d0fc80 | ||
|
|
e963621649 | ||
|
|
39215aa30e | ||
|
|
9ef43f38d4 | ||
|
|
88018fcf20 | ||
|
|
7404f1e9dc | ||
|
|
5a69227863 | ||
|
|
fc9fecc217 | ||
|
|
065f251766 | ||
|
|
21c747fa0f | ||
|
|
09129842e7 | ||
|
|
33b363edfa | ||
|
|
a9dd86029e | ||
|
|
9100652494 | ||
|
|
d1e3f489e9 | ||
|
|
ae05050db9 | ||
|
|
db969cc16d | ||
|
|
3cfe187dc7 | ||
|
|
90250d9e48 | ||
|
|
e5674015f3 | ||
|
|
b5c8b555d7 | ||
|
|
e23c27e905 | ||
|
|
7635d3d37f | ||
|
|
9132ce7c58 | ||
|
|
30c977d1f5 | ||
|
|
f0fa17dd8e | ||
|
|
c726d02beb | ||
|
|
a68503f221 | ||
|
|
9d50f7eec1 | ||
|
|
fda1531d8a | ||
|
|
cf6e0407e0 | ||
|
|
1c000d46e1 | ||
|
|
08bf754507 | ||
|
|
2f23437618 | ||
|
|
2523390c26 | ||
|
|
279de3c3ff | ||
|
|
8e14535708 | ||
|
|
0bee4d336b | ||
|
|
42f25d601a | ||
|
|
33c5d125cb | ||
|
|
aa1f00fd01 | ||
|
|
d95b993427 | ||
|
|
1d480298c1 | ||
|
|
b2323aa2b7 | ||
|
|
37e9d695af | ||
|
|
a402431de0 | ||
|
|
b99b1617cf | ||
|
|
3e4a6bd2d4 | ||
|
|
c827e94da0 | ||
|
|
44f6b859bf | ||
|
|
ac7ff7d4a3 | ||
|
|
a0cf607667 | ||
|
|
a341b536a8 | ||
|
|
8e46d97cd8 | ||
|
|
7e808e768a | ||
|
|
7e39516627 | ||
|
|
56a76082ed | ||
|
|
6133d98ff7 | ||
|
|
1c60e094de | ||
|
|
71f49a5d2a | ||
|
|
35db2fdea9 | ||
|
|
ad55ce6100 | ||
|
|
a9a5b14f35 | ||
|
|
aa19025989 | ||
|
|
19ab04ff56 | ||
|
|
c24addcd6a | ||
|
|
1a1817aec7 | ||
|
|
f54a8977cb | ||
|
|
9c842794c7 | ||
|
|
100df1756a | ||
|
|
10748e5d9d | ||
|
|
21ddb79fb9 | ||
|
|
91be9ac647 | ||
|
|
4a34307702 | ||
|
|
8e963d1c2a | ||
|
|
2b04ec2ff7 | ||
|
|
000fa82a1e | ||
|
|
5d83f50c23 | ||
|
|
5d21d4a204 | ||
|
|
73ba81090e | ||
|
|
7956c36aaa | ||
|
|
5266ab7935 | ||
|
|
7f724a930e | ||
|
|
9bef9f4be7 | ||
|
|
7aa4514260 | ||
|
|
c2e87869be | ||
|
|
ca61287daa | ||
|
|
f0c81562a4 | ||
|
|
9d20ed37a2 | ||
|
|
bda1d4faf8 | ||
|
|
77103d71ca | ||
|
|
0302446819 | ||
|
|
4d39b7483d | ||
|
|
fac761694a | ||
|
|
34c90dbb31 | ||
|
|
e49c04d5d6 | ||
|
|
f238cb0736 | ||
|
|
d78acdedc1 | ||
|
|
6df103deba | ||
|
|
73f28708be | ||
|
|
0cbc78f04c | ||
|
|
0cc5630945 | ||
|
|
0b8e29289d | ||
|
|
ab38ddf64f | ||
|
|
ead82fedea | ||
|
|
45b42d1203 | ||
|
|
5199ee4f7b | ||
|
|
544710ef0f | ||
|
|
443aa14e41 | ||
|
|
288632adf6 | ||
|
|
5ce79cbded | ||
|
|
d52f3e30f8 | ||
|
|
699dfb084c | ||
|
|
484c8ef399 | ||
|
|
0dd0528851 | ||
|
|
1cd4732e7f | ||
|
|
a51b6cc86a | ||
|
|
3bce0f3da1 | ||
|
|
9a34953823 | ||
|
|
e29f16cfaa | ||
|
|
f7dfcfd971 | ||
|
|
3c67864c5a | ||
|
|
363699044e | ||
|
|
9613576191 | ||
|
|
e4356d6488 | ||
|
|
82441460ef | ||
|
|
3e1097cb63 | ||
|
|
78990dd960 | ||
|
|
405a1facd2 | ||
|
|
3028089e5e | ||
|
|
b536f39818 | ||
|
|
e25e525fde | ||
|
|
de9adb907c | ||
|
|
bf861e65dc | ||
|
|
4da810b943 | ||
|
|
161c6e14b6 | ||
|
|
a6c9015c4e | ||
|
|
e6a5f99e5c | ||
|
|
80ff4ba63e | ||
|
|
b09a2aa308 | ||
|
|
63b6846849 | ||
|
|
139f707e6e | ||
|
|
e4546fd5bb | ||
|
|
d44e31aec2 | ||
|
|
ce9825b56b | ||
|
|
85f9d92883 | ||
|
|
916d9812a8 | ||
|
|
e6a8492242 | ||
|
|
ad0308b3f1 | ||
|
|
e97a633b63 | ||
|
|
01ac37b331 | ||
|
|
6a05b274cc | ||
|
|
98d46a3f08 | ||
|
|
4330a747d4 | ||
|
|
76de6a09fb | ||
|
|
25caf24ef9 | ||
|
|
8db3c9bc9f | ||
|
|
e0e9f81971 | ||
|
|
5d848ec07c | ||
|
|
4974b84564 | ||
|
|
83062fb872 | ||
|
|
b6d7e31d10 | ||
|
|
53e9aacc10 | ||
|
|
41424466e3 | ||
|
|
95de1981c9 | ||
|
|
0b45b58867 | ||
|
|
d3986f18be | ||
|
|
ee6a3a993d | ||
|
|
b300517305 | ||
|
|
ac07b6dc6a | ||
|
|
46ab56a468 | ||
|
|
038ff70023 | ||
|
|
00eca4b887 | ||
|
|
30132aba30 | ||
|
|
a17d6d6858 | ||
|
|
8efd9ce787 | ||
|
|
299c16d0f5 | ||
|
|
69f49195ac | ||
|
|
ed224f94ba | ||
|
|
531e719163 | ||
|
|
4fbd310fd2 | ||
|
|
2ea28d69dc | ||
|
|
a1cb106459 | ||
|
|
5dd8e04d4b | ||
|
|
165af7edd3 | ||
|
|
6c5f0de713 | ||
|
|
e64fdcf2ce | ||
|
|
ec64f371b1 | ||
|
|
cd6e1f1171 | ||
|
|
6f2b310a17 | ||
|
|
e3cd6cae50 | ||
|
|
e5ee05da76 | ||
|
|
e6ff752840 | ||
|
|
3f9c746fb2 | ||
|
|
1f22c98820 | ||
|
|
b4226bd6a7 | ||
|
|
46fac824be | ||
|
|
b33b64f595 | ||
|
|
9d9744075e | ||
|
|
d9a3b69806 | ||
|
|
f7e5954d5e | ||
|
|
8e19c073e5 | ||
|
|
f6df16cbb8 | ||
|
|
b24f78349c | ||
|
|
3ce905c9d0 | ||
|
|
f539497ab4 | ||
|
|
39dfb7abbd | ||
|
|
196835695e | ||
|
|
0d4dfbbd0a | ||
|
|
ada3bb941b | ||
|
|
b5814c5555 | ||
|
|
9940573618 | ||
|
|
59433ca1ae | ||
|
|
534f5d54fa | ||
|
|
40aa47b998 | ||
|
|
1bc0d37ffe | ||
|
|
eb942b866a | ||
|
|
687bc27727 | ||
|
|
6246c70d21 | ||
|
|
577b8a2783 | ||
|
|
13f0c8b219 | ||
|
|
fa1bdce3d4 | ||
|
|
ca6cdc77a9 | ||
|
|
f4977abcd8 | ||
|
|
df8559a7f9 | ||
|
|
8f206a5873 | ||
|
|
8da360aa12 | ||
|
|
869bad3e52 | ||
|
|
01ee0978cc | ||
|
|
56b68459f5 | ||
|
|
2ca264244b | ||
|
|
b9e1c30d0e | ||
|
|
03cd62520f | ||
|
|
001b14023e | ||
|
|
f55873b783 | ||
|
|
ccb93dcad1 | ||
|
|
ec953047bc | ||
|
|
9a2600ede9 | ||
|
|
5f150c4cef | ||
|
|
66f8bd6869 | ||
|
|
64a8cd627a | ||
|
|
5d3923b670 | ||
|
|
9451235e5a | ||
|
|
c2b6ac4e34 | ||
|
|
06b01ea87e | ||
|
|
f4fc75035f | ||
|
|
8f2d13c684 | ||
|
|
fcfa270fbd | ||
|
|
56dac1cedc | ||
|
|
3daebe2b44 | ||
|
|
abd922bd0c | ||
|
|
fa633ed6de | ||
|
|
2cad1a8465 | ||
|
|
e6cf21906d | ||
|
|
7db935a141 | ||
|
|
fa9bc029b4 | ||
|
|
2e31a759b5 | ||
|
|
e51862bbed | ||
|
|
8492db2332 | ||
|
|
f57e7bd92c | ||
|
|
3e3d46924b | ||
|
|
d71ecad8cd | ||
|
|
ac49f97a75 | ||
|
|
04bafcbbc2 | ||
|
|
7081a25618 | ||
|
|
848f9fe6ce | ||
|
|
8a692739c0 | ||
|
|
5aa31bd674 | ||
|
|
88aa7f6ebf | ||
|
|
ad310af0d6 | ||
|
|
d603ccb614 | ||
|
|
fd0f469568 | ||
|
|
ae84e405a3 | ||
|
|
3a66113306 | ||
|
|
7f16187182 | ||
|
|
f11b922b4f | ||
|
|
3dd4168d4c | ||
|
|
1c47d1fc05 | ||
|
|
bbf70c8739 | ||
|
|
738c986957 | ||
|
|
c09bb588d3 | ||
|
|
66a7160f9d | ||
|
|
f05ee56b2f | ||
|
|
34cc7f9b98 | ||
|
|
53605ed00a | ||
|
|
bb1b76d3bf | ||
|
|
e4b8f173b9 | ||
|
|
f0216b7756 | ||
|
|
d5f444de4b | ||
|
|
5a54dc9e95 | ||
|
|
6fedbd850a | ||
|
|
1b3cfb1b10 | ||
|
|
af13a90ebd | ||
|
|
3067da1261 | ||
|
|
6bceaea3fe | ||
|
|
baf9924be7 | ||
|
|
d8d208acde | ||
|
|
e0f33dfca4 | ||
|
|
15b125bb0e | ||
|
|
12004bf3a7 | ||
|
|
d2fc5ebb95 | ||
|
|
779eef95b4 | ||
|
|
d5b8d1ca04 | ||
|
|
eba7e7a6d7 | ||
|
|
31de879fb4 | ||
|
|
07349c25fe | ||
|
|
8974c50bff | ||
|
|
c18058b405 | ||
|
|
2938d5a672 | ||
|
|
d4ade821cd | ||
|
|
3a7e481611 | ||
|
|
d649d6c6f3 | ||
|
|
777063e1bf | ||
|
|
104afbce84 | ||
|
|
c0f5346a20 | ||
|
|
087daee2f0 | ||
|
|
7e164d98a8 | ||
|
|
e6d1728e0a | ||
|
|
8f2c7b4df0 | ||
|
|
2e387dad5f | ||
|
|
9efe1e52c3 | ||
|
|
37b09517b9 | ||
|
|
4343ce2c8e | ||
|
|
0ca7b68198 | ||
|
|
3cf4f9c735 | ||
|
|
40dd9cb2bd | ||
|
|
30bcda7de6 | ||
|
|
9ea62d119a | ||
|
|
a326d61118 |
38
.github/ISSUE_TEMPLATE/bug-report.yml
vendored
38
.github/ISSUE_TEMPLATE/bug-report.yml
vendored
@@ -66,32 +66,32 @@ body:
|
||||
Questions on DiffusionPipeline (Saving, Loading, From pretrained, ...):
|
||||
|
||||
Questions on pipelines:
|
||||
- Stable Diffusion @yiyixuxu @DN6 @sayakpaul @patrickvonplaten
|
||||
- Stable Diffusion XL @yiyixuxu @sayakpaul @DN6 @patrickvonplaten
|
||||
- Kandinsky @yiyixuxu @patrickvonplaten
|
||||
- ControlNet @sayakpaul @yiyixuxu @DN6 @patrickvonplaten
|
||||
- T2I Adapter @sayakpaul @yiyixuxu @DN6 @patrickvonplaten
|
||||
- IF @DN6 @patrickvonplaten
|
||||
- Text-to-Video / Video-to-Video @DN6 @sayakpaul @patrickvonplaten
|
||||
- Wuerstchen @DN6 @patrickvonplaten
|
||||
- Stable Diffusion @yiyixuxu @DN6 @sayakpaul
|
||||
- Stable Diffusion XL @yiyixuxu @sayakpaul @DN6
|
||||
- Kandinsky @yiyixuxu
|
||||
- ControlNet @sayakpaul @yiyixuxu @DN6
|
||||
- T2I Adapter @sayakpaul @yiyixuxu @DN6
|
||||
- IF @DN6
|
||||
- Text-to-Video / Video-to-Video @DN6 @sayakpaul
|
||||
- Wuerstchen @DN6
|
||||
- Other: @yiyixuxu @DN6
|
||||
|
||||
Questions on models:
|
||||
- UNet @DN6 @yiyixuxu @sayakpaul @patrickvonplaten
|
||||
- VAE @sayakpaul @DN6 @yiyixuxu @patrickvonplaten
|
||||
- Transformers/Attention @DN6 @yiyixuxu @sayakpaul @DN6 @patrickvonplaten
|
||||
- UNet @DN6 @yiyixuxu @sayakpaul
|
||||
- VAE @sayakpaul @DN6 @yiyixuxu
|
||||
- Transformers/Attention @DN6 @yiyixuxu @sayakpaul @DN6
|
||||
|
||||
Questions on Schedulers: @yiyixuxu @patrickvonplaten
|
||||
Questions on Schedulers: @yiyixuxu
|
||||
|
||||
Questions on LoRA: @sayakpaul @patrickvonplaten
|
||||
Questions on LoRA: @sayakpaul
|
||||
|
||||
Questions on Textual Inversion: @sayakpaul @patrickvonplaten
|
||||
Questions on Textual Inversion: @sayakpaul
|
||||
|
||||
Questions on Training:
|
||||
- DreamBooth @sayakpaul @patrickvonplaten
|
||||
- Text-to-Image Fine-tuning @sayakpaul @patrickvonplaten
|
||||
- Textual Inversion @sayakpaul @patrickvonplaten
|
||||
- ControlNet @sayakpaul @patrickvonplaten
|
||||
- DreamBooth @sayakpaul
|
||||
- Text-to-Image Fine-tuning @sayakpaul
|
||||
- Textual Inversion @sayakpaul
|
||||
- ControlNet @sayakpaul
|
||||
|
||||
Questions on Tests: @DN6 @sayakpaul @yiyixuxu
|
||||
|
||||
@@ -99,7 +99,7 @@ body:
|
||||
|
||||
Questions on JAX- and MPS-related things: @pcuenca
|
||||
|
||||
Questions on audio pipelines: @DN6 @patrickvonplaten
|
||||
Questions on audio pipelines: @DN6
|
||||
|
||||
|
||||
|
||||
|
||||
10
.github/PULL_REQUEST_TEMPLATE.md
vendored
10
.github/PULL_REQUEST_TEMPLATE.md
vendored
@@ -38,13 +38,13 @@ members/contributors who may be interested in your PR.
|
||||
|
||||
Core library:
|
||||
|
||||
- Schedulers: @yiyixuxu and @patrickvonplaten
|
||||
- Pipelines: @patrickvonplaten and @sayakpaul
|
||||
- Training examples: @sayakpaul and @patrickvonplaten
|
||||
- Docs: @stevhliu and @yiyixuxu
|
||||
- Schedulers: @yiyixuxu
|
||||
- Pipelines: @sayakpaul @yiyixuxu @DN6
|
||||
- Training examples: @sayakpaul
|
||||
- Docs: @stevhliu and @sayakpaul
|
||||
- JAX and MPS: @pcuenca
|
||||
- Audio: @sanchit-gandhi
|
||||
- General functionalities: @patrickvonplaten and @sayakpaul
|
||||
- General functionalities: @sayakpaul @yiyixuxu @DN6
|
||||
|
||||
Integrations:
|
||||
|
||||
|
||||
7
.github/workflows/benchmark.yml
vendored
7
.github/workflows/benchmark.yml
vendored
@@ -1,6 +1,7 @@
|
||||
name: Benchmarking tests
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
schedule:
|
||||
- cron: "30 1 1,15 * *" # every 2 weeks on the 1st and the 15th of every month at 1:30 AM
|
||||
|
||||
@@ -30,9 +31,9 @@ jobs:
|
||||
nvidia-smi
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
apt-get update && apt-get install libsndfile1-dev libgl1 -y
|
||||
python -m pip install -e .[quality,test]
|
||||
python -m pip install pandas
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m uv pip install -e [quality,test]
|
||||
python -m uv pip install pandas peft
|
||||
- name: Environment
|
||||
run: |
|
||||
python utils/print_env.py
|
||||
|
||||
75
.github/workflows/build_docker_images.yml
vendored
75
.github/workflows/build_docker_images.yml
vendored
@@ -1,21 +1,58 @@
|
||||
name: Build Docker images (nightly)
|
||||
name: Test, build, and push Docker images
|
||||
|
||||
on:
|
||||
pull_request: # During PRs, we just check if the changes Dockerfiles can be successfully built
|
||||
branches:
|
||||
- main
|
||||
paths:
|
||||
- "docker/**"
|
||||
workflow_dispatch:
|
||||
schedule:
|
||||
- cron: "0 0 * * *" # every day at midnight
|
||||
|
||||
concurrency:
|
||||
group: docker-image-builds
|
||||
cancel-in-progress: false
|
||||
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
env:
|
||||
REGISTRY: diffusers
|
||||
CI_SLACK_CHANNEL: ${{ secrets.CI_DOCKER_CHANNEL }}
|
||||
|
||||
jobs:
|
||||
build-docker-images:
|
||||
runs-on: ubuntu-latest
|
||||
test-build-docker-images:
|
||||
runs-on: [ self-hosted, intel-cpu, 8-cpu, ci ]
|
||||
if: github.event_name == 'pull_request'
|
||||
steps:
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v1
|
||||
|
||||
- name: Check out code
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Find Changed Dockerfiles
|
||||
id: file_changes
|
||||
uses: jitterbit/get-changed-files@v1
|
||||
with:
|
||||
format: 'space-delimited'
|
||||
token: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
- name: Build Changed Docker Images
|
||||
run: |
|
||||
CHANGED_FILES="${{ steps.file_changes.outputs.all }}"
|
||||
for FILE in $CHANGED_FILES; do
|
||||
if [[ "$FILE" == docker/*Dockerfile ]]; then
|
||||
DOCKER_PATH="${FILE%/Dockerfile}"
|
||||
DOCKER_TAG=$(basename "$DOCKER_PATH")
|
||||
echo "Building Docker image for $DOCKER_TAG"
|
||||
docker build -t "$DOCKER_TAG" "$DOCKER_PATH"
|
||||
fi
|
||||
done
|
||||
if: steps.file_changes.outputs.all != ''
|
||||
|
||||
build-and-push-docker-images:
|
||||
runs-on: [ self-hosted, intel-cpu, 8-cpu, ci ]
|
||||
if: github.event_name != 'pull_request'
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
packages: write
|
||||
@@ -36,13 +73,13 @@ jobs:
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v1
|
||||
- name: Login to Docker Hub
|
||||
uses: docker/login-action@v2
|
||||
with:
|
||||
username: ${{ env.REGISTRY }}
|
||||
password: ${{ secrets.DOCKERHUB_TOKEN }}
|
||||
|
||||
- name: Build and push
|
||||
uses: docker/build-push-action@v3
|
||||
with:
|
||||
@@ -50,3 +87,27 @@ jobs:
|
||||
context: ./docker/${{ matrix.image-name }}
|
||||
push: true
|
||||
tags: ${{ env.REGISTRY }}/${{ matrix.image-name }}:latest
|
||||
|
||||
- name: Post to a Slack channel
|
||||
id: slack
|
||||
uses: slackapi/slack-github-action@6c661ce58804a1a20f6dc5fbee7f0381b469e001
|
||||
with:
|
||||
# Slack channel id, channel name, or user id to post message.
|
||||
# See also: https://api.slack.com/methods/chat.postMessage#channels
|
||||
channel-id: ${{ env.CI_SLACK_CHANNEL }}
|
||||
# For posting a rich message using Block Kit
|
||||
payload: |
|
||||
{
|
||||
"text": "${{ matrix.image-name }} Docker Image build result: ${{ job.status }}\n${{ github.event.head_commit.url }}",
|
||||
"blocks": [
|
||||
{
|
||||
"type": "section",
|
||||
"text": {
|
||||
"type": "mrkdwn",
|
||||
"text": "${{ matrix.image-name }} Docker Image build result: ${{ job.status }}\n${{ github.event.head_commit.url }}"
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
env:
|
||||
SLACK_BOT_TOKEN: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
|
||||
|
||||
4
.github/workflows/build_documentation.yml
vendored
4
.github/workflows/build_documentation.yml
vendored
@@ -7,6 +7,10 @@ on:
|
||||
- doc-builder*
|
||||
- v*-release
|
||||
- v*-patch
|
||||
paths:
|
||||
- "src/diffusers/**.py"
|
||||
- "examples/**"
|
||||
- "docs/**"
|
||||
|
||||
jobs:
|
||||
build:
|
||||
|
||||
4
.github/workflows/build_pr_documentation.yml
vendored
4
.github/workflows/build_pr_documentation.yml
vendored
@@ -2,6 +2,10 @@ name: Build PR Documentation
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
paths:
|
||||
- "src/diffusers/**.py"
|
||||
- "examples/**"
|
||||
- "docs/**"
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
|
||||
|
||||
396
.github/workflows/nightly_tests.yml
vendored
396
.github/workflows/nightly_tests.yml
vendored
@@ -1,6 +1,7 @@
|
||||
name: Nightly tests on main
|
||||
name: Nightly and release tests on main/release branch
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
schedule:
|
||||
- cron: "0 0 * * *" # every day at midnight
|
||||
|
||||
@@ -12,106 +13,348 @@ env:
|
||||
PYTEST_TIMEOUT: 600
|
||||
RUN_SLOW: yes
|
||||
RUN_NIGHTLY: yes
|
||||
PIPELINE_USAGE_CUTOFF: 5000
|
||||
SLACK_API_TOKEN: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
|
||||
|
||||
jobs:
|
||||
run_nightly_tests:
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
config:
|
||||
- name: Nightly PyTorch CUDA tests on Ubuntu
|
||||
framework: pytorch
|
||||
runner: docker-gpu
|
||||
image: diffusers/diffusers-pytorch-cuda
|
||||
report: torch_cuda
|
||||
- name: Nightly Flax TPU tests on Ubuntu
|
||||
framework: flax
|
||||
runner: docker-tpu
|
||||
image: diffusers/diffusers-flax-tpu
|
||||
report: flax_tpu
|
||||
- name: Nightly ONNXRuntime CUDA tests on Ubuntu
|
||||
framework: onnxruntime
|
||||
runner: docker-gpu
|
||||
image: diffusers/diffusers-onnxruntime-cuda
|
||||
report: onnx_cuda
|
||||
|
||||
name: ${{ matrix.config.name }}
|
||||
|
||||
runs-on: ${{ matrix.config.runner }}
|
||||
|
||||
container:
|
||||
image: ${{ matrix.config.image }}
|
||||
options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ ${{ matrix.config.runner == 'docker-tpu' && '--privileged' || '--gpus 0'}}
|
||||
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
|
||||
setup_torch_cuda_pipeline_matrix:
|
||||
name: Setup Torch Pipelines Matrix
|
||||
runs-on: diffusers/diffusers-pytorch-cpu
|
||||
outputs:
|
||||
pipeline_test_matrix: ${{ steps.fetch_pipeline_matrix.outputs.pipeline_test_matrix }}
|
||||
steps:
|
||||
- name: Checkout diffusers
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
fetch-depth: 2
|
||||
|
||||
- name: NVIDIA-SMI
|
||||
if: ${{ matrix.config.runner == 'docker-gpu' }}
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: "3.8"
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
nvidia-smi
|
||||
pip install -e .
|
||||
pip install huggingface_hub
|
||||
- name: Fetch Pipeline Matrix
|
||||
id: fetch_pipeline_matrix
|
||||
run: |
|
||||
matrix=$(python utils/fetch_torch_cuda_pipeline_test_matrix.py)
|
||||
echo $matrix
|
||||
echo "pipeline_test_matrix=$matrix" >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Pipeline Tests Artifacts
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@v2
|
||||
with:
|
||||
name: test-pipelines.json
|
||||
path: reports
|
||||
|
||||
run_nightly_tests_for_torch_pipelines:
|
||||
name: Torch Pipelines CUDA Nightly Tests
|
||||
needs: setup_torch_cuda_pipeline_matrix
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
module: ${{ fromJson(needs.setup_torch_cuda_pipeline_matrix.outputs.pipeline_test_matrix) }}
|
||||
runs-on: [single-gpu, nvidia-gpu, t4, ci]
|
||||
container:
|
||||
image: diffusers/diffusers-pytorch-cuda
|
||||
options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0
|
||||
steps:
|
||||
- name: Checkout diffusers
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
fetch-depth: 2
|
||||
- name: NVIDIA-SMI
|
||||
run: nvidia-smi
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m pip install -e .[quality,test]
|
||||
python -m pip install -U git+https://github.com/huggingface/transformers
|
||||
python -m pip install git+https://github.com/huggingface/accelerate
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m uv pip install -e [quality,test]
|
||||
python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
|
||||
python -m uv pip install pytest-reportlog
|
||||
|
||||
- name: Environment
|
||||
run: |
|
||||
python utils/print_env.py
|
||||
|
||||
- name: Run nightly PyTorch CUDA tests
|
||||
if: ${{ matrix.config.framework == 'pytorch' }}
|
||||
- name: Nightly PyTorch CUDA checkpoint (pipelines) tests
|
||||
env:
|
||||
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
|
||||
# https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
|
||||
CUBLAS_WORKSPACE_CONFIG: :16:8
|
||||
run: |
|
||||
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
|
||||
-s -v -k "not Flax and not Onnx" \
|
||||
--make-reports=tests_${{ matrix.config.report }} \
|
||||
tests/
|
||||
|
||||
- name: Run nightly Flax TPU tests
|
||||
if: ${{ matrix.config.framework == 'flax' }}
|
||||
env:
|
||||
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
|
||||
run: |
|
||||
python -m pytest -n 0 \
|
||||
-s -v -k "Flax" \
|
||||
--make-reports=tests_${{ matrix.config.report }} \
|
||||
tests/
|
||||
|
||||
- name: Run nightly ONNXRuntime CUDA tests
|
||||
if: ${{ matrix.config.framework == 'onnxruntime' }}
|
||||
env:
|
||||
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
|
||||
run: |
|
||||
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
|
||||
-s -v -k "Onnx" \
|
||||
--make-reports=tests_${{ matrix.config.report }} \
|
||||
tests/
|
||||
--make-reports=tests_pipeline_${{ matrix.module }}_cuda \
|
||||
--report-log=tests_pipeline_${{ matrix.module }}_cuda.log \
|
||||
tests/pipelines/${{ matrix.module }}
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
run: cat reports/tests_${{ matrix.config.report }}_failures_short.txt
|
||||
run: |
|
||||
cat reports/tests_pipeline_${{ matrix.module }}_cuda_stats.txt
|
||||
cat reports/tests_pipeline_${{ matrix.module }}_cuda_failures_short.txt
|
||||
|
||||
- name: Test suite reports artifacts
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@v2
|
||||
with:
|
||||
name: ${{ matrix.config.report }}_test_reports
|
||||
name: pipeline_${{ matrix.module }}_test_reports
|
||||
path: reports
|
||||
|
||||
- name: Generate Report and Notify Channel
|
||||
if: always()
|
||||
run: |
|
||||
pip install slack_sdk tabulate
|
||||
python scripts/log_reports.py >> $GITHUB_STEP_SUMMARY
|
||||
|
||||
run_nightly_tests_for_other_torch_modules:
|
||||
name: Torch Non-Pipelines CUDA Nightly Tests
|
||||
runs-on: [single-gpu, nvidia-gpu, t4, ci]
|
||||
container:
|
||||
image: diffusers/diffusers-pytorch-cuda
|
||||
options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
strategy:
|
||||
matrix:
|
||||
module: [models, schedulers, others, examples]
|
||||
steps:
|
||||
- name: Checkout diffusers
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
fetch-depth: 2
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m uv pip install -e [quality,test]
|
||||
python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
|
||||
python -m uv pip install pytest-reportlog
|
||||
|
||||
- name: Environment
|
||||
run: python utils/print_env.py
|
||||
|
||||
- name: Run nightly PyTorch CUDA tests for non-pipeline modules
|
||||
if: ${{ matrix.module != 'examples'}}
|
||||
env:
|
||||
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
|
||||
# https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
|
||||
CUBLAS_WORKSPACE_CONFIG: :16:8
|
||||
run: |
|
||||
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
|
||||
-s -v -k "not Flax and not Onnx" \
|
||||
--make-reports=tests_torch_${{ matrix.module }}_cuda \
|
||||
--report-log=tests_torch_${{ matrix.module }}_cuda.log \
|
||||
tests/${{ matrix.module }}
|
||||
|
||||
- name: Run nightly example tests with Torch
|
||||
if: ${{ matrix.module == 'examples' }}
|
||||
env:
|
||||
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
|
||||
# https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
|
||||
CUBLAS_WORKSPACE_CONFIG: :16:8
|
||||
run: |
|
||||
python -m uv pip install peft@git+https://github.com/huggingface/peft.git
|
||||
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
|
||||
-s -v --make-reports=examples_torch_cuda \
|
||||
--report-log=examples_torch_cuda.log \
|
||||
examples/
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
run: |
|
||||
cat reports/tests_torch_${{ matrix.module }}_cuda_stats.txt
|
||||
cat reports/tests_torch_${{ matrix.module }}_cuda_failures_short.txt
|
||||
|
||||
- name: Test suite reports artifacts
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@v2
|
||||
with:
|
||||
name: torch_${{ matrix.module }}_cuda_test_reports
|
||||
path: reports
|
||||
|
||||
- name: Generate Report and Notify Channel
|
||||
if: always()
|
||||
run: |
|
||||
pip install slack_sdk tabulate
|
||||
python scripts/log_reports.py >> $GITHUB_STEP_SUMMARY
|
||||
|
||||
run_lora_nightly_tests:
|
||||
name: Nightly LoRA Tests with PEFT and TORCH
|
||||
runs-on: [single-gpu, nvidia-gpu, t4, ci]
|
||||
container:
|
||||
image: diffusers/diffusers-pytorch-cuda
|
||||
options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
steps:
|
||||
- name: Checkout diffusers
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
fetch-depth: 2
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m uv pip install -e [quality,test]
|
||||
python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
|
||||
python -m uv pip install peft@git+https://github.com/huggingface/peft.git
|
||||
python -m uv pip install pytest-reportlog
|
||||
|
||||
- name: Environment
|
||||
run: python utils/print_env.py
|
||||
|
||||
- name: Run nightly LoRA tests with PEFT and Torch
|
||||
env:
|
||||
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
|
||||
# https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
|
||||
CUBLAS_WORKSPACE_CONFIG: :16:8
|
||||
run: |
|
||||
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
|
||||
-s -v -k "not Flax and not Onnx" \
|
||||
--make-reports=tests_torch_lora_cuda \
|
||||
--report-log=tests_torch_lora_cuda.log \
|
||||
tests/lora
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
run: |
|
||||
cat reports/tests_torch_lora_cuda_stats.txt
|
||||
cat reports/tests_torch_lora_cuda_failures_short.txt
|
||||
|
||||
- name: Test suite reports artifacts
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@v2
|
||||
with:
|
||||
name: torch_lora_cuda_test_reports
|
||||
path: reports
|
||||
|
||||
- name: Generate Report and Notify Channel
|
||||
if: always()
|
||||
run: |
|
||||
pip install slack_sdk tabulate
|
||||
python scripts/log_reports.py >> $GITHUB_STEP_SUMMARY
|
||||
|
||||
run_flax_tpu_tests:
|
||||
name: Nightly Flax TPU Tests
|
||||
runs-on: docker-tpu
|
||||
if: github.event_name == 'schedule'
|
||||
|
||||
container:
|
||||
image: diffusers/diffusers-flax-tpu
|
||||
options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --privileged
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
steps:
|
||||
- name: Checkout diffusers
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
fetch-depth: 2
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m uv pip install -e [quality,test]
|
||||
python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
|
||||
python -m uv pip install pytest-reportlog
|
||||
|
||||
- name: Environment
|
||||
run: python utils/print_env.py
|
||||
|
||||
- name: Run nightly Flax TPU tests
|
||||
env:
|
||||
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
|
||||
run: |
|
||||
python -m pytest -n 0 \
|
||||
-s -v -k "Flax" \
|
||||
--make-reports=tests_flax_tpu \
|
||||
--report-log=tests_flax_tpu.log \
|
||||
tests/
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
run: |
|
||||
cat reports/tests_flax_tpu_stats.txt
|
||||
cat reports/tests_flax_tpu_failures_short.txt
|
||||
|
||||
- name: Test suite reports artifacts
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@v2
|
||||
with:
|
||||
name: flax_tpu_test_reports
|
||||
path: reports
|
||||
|
||||
- name: Generate Report and Notify Channel
|
||||
if: always()
|
||||
run: |
|
||||
pip install slack_sdk tabulate
|
||||
python scripts/log_reports.py >> $GITHUB_STEP_SUMMARY
|
||||
|
||||
run_nightly_onnx_tests:
|
||||
name: Nightly ONNXRuntime CUDA tests on Ubuntu
|
||||
runs-on: [single-gpu, nvidia-gpu, t4, ci]
|
||||
container:
|
||||
image: diffusers/diffusers-onnxruntime-cuda
|
||||
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/
|
||||
|
||||
steps:
|
||||
- name: Checkout diffusers
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
fetch-depth: 2
|
||||
|
||||
- name: NVIDIA-SMI
|
||||
run: nvidia-smi
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m uv pip install -e [quality,test]
|
||||
python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
|
||||
python -m uv pip install pytest-reportlog
|
||||
|
||||
- name: Environment
|
||||
run: python utils/print_env.py
|
||||
|
||||
- name: Run nightly ONNXRuntime CUDA tests
|
||||
env:
|
||||
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
|
||||
run: |
|
||||
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
|
||||
-s -v -k "Onnx" \
|
||||
--make-reports=tests_onnx_cuda \
|
||||
--report-log=tests_onnx_cuda.log \
|
||||
tests/
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
run: |
|
||||
cat reports/tests_onnx_cuda_stats.txt
|
||||
cat reports/tests_onnx_cuda_failures_short.txt
|
||||
|
||||
- name: Test suite reports artifacts
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@v2
|
||||
with:
|
||||
name: ${{ matrix.config.report }}_test_reports
|
||||
path: reports
|
||||
|
||||
- name: Generate Report and Notify Channel
|
||||
if: always()
|
||||
run: |
|
||||
pip install slack_sdk tabulate
|
||||
python scripts/log_reports.py >> $GITHUB_STEP_SUMMARY
|
||||
|
||||
run_nightly_tests_apple_m1:
|
||||
name: Nightly PyTorch MPS tests on MacOS
|
||||
runs-on: [ self-hosted, apple-m1 ]
|
||||
if: github.event_name == 'schedule'
|
||||
|
||||
steps:
|
||||
- name: Checkout diffusers
|
||||
@@ -132,10 +375,11 @@ jobs:
|
||||
- name: Install dependencies
|
||||
shell: arch -arch arm64 bash {0}
|
||||
run: |
|
||||
${CONDA_RUN} python -m pip install --upgrade pip
|
||||
${CONDA_RUN} python -m pip install -e .[quality,test]
|
||||
${CONDA_RUN} python -m pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu
|
||||
${CONDA_RUN} python -m pip install git+https://github.com/huggingface/accelerate
|
||||
${CONDA_RUN} python -m pip install --upgrade pip uv
|
||||
${CONDA_RUN} python -m uv pip install -e [quality,test]
|
||||
${CONDA_RUN} python -m uv pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu
|
||||
${CONDA_RUN} python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate
|
||||
${CONDA_RUN} python -m uv pip install pytest-reportlog
|
||||
|
||||
- name: Environment
|
||||
shell: arch -arch arm64 bash {0}
|
||||
@@ -148,7 +392,9 @@ jobs:
|
||||
HF_HOME: /System/Volumes/Data/mnt/cache
|
||||
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
|
||||
run: |
|
||||
${CONDA_RUN} python -m pytest -n 1 -s -v --make-reports=tests_torch_mps tests/
|
||||
${CONDA_RUN} python -m pytest -n 1 -s -v --make-reports=tests_torch_mps \
|
||||
--report-log=tests_torch_mps.log \
|
||||
tests/
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
@@ -160,3 +406,9 @@ jobs:
|
||||
with:
|
||||
name: torch_mps_test_reports
|
||||
path: reports
|
||||
|
||||
- name: Generate Report and Notify Channel
|
||||
if: always()
|
||||
run: |
|
||||
pip install slack_sdk tabulate
|
||||
python scripts/log_reports.py >> $GITHUB_STEP_SUMMARY
|
||||
|
||||
23
.github/workflows/notify_slack_about_release.yml
vendored
Normal file
23
.github/workflows/notify_slack_about_release.yml
vendored
Normal file
@@ -0,0 +1,23 @@
|
||||
name: Notify Slack about a release
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
release:
|
||||
types: [published]
|
||||
|
||||
jobs:
|
||||
build:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
|
||||
- name: Setup Python
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: '3.8'
|
||||
|
||||
- name: Notify Slack about the release
|
||||
env:
|
||||
SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}
|
||||
run: pip install requests && python utils/notify_slack_about_release.py
|
||||
10
.github/workflows/pr_dependency_test.yml
vendored
10
.github/workflows/pr_dependency_test.yml
vendored
@@ -4,6 +4,8 @@ on:
|
||||
pull_request:
|
||||
branches:
|
||||
- main
|
||||
paths:
|
||||
- "src/diffusers/**.py"
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
@@ -23,10 +25,12 @@ jobs:
|
||||
python-version: "3.8"
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install -e .
|
||||
pip install pytest
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m pip install --upgrade pip uv
|
||||
python -m uv pip install -e .
|
||||
python -m uv pip install pytest
|
||||
- name: Check for soft dependencies
|
||||
run: |
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
pytest tests/others/test_dependencies.py
|
||||
|
||||
16
.github/workflows/pr_flax_dependency_test.yml
vendored
16
.github/workflows/pr_flax_dependency_test.yml
vendored
@@ -4,6 +4,8 @@ on:
|
||||
pull_request:
|
||||
branches:
|
||||
- main
|
||||
paths:
|
||||
- "src/diffusers/**.py"
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
@@ -23,12 +25,14 @@ jobs:
|
||||
python-version: "3.8"
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install -e .
|
||||
pip install "jax[cpu]>=0.2.16,!=0.3.2"
|
||||
pip install "flax>=0.4.1"
|
||||
pip install "jaxlib>=0.1.65"
|
||||
pip install pytest
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m pip install --upgrade pip uv
|
||||
python -m uv pip install -e .
|
||||
python -m uv pip install "jax[cpu]>=0.2.16,!=0.3.2"
|
||||
python -m uv pip install "flax>=0.4.1"
|
||||
python -m uv pip install "jaxlib>=0.1.65"
|
||||
python -m uv pip install pytest
|
||||
- name: Check for soft dependencies
|
||||
run: |
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
pytest tests/others/test_dependencies.py
|
||||
|
||||
49
.github/workflows/pr_quality.yml
vendored
49
.github/workflows/pr_quality.yml
vendored
@@ -1,49 +0,0 @@
|
||||
name: Run code quality checks
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
branches:
|
||||
- main
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
check_code_quality:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: "3.8"
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install .[quality]
|
||||
- name: Check quality
|
||||
run: |
|
||||
ruff check examples tests src utils scripts
|
||||
ruff format examples tests src utils scripts --check
|
||||
|
||||
check_repository_consistency:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: "3.8"
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install .[quality]
|
||||
- name: Check quality
|
||||
run: |
|
||||
python utils/check_copies.py
|
||||
python utils/check_dummies.py
|
||||
make deps_table_check_updated
|
||||
22
.github/workflows/pr_test_fetcher.yml
vendored
22
.github/workflows/pr_test_fetcher.yml
vendored
@@ -15,7 +15,7 @@ concurrency:
|
||||
jobs:
|
||||
setup_pr_tests:
|
||||
name: Setup PR Tests
|
||||
runs-on: docker-cpu
|
||||
runs-on: [ self-hosted, intel-cpu, 8-cpu, ci ]
|
||||
container:
|
||||
image: diffusers/diffusers-pytorch-cpu
|
||||
options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/
|
||||
@@ -32,8 +32,8 @@ jobs:
|
||||
fetch-depth: 0
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
apt-get update && apt-get install libsndfile1-dev libgl1 -y
|
||||
python -m pip install -e .[quality,test]
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m uv pip install -e [quality,test]
|
||||
- name: Environment
|
||||
run: |
|
||||
python utils/print_env.py
|
||||
@@ -73,7 +73,7 @@ jobs:
|
||||
max-parallel: 2
|
||||
matrix:
|
||||
modules: ${{ fromJson(needs.setup_pr_tests.outputs.matrix) }}
|
||||
runs-on: docker-cpu
|
||||
runs-on: [ self-hosted, intel-cpu, 8-cpu, ci ]
|
||||
container:
|
||||
image: diffusers/diffusers-pytorch-cpu
|
||||
options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/
|
||||
@@ -88,16 +88,18 @@ jobs:
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
apt-get update && apt-get install libsndfile1-dev libgl1 -y
|
||||
python -m pip install -e .[quality,test]
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m pip install -e [quality,test]
|
||||
python -m pip install accelerate
|
||||
|
||||
- name: Environment
|
||||
run: |
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python utils/print_env.py
|
||||
|
||||
- name: Run all selected tests on CPU
|
||||
run: |
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m pytest -n 2 --dist=loadfile -v --make-reports=${{ matrix.modules }}_tests_cpu ${{ fromJson(needs.setup_pr_tests.outputs.test_map)[matrix.modules] }}
|
||||
|
||||
- name: Failure short reports
|
||||
@@ -121,7 +123,7 @@ jobs:
|
||||
config:
|
||||
- name: Hub tests for models, schedulers, and pipelines
|
||||
framework: hub_tests_pytorch
|
||||
runner: docker-cpu
|
||||
runner: [ self-hosted, intel-cpu, 8-cpu, ci ]
|
||||
image: diffusers/diffusers-pytorch-cpu
|
||||
report: torch_hub
|
||||
|
||||
@@ -143,16 +145,18 @@ jobs:
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
apt-get update && apt-get install libsndfile1-dev libgl1 -y
|
||||
python -m pip install -e .[quality,test]
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m pip install -e [quality,test]
|
||||
|
||||
- name: Environment
|
||||
run: |
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python utils/print_env.py
|
||||
|
||||
- name: Run Hub tests for models, schedulers, and pipelines on a staging env
|
||||
if: ${{ matrix.config.framework == 'hub_tests_pytorch' }}
|
||||
run: |
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
HUGGINGFACE_CO_STAGING=true python -m pytest \
|
||||
-m "is_staging_test" \
|
||||
--make-reports=tests_${{ matrix.config.report }} \
|
||||
|
||||
66
.github/workflows/pr_test_peft_backend.yml
vendored
66
.github/workflows/pr_test_peft_backend.yml
vendored
@@ -4,6 +4,9 @@ on:
|
||||
pull_request:
|
||||
branches:
|
||||
- main
|
||||
paths:
|
||||
- "src/diffusers/**.py"
|
||||
- "tests/**.py"
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
|
||||
@@ -16,7 +19,50 @@ env:
|
||||
PYTEST_TIMEOUT: 60
|
||||
|
||||
jobs:
|
||||
check_code_quality:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: "3.8"
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install .[quality]
|
||||
- name: Check quality
|
||||
run: make quality
|
||||
- name: Check if failure
|
||||
if: ${{ failure() }}
|
||||
run: |
|
||||
echo "Quality check failed. Please ensure the right dependency versions are installed with 'pip install -e .[quality]' and run 'make style && make quality'" >> $GITHUB_STEP_SUMMARY
|
||||
|
||||
check_repository_consistency:
|
||||
needs: check_code_quality
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: "3.8"
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install .[quality]
|
||||
- name: Check repo consistency
|
||||
run: |
|
||||
python utils/check_copies.py
|
||||
python utils/check_dummies.py
|
||||
make deps_table_check_updated
|
||||
- name: Check if failure
|
||||
if: ${{ failure() }}
|
||||
run: |
|
||||
echo "Repo consistency check failed. Please ensure the right dependency versions are installed with 'pip install -e .[quality]' and run 'make fix-copies'" >> $GITHUB_STEP_SUMMARY
|
||||
|
||||
run_fast_tests:
|
||||
needs: [check_code_quality, check_repository_consistency]
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
@@ -25,7 +71,7 @@ jobs:
|
||||
|
||||
name: LoRA - ${{ matrix.lib-versions }}
|
||||
|
||||
runs-on: docker-cpu
|
||||
runs-on: [ self-hosted, intel-cpu, 8-cpu, ci ]
|
||||
|
||||
container:
|
||||
image: diffusers/diffusers-pytorch-cpu
|
||||
@@ -43,23 +89,25 @@ jobs:
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
apt-get update && apt-get install libsndfile1-dev libgl1 -y
|
||||
python -m pip install -e .[quality,test]
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m uv pip install -e [quality,test]
|
||||
if [ "${{ matrix.lib-versions }}" == "main" ]; then
|
||||
python -m pip install -U git+https://github.com/huggingface/peft.git
|
||||
python -m pip install -U git+https://github.com/huggingface/transformers.git
|
||||
python -m pip install -U git+https://github.com/huggingface/accelerate.git
|
||||
python -m pip install -U peft@git+https://github.com/huggingface/peft.git
|
||||
python -m uv pip install -U transformers@git+https://github.com/huggingface/transformers.git
|
||||
python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
|
||||
else
|
||||
python -m pip install -U peft transformers accelerate
|
||||
python -m uv pip install -U peft transformers accelerate
|
||||
fi
|
||||
|
||||
- name: Environment
|
||||
run: |
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python utils/print_env.py
|
||||
|
||||
- name: Run fast PyTorch LoRA CPU tests with PEFT backend
|
||||
run: |
|
||||
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m pytest -n 4 --max-worker-restart=0 --dist=loadfile \
|
||||
-s -v \
|
||||
--make-reports=tests_${{ matrix.config.report }} \
|
||||
tests/lora/test_lora_layers_peft.py
|
||||
tests/lora/
|
||||
|
||||
89
.github/workflows/pr_tests.yml
vendored
89
.github/workflows/pr_tests.yml
vendored
@@ -4,6 +4,14 @@ on:
|
||||
pull_request:
|
||||
branches:
|
||||
- main
|
||||
paths:
|
||||
- "src/diffusers/**.py"
|
||||
- "benchmarks/**.py"
|
||||
- "examples/**.py"
|
||||
- "scripts/**.py"
|
||||
- "tests/**.py"
|
||||
- ".github/**.yml"
|
||||
- "utils/**.py"
|
||||
push:
|
||||
branches:
|
||||
- ci-*
|
||||
@@ -19,29 +27,72 @@ env:
|
||||
PYTEST_TIMEOUT: 60
|
||||
|
||||
jobs:
|
||||
check_code_quality:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: "3.8"
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install .[quality]
|
||||
- name: Check quality
|
||||
run: make quality
|
||||
- name: Check if failure
|
||||
if: ${{ failure() }}
|
||||
run: |
|
||||
echo "Quality check failed. Please ensure the right dependency versions are installed with 'pip install -e .[quality]' and run 'make style && make quality'" >> $GITHUB_STEP_SUMMARY
|
||||
|
||||
check_repository_consistency:
|
||||
needs: check_code_quality
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: "3.8"
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install .[quality]
|
||||
- name: Check repo consistency
|
||||
run: |
|
||||
python utils/check_copies.py
|
||||
python utils/check_dummies.py
|
||||
make deps_table_check_updated
|
||||
- name: Check if failure
|
||||
if: ${{ failure() }}
|
||||
run: |
|
||||
echo "Repo consistency check failed. Please ensure the right dependency versions are installed with 'pip install -e .[quality]' and run 'make fix-copies'" >> $GITHUB_STEP_SUMMARY
|
||||
|
||||
run_fast_tests:
|
||||
needs: [check_code_quality, check_repository_consistency]
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
config:
|
||||
- name: Fast PyTorch Pipeline CPU tests
|
||||
framework: pytorch_pipelines
|
||||
runner: docker-cpu
|
||||
runner: [ self-hosted, intel-cpu, 32-cpu, 256-ram, ci ]
|
||||
image: diffusers/diffusers-pytorch-cpu
|
||||
report: torch_cpu_pipelines
|
||||
- name: Fast PyTorch Models & Schedulers CPU tests
|
||||
framework: pytorch_models
|
||||
runner: docker-cpu
|
||||
runner: [ self-hosted, intel-cpu, 8-cpu, ci ]
|
||||
image: diffusers/diffusers-pytorch-cpu
|
||||
report: torch_cpu_models_schedulers
|
||||
- name: Fast Flax CPU tests
|
||||
framework: flax
|
||||
runner: docker-cpu
|
||||
runner: [ self-hosted, intel-cpu, 8-cpu, ci ]
|
||||
image: diffusers/diffusers-flax-cpu
|
||||
report: flax_cpu
|
||||
- name: PyTorch Example CPU tests
|
||||
framework: pytorch_examples
|
||||
runner: docker-cpu
|
||||
runner: [ self-hosted, intel-cpu, 8-cpu, ci ]
|
||||
image: diffusers/diffusers-pytorch-cpu
|
||||
report: torch_example_cpu
|
||||
|
||||
@@ -65,18 +116,20 @@ jobs:
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
apt-get update && apt-get install libsndfile1-dev libgl1 -y
|
||||
python -m pip install -e .[quality,test]
|
||||
python -m pip install accelerate
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m uv pip install -e [quality,test]
|
||||
python -m uv pip install accelerate
|
||||
|
||||
- name: Environment
|
||||
run: |
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python utils/print_env.py
|
||||
|
||||
- name: Run fast PyTorch Pipeline CPU tests
|
||||
if: ${{ matrix.config.framework == 'pytorch_pipelines' }}
|
||||
run: |
|
||||
python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m pytest -n 8 --max-worker-restart=0 --dist=loadfile \
|
||||
-s -v -k "not Flax and not Onnx" \
|
||||
--make-reports=tests_${{ matrix.config.report }} \
|
||||
tests/pipelines
|
||||
@@ -84,7 +137,8 @@ jobs:
|
||||
- name: Run fast PyTorch Model Scheduler CPU tests
|
||||
if: ${{ matrix.config.framework == 'pytorch_models' }}
|
||||
run: |
|
||||
python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m pytest -n 4 --max-worker-restart=0 --dist=loadfile \
|
||||
-s -v -k "not Flax and not Onnx and not Dependency" \
|
||||
--make-reports=tests_${{ matrix.config.report }} \
|
||||
tests/models tests/schedulers tests/others
|
||||
@@ -92,7 +146,8 @@ jobs:
|
||||
- name: Run fast Flax TPU tests
|
||||
if: ${{ matrix.config.framework == 'flax' }}
|
||||
run: |
|
||||
python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m pytest -n 4 --max-worker-restart=0 --dist=loadfile \
|
||||
-s -v -k "Flax" \
|
||||
--make-reports=tests_${{ matrix.config.report }} \
|
||||
tests
|
||||
@@ -100,8 +155,9 @@ jobs:
|
||||
- name: Run example PyTorch CPU tests
|
||||
if: ${{ matrix.config.framework == 'pytorch_examples' }}
|
||||
run: |
|
||||
python -m pip install peft
|
||||
python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m uv pip install peft
|
||||
python -m pytest -n 4 --max-worker-restart=0 --dist=loadfile \
|
||||
--make-reports=tests_${{ matrix.config.report }} \
|
||||
examples
|
||||
|
||||
@@ -117,13 +173,14 @@ jobs:
|
||||
path: reports
|
||||
|
||||
run_staging_tests:
|
||||
needs: [check_code_quality, check_repository_consistency]
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
config:
|
||||
- name: Hub tests for models, schedulers, and pipelines
|
||||
framework: hub_tests_pytorch
|
||||
runner: docker-cpu
|
||||
runner: [ self-hosted, intel-cpu, 8-cpu, ci ]
|
||||
image: diffusers/diffusers-pytorch-cpu
|
||||
report: torch_hub
|
||||
|
||||
@@ -147,16 +204,18 @@ jobs:
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
apt-get update && apt-get install libsndfile1-dev libgl1 -y
|
||||
python -m pip install -e .[quality,test]
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m uv pip install -e [quality,test]
|
||||
|
||||
- name: Environment
|
||||
run: |
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python utils/print_env.py
|
||||
|
||||
- name: Run Hub tests for models, schedulers, and pipelines on a staging env
|
||||
if: ${{ matrix.config.framework == 'hub_tests_pytorch' }}
|
||||
run: |
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
HUGGINGFACE_CO_STAGING=true python -m pytest \
|
||||
-m "is_staging_test" \
|
||||
--make-reports=tests_${{ matrix.config.report }} \
|
||||
|
||||
12
.github/workflows/pr_torch_dependency_test.yml
vendored
12
.github/workflows/pr_torch_dependency_test.yml
vendored
@@ -4,6 +4,8 @@ on:
|
||||
pull_request:
|
||||
branches:
|
||||
- main
|
||||
paths:
|
||||
- "src/diffusers/**.py"
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
@@ -23,10 +25,12 @@ jobs:
|
||||
python-version: "3.8"
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install -e .
|
||||
pip install torch torchvision torchaudio
|
||||
pip install pytest
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m pip install --upgrade pip uv
|
||||
python -m uv pip install -e .
|
||||
python -m uv pip install torch torchvision torchaudio
|
||||
python -m uv pip install pytest
|
||||
- name: Check for soft dependencies
|
||||
run: |
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
pytest tests/others/test_dependencies.py
|
||||
|
||||
109
.github/workflows/push_tests.yml
vendored
109
.github/workflows/push_tests.yml
vendored
@@ -4,7 +4,10 @@ on:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
|
||||
paths:
|
||||
- "src/diffusers/**.py"
|
||||
- "examples/**.py"
|
||||
- "tests/**.py"
|
||||
|
||||
env:
|
||||
DIFFUSERS_IS_CI: yes
|
||||
@@ -18,10 +21,9 @@ env:
|
||||
jobs:
|
||||
setup_torch_cuda_pipeline_matrix:
|
||||
name: Setup Torch Pipelines CUDA Slow Tests Matrix
|
||||
runs-on: docker-gpu
|
||||
runs-on: [ self-hosted, intel-cpu, 8-cpu, ci ]
|
||||
container:
|
||||
image: diffusers/diffusers-pytorch-cpu # this is a CPU image, but we need it to fetch the matrix
|
||||
options: --shm-size "16gb" --ipc host
|
||||
image: diffusers/diffusers-pytorch-cpu
|
||||
outputs:
|
||||
pipeline_test_matrix: ${{ steps.fetch_pipeline_matrix.outputs.pipeline_test_matrix }}
|
||||
steps:
|
||||
@@ -31,21 +33,17 @@ jobs:
|
||||
fetch-depth: 2
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
apt-get update && apt-get install libsndfile1-dev libgl1 -y
|
||||
python -m pip install -e .[quality,test]
|
||||
python -m pip install git+https://github.com/huggingface/accelerate.git
|
||||
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m uv pip install -e [quality,test]
|
||||
- name: Environment
|
||||
run: |
|
||||
python utils/print_env.py
|
||||
|
||||
- name: Fetch Pipeline Matrix
|
||||
id: fetch_pipeline_matrix
|
||||
run: |
|
||||
matrix=$(python utils/fetch_torch_cuda_pipeline_test_matrix.py)
|
||||
echo $matrix
|
||||
echo "pipeline_test_matrix=$matrix" >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Pipeline Tests Artifacts
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@v2
|
||||
@@ -58,13 +56,13 @@ jobs:
|
||||
needs: setup_torch_cuda_pipeline_matrix
|
||||
strategy:
|
||||
fail-fast: false
|
||||
max-parallel: 1
|
||||
max-parallel: 8
|
||||
matrix:
|
||||
module: ${{ fromJson(needs.setup_torch_cuda_pipeline_matrix.outputs.pipeline_test_matrix) }}
|
||||
runs-on: docker-gpu
|
||||
runs-on: [single-gpu, nvidia-gpu, t4, ci]
|
||||
container:
|
||||
image: diffusers/diffusers-pytorch-cuda
|
||||
options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0
|
||||
options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface/diffusers:/mnt/cache/ --gpus 0 --privileged
|
||||
steps:
|
||||
- name: Checkout diffusers
|
||||
uses: actions/checkout@v3
|
||||
@@ -73,11 +71,17 @@ jobs:
|
||||
- name: NVIDIA-SMI
|
||||
run: |
|
||||
nvidia-smi
|
||||
- name: Tailscale
|
||||
uses: huggingface/tailscale-action@v1
|
||||
with:
|
||||
authkey: ${{ secrets.TAILSCALE_SSH_AUTHKEY }}
|
||||
slackChannel: ${{ secrets.SLACK_CIFEEDBACK_CHANNEL }}
|
||||
slackToken: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
apt-get update && apt-get install libsndfile1-dev libgl1 -y
|
||||
python -m pip install -e .[quality,test]
|
||||
python -m pip install git+https://github.com/huggingface/accelerate.git
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m uv pip install -e [quality,test]
|
||||
python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
|
||||
- name: Environment
|
||||
run: |
|
||||
python utils/print_env.py
|
||||
@@ -91,6 +95,12 @@ jobs:
|
||||
-s -v -k "not Flax and not Onnx" \
|
||||
--make-reports=tests_pipeline_${{ matrix.module }}_cuda \
|
||||
tests/pipelines/${{ matrix.module }}
|
||||
- name: Tailscale Wait
|
||||
if: ${{ failure() || runner.debug == '1' }}
|
||||
uses: huggingface/tailscale-action@v1
|
||||
with:
|
||||
waitForSSH: true
|
||||
authkey: ${{ secrets.TAILSCALE_SSH_AUTHKEY }}
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
run: |
|
||||
@@ -106,16 +116,16 @@ jobs:
|
||||
|
||||
torch_cuda_tests:
|
||||
name: Torch CUDA Tests
|
||||
runs-on: docker-gpu
|
||||
runs-on: [single-gpu, nvidia-gpu, t4, ci]
|
||||
container:
|
||||
image: diffusers/diffusers-pytorch-cuda
|
||||
options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0
|
||||
options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface/diffusers:/mnt/cache/ --gpus 0
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
strategy:
|
||||
matrix:
|
||||
module: [models, schedulers, lora, others]
|
||||
module: [models, schedulers, lora, others, single_file]
|
||||
steps:
|
||||
- name: Checkout diffusers
|
||||
uses: actions/checkout@v3
|
||||
@@ -124,9 +134,9 @@ jobs:
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
apt-get update && apt-get install libsndfile1-dev libgl1 -y
|
||||
python -m pip install -e .[quality,test]
|
||||
python -m pip install git+https://github.com/huggingface/accelerate.git
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m uv pip install -e [quality,test]
|
||||
python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
|
||||
|
||||
- name: Environment
|
||||
run: |
|
||||
@@ -158,10 +168,10 @@ jobs:
|
||||
|
||||
peft_cuda_tests:
|
||||
name: PEFT CUDA Tests
|
||||
runs-on: docker-gpu
|
||||
runs-on: [single-gpu, nvidia-gpu, t4, ci]
|
||||
container:
|
||||
image: diffusers/diffusers-pytorch-cuda
|
||||
options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0
|
||||
options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface/diffusers:/mnt/cache/ --gpus 0
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
@@ -173,10 +183,10 @@ jobs:
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
apt-get update && apt-get install libsndfile1-dev libgl1 -y
|
||||
python -m pip install -e .[quality,test]
|
||||
python -m pip install git+https://github.com/huggingface/accelerate.git
|
||||
python -m pip install git+https://github.com/huggingface/peft.git
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m uv pip install -e [quality,test]
|
||||
python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
|
||||
python -m pip install -U peft@git+https://github.com/huggingface/peft.git
|
||||
|
||||
- name: Environment
|
||||
run: |
|
||||
@@ -211,7 +221,7 @@ jobs:
|
||||
runs-on: docker-tpu
|
||||
container:
|
||||
image: diffusers/diffusers-flax-tpu
|
||||
options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --privileged
|
||||
options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ --privileged
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
@@ -223,9 +233,9 @@ jobs:
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
apt-get update && apt-get install libsndfile1-dev libgl1 -y
|
||||
python -m pip install -e .[quality,test]
|
||||
python -m pip install git+https://github.com/huggingface/accelerate.git
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m uv pip install -e [quality,test]
|
||||
python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
|
||||
|
||||
- name: Environment
|
||||
run: |
|
||||
@@ -255,10 +265,10 @@ jobs:
|
||||
|
||||
onnx_cuda_tests:
|
||||
name: ONNX CUDA Tests
|
||||
runs-on: docker-gpu
|
||||
runs-on: [single-gpu, nvidia-gpu, t4, ci]
|
||||
container:
|
||||
image: diffusers/diffusers-onnxruntime-cuda
|
||||
options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0
|
||||
options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ --gpus 0
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
@@ -270,9 +280,9 @@ jobs:
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
apt-get update && apt-get install libsndfile1-dev libgl1 -y
|
||||
python -m pip install -e .[quality,test]
|
||||
python -m pip install git+https://github.com/huggingface/accelerate.git
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m uv pip install -e [quality,test]
|
||||
python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
|
||||
|
||||
- name: Environment
|
||||
run: |
|
||||
@@ -303,11 +313,11 @@ jobs:
|
||||
run_torch_compile_tests:
|
||||
name: PyTorch Compile CUDA tests
|
||||
|
||||
runs-on: docker-gpu
|
||||
runs-on: [single-gpu, nvidia-gpu, t4, ci]
|
||||
|
||||
container:
|
||||
image: diffusers/diffusers-pytorch-compile-cuda
|
||||
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/
|
||||
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
|
||||
steps:
|
||||
- name: Checkout diffusers
|
||||
@@ -320,7 +330,8 @@ jobs:
|
||||
nvidia-smi
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m pip install -e .[quality,test,training]
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m uv pip install -e [quality,test,training]
|
||||
- name: Environment
|
||||
run: |
|
||||
python utils/print_env.py
|
||||
@@ -343,11 +354,11 @@ jobs:
|
||||
run_xformers_tests:
|
||||
name: PyTorch xformers CUDA tests
|
||||
|
||||
runs-on: docker-gpu
|
||||
runs-on: [single-gpu, nvidia-gpu, t4, ci]
|
||||
|
||||
container:
|
||||
image: diffusers/diffusers-pytorch-xformers-cuda
|
||||
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/
|
||||
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
|
||||
steps:
|
||||
- name: Checkout diffusers
|
||||
@@ -360,7 +371,8 @@ jobs:
|
||||
nvidia-smi
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m pip install -e .[quality,test,training]
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m uv pip install -e [quality,test,training]
|
||||
- name: Environment
|
||||
run: |
|
||||
python utils/print_env.py
|
||||
@@ -383,11 +395,11 @@ jobs:
|
||||
run_examples_tests:
|
||||
name: Examples PyTorch CUDA tests on Ubuntu
|
||||
|
||||
runs-on: docker-gpu
|
||||
runs-on: [single-gpu, nvidia-gpu, t4, ci]
|
||||
|
||||
container:
|
||||
image: diffusers/diffusers-pytorch-cuda
|
||||
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/
|
||||
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
|
||||
steps:
|
||||
- name: Checkout diffusers
|
||||
@@ -401,16 +413,19 @@ jobs:
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m pip install -e .[quality,test,training]
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m uv pip install -e [quality,test,training]
|
||||
|
||||
- name: Environment
|
||||
run: |
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python utils/print_env.py
|
||||
|
||||
- name: Run example tests on GPU
|
||||
env:
|
||||
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
|
||||
run: |
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v --make-reports=examples_torch_cuda examples/
|
||||
|
||||
- name: Failure short reports
|
||||
@@ -424,4 +439,4 @@ jobs:
|
||||
uses: actions/upload-artifact@v2
|
||||
with:
|
||||
name: examples_test_reports
|
||||
path: reports
|
||||
path: reports
|
||||
|
||||
31
.github/workflows/push_tests_fast.yml
vendored
31
.github/workflows/push_tests_fast.yml
vendored
@@ -4,6 +4,10 @@ on:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
paths:
|
||||
- "src/diffusers/**.py"
|
||||
- "examples/**.py"
|
||||
- "tests/**.py"
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
|
||||
@@ -25,22 +29,22 @@ jobs:
|
||||
config:
|
||||
- name: Fast PyTorch CPU tests on Ubuntu
|
||||
framework: pytorch
|
||||
runner: docker-cpu
|
||||
runner: [ self-hosted, intel-cpu, 8-cpu, ci ]
|
||||
image: diffusers/diffusers-pytorch-cpu
|
||||
report: torch_cpu
|
||||
- name: Fast Flax CPU tests on Ubuntu
|
||||
framework: flax
|
||||
runner: docker-cpu
|
||||
runner: [ self-hosted, intel-cpu, 8-cpu, ci ]
|
||||
image: diffusers/diffusers-flax-cpu
|
||||
report: flax_cpu
|
||||
- name: Fast ONNXRuntime CPU tests on Ubuntu
|
||||
framework: onnxruntime
|
||||
runner: docker-cpu
|
||||
runner: [ self-hosted, intel-cpu, 8-cpu, ci ]
|
||||
image: diffusers/diffusers-onnxruntime-cpu
|
||||
report: onnx_cpu
|
||||
- name: PyTorch Example CPU tests on Ubuntu
|
||||
framework: pytorch_examples
|
||||
runner: docker-cpu
|
||||
runner: [ self-hosted, intel-cpu, 8-cpu, ci ]
|
||||
image: diffusers/diffusers-pytorch-cpu
|
||||
report: torch_example_cpu
|
||||
|
||||
@@ -64,17 +68,19 @@ jobs:
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
apt-get update && apt-get install libsndfile1-dev libgl1 -y
|
||||
python -m pip install -e .[quality,test]
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m uv pip install -e [quality,test]
|
||||
|
||||
- name: Environment
|
||||
run: |
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python utils/print_env.py
|
||||
|
||||
- name: Run fast PyTorch CPU tests
|
||||
if: ${{ matrix.config.framework == 'pytorch' }}
|
||||
run: |
|
||||
python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m pytest -n 4 --max-worker-restart=0 --dist=loadfile \
|
||||
-s -v -k "not Flax and not Onnx" \
|
||||
--make-reports=tests_${{ matrix.config.report }} \
|
||||
tests/
|
||||
@@ -82,7 +88,8 @@ jobs:
|
||||
- name: Run fast Flax TPU tests
|
||||
if: ${{ matrix.config.framework == 'flax' }}
|
||||
run: |
|
||||
python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m pytest -n 4 --max-worker-restart=0 --dist=loadfile \
|
||||
-s -v -k "Flax" \
|
||||
--make-reports=tests_${{ matrix.config.report }} \
|
||||
tests/
|
||||
@@ -90,7 +97,8 @@ jobs:
|
||||
- name: Run fast ONNXRuntime CPU tests
|
||||
if: ${{ matrix.config.framework == 'onnxruntime' }}
|
||||
run: |
|
||||
python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m pytest -n 4 --max-worker-restart=0 --dist=loadfile \
|
||||
-s -v -k "Onnx" \
|
||||
--make-reports=tests_${{ matrix.config.report }} \
|
||||
tests/
|
||||
@@ -98,8 +106,9 @@ jobs:
|
||||
- name: Run example PyTorch CPU tests
|
||||
if: ${{ matrix.config.framework == 'pytorch_examples' }}
|
||||
run: |
|
||||
python -m pip install peft
|
||||
python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m uv pip install peft
|
||||
python -m pytest -n 4 --max-worker-restart=0 --dist=loadfile \
|
||||
--make-reports=tests_${{ matrix.config.report }} \
|
||||
examples
|
||||
|
||||
|
||||
13
.github/workflows/push_tests_mps.yml
vendored
13
.github/workflows/push_tests_mps.yml
vendored
@@ -4,6 +4,9 @@ on:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
paths:
|
||||
- "src/diffusers/**.py"
|
||||
- "tests/**.py"
|
||||
|
||||
env:
|
||||
DIFFUSERS_IS_CI: yes
|
||||
@@ -41,11 +44,11 @@ jobs:
|
||||
- name: Install dependencies
|
||||
shell: arch -arch arm64 bash {0}
|
||||
run: |
|
||||
${CONDA_RUN} python -m pip install --upgrade pip
|
||||
${CONDA_RUN} python -m pip install -e .[quality,test]
|
||||
${CONDA_RUN} python -m pip install torch torchvision torchaudio
|
||||
${CONDA_RUN} python -m pip install git+https://github.com/huggingface/accelerate.git
|
||||
${CONDA_RUN} python -m pip install transformers --upgrade
|
||||
${CONDA_RUN} python -m pip install --upgrade pip uv
|
||||
${CONDA_RUN} python -m uv pip install -e [quality,test]
|
||||
${CONDA_RUN} python -m uv pip install torch torchvision torchaudio
|
||||
${CONDA_RUN} python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
|
||||
${CONDA_RUN} python -m uv pip install transformers --upgrade
|
||||
|
||||
- name: Environment
|
||||
shell: arch -arch arm64 bash {0}
|
||||
|
||||
81
.github/workflows/pypi_publish.yaml
vendored
Normal file
81
.github/workflows/pypi_publish.yaml
vendored
Normal file
@@ -0,0 +1,81 @@
|
||||
# Adapted from https://blog.deepjyoti30.dev/pypi-release-github-action
|
||||
|
||||
name: PyPI release
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
push:
|
||||
tags:
|
||||
- "*"
|
||||
|
||||
jobs:
|
||||
find-and-checkout-latest-branch:
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
latest_branch: ${{ steps.set_latest_branch.outputs.latest_branch }}
|
||||
steps:
|
||||
- name: Checkout Repo
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: '3.8'
|
||||
|
||||
- name: Fetch latest branch
|
||||
id: fetch_latest_branch
|
||||
run: |
|
||||
pip install -U requests packaging
|
||||
LATEST_BRANCH=$(python utils/fetch_latest_release_branch.py)
|
||||
echo "Latest branch: $LATEST_BRANCH"
|
||||
echo "latest_branch=$LATEST_BRANCH" >> $GITHUB_ENV
|
||||
|
||||
- name: Set latest branch output
|
||||
id: set_latest_branch
|
||||
run: echo "::set-output name=latest_branch::${{ env.latest_branch }}"
|
||||
|
||||
release:
|
||||
needs: find-and-checkout-latest-branch
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- name: Checkout Repo
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
ref: ${{ needs.find-and-checkout-latest-branch.outputs.latest_branch }}
|
||||
|
||||
- name: Setup Python
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: "3.8"
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install -U setuptools wheel twine
|
||||
pip install -U torch --index-url https://download.pytorch.org/whl/cpu
|
||||
pip install -U transformers
|
||||
|
||||
- name: Build the dist files
|
||||
run: python setup.py bdist_wheel && python setup.py sdist
|
||||
|
||||
- name: Publish to the test PyPI
|
||||
env:
|
||||
TWINE_USERNAME: ${{ secrets.TEST_PYPI_USERNAME }}
|
||||
TWINE_PASSWORD: ${{ secrets.TEST_PYPI_PASSWORD }}
|
||||
run: twine upload dist/* -r pypitest --repository-url=https://test.pypi.org/legacy/
|
||||
|
||||
- name: Test installing diffusers and importing
|
||||
run: |
|
||||
pip install diffusers && pip uninstall diffusers -y
|
||||
pip install -i https://testpypi.python.org/pypi diffusers
|
||||
python -c "from diffusers import __version__; print(__version__)"
|
||||
python -c "from diffusers import DiffusionPipeline; pipe = DiffusionPipeline.from_pretrained('fusing/unet-ldm-dummy-update'); pipe()"
|
||||
python -c "from diffusers import DiffusionPipeline; pipe = DiffusionPipeline.from_pretrained('hf-internal-testing/tiny-stable-diffusion-pipe', safety_checker=None); pipe('ah suh du')"
|
||||
python -c "from diffusers import *"
|
||||
|
||||
- name: Publish to PyPI
|
||||
env:
|
||||
TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
|
||||
TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
|
||||
run: twine upload dist/* -r pypi
|
||||
46
.github/workflows/ssh-runner.yml
vendored
Normal file
46
.github/workflows/ssh-runner.yml
vendored
Normal file
@@ -0,0 +1,46 @@
|
||||
name: SSH into runners
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
runner_type:
|
||||
description: 'Type of runner to test (a10 or t4)'
|
||||
required: true
|
||||
docker_image:
|
||||
description: 'Name of the Docker image'
|
||||
required: true
|
||||
|
||||
env:
|
||||
IS_GITHUB_CI: "1"
|
||||
HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
|
||||
HF_HOME: /mnt/cache
|
||||
DIFFUSERS_IS_CI: yes
|
||||
OMP_NUM_THREADS: 8
|
||||
MKL_NUM_THREADS: 8
|
||||
RUN_SLOW: yes
|
||||
|
||||
jobs:
|
||||
ssh_runner:
|
||||
name: "SSH"
|
||||
runs-on: [single-gpu, nvidia-gpu, "${{ github.event.inputs.runner_type }}", ci]
|
||||
container:
|
||||
image: ${{ github.event.inputs.docker_image }}
|
||||
options: --gpus all --privileged --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
|
||||
steps:
|
||||
- name: Checkout diffusers
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
fetch-depth: 2
|
||||
|
||||
- name: NVIDIA-SMI
|
||||
run: |
|
||||
nvidia-smi
|
||||
|
||||
- name: Tailscale # In order to be able to SSH when a test fails
|
||||
uses: huggingface/tailscale-action@v1
|
||||
with:
|
||||
authkey: ${{ secrets.TAILSCALE_SSH_AUTHKEY }}
|
||||
slackChannel: ${{ secrets.SLACK_CIFEEDBACK_CHANNEL }}
|
||||
slackToken: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
|
||||
waitForSSH: true
|
||||
30
.github/workflows/update_metadata.yml
vendored
Normal file
30
.github/workflows/update_metadata.yml
vendored
Normal file
@@ -0,0 +1,30 @@
|
||||
name: Update Diffusers metadata
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
- update_diffusers_metadata*
|
||||
|
||||
jobs:
|
||||
update_metadata:
|
||||
runs-on: ubuntu-22.04
|
||||
defaults:
|
||||
run:
|
||||
shell: bash -l {0}
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
|
||||
- name: Setup environment
|
||||
run: |
|
||||
pip install --upgrade pip
|
||||
pip install datasets pandas
|
||||
pip install .[torch]
|
||||
|
||||
- name: Update metadata
|
||||
env:
|
||||
HUGGING_FACE_HUB_TOKEN: ${{ secrets.SAYAK_HF_TOKEN }}
|
||||
run: |
|
||||
python utils/update_metadata.py --commit_sha ${{ github.sha }}
|
||||
10
CITATION.cff
10
CITATION.cff
@@ -19,6 +19,16 @@ authors:
|
||||
family-names: Rasul
|
||||
- given-names: Mishig
|
||||
family-names: Davaadorj
|
||||
- given-names: Dhruv
|
||||
family-names: Nair
|
||||
- given-names: Sayak
|
||||
family-names: Paul
|
||||
- given-names: Steven
|
||||
family-names: Liu
|
||||
- given-names: William
|
||||
family-names: Berman
|
||||
- given-names: Yiyi
|
||||
family-names: Xu
|
||||
- given-names: Thomas
|
||||
family-names: Wolf
|
||||
repository-code: 'https://github.com/huggingface/diffusers'
|
||||
|
||||
2
Makefile
2
Makefile
@@ -42,6 +42,7 @@ repo-consistency:
|
||||
quality:
|
||||
ruff check $(check_dirs) setup.py
|
||||
ruff format --check $(check_dirs) setup.py
|
||||
doc-builder style src/diffusers docs/source --max_len 119 --check_only
|
||||
python utils/check_doc_toc.py
|
||||
|
||||
# Format source code automatically and check is there are any problems left that need manual fixing
|
||||
@@ -55,6 +56,7 @@ extra_style_checks:
|
||||
style:
|
||||
ruff check $(check_dirs) setup.py --fix
|
||||
ruff format $(check_dirs) setup.py
|
||||
doc-builder style src/diffusers docs/source --max_len 119
|
||||
${MAKE} autogenerate_code
|
||||
${MAKE} extra_style_checks
|
||||
|
||||
|
||||
@@ -77,7 +77,7 @@ Please refer to the [How to use Stable Diffusion in Apple Silicon](https://huggi
|
||||
|
||||
## Quickstart
|
||||
|
||||
Generating outputs is super easy with 🤗 Diffusers. To generate an image from text, use the `from_pretrained` method to load any pretrained diffusion model (browse the [Hub](https://huggingface.co/models?library=diffusers&sort=downloads) for 19000+ checkpoints):
|
||||
Generating outputs is super easy with 🤗 Diffusers. To generate an image from text, use the `from_pretrained` method to load any pretrained diffusion model (browse the [Hub](https://huggingface.co/models?library=diffusers&sort=downloads) for 22000+ checkpoints):
|
||||
|
||||
```python
|
||||
from diffusers import DiffusionPipeline
|
||||
@@ -219,7 +219,7 @@ Also, say 👋 in our public Discord channel <a href="https://discord.gg/G7tWnz9
|
||||
- https://github.com/deep-floyd/IF
|
||||
- https://github.com/bentoml/BentoML
|
||||
- https://github.com/bmaltais/kohya_ss
|
||||
- +8000 other amazing GitHub repositories 💪
|
||||
- +9000 other amazing GitHub repositories 💪
|
||||
|
||||
Thank you for using us ❤️.
|
||||
|
||||
@@ -238,7 +238,7 @@ We also want to thank @heejkoo for the very helpful overview of papers, code and
|
||||
|
||||
```bibtex
|
||||
@misc{von-platen-etal-2022-diffusers,
|
||||
author = {Patrick von Platen and Suraj Patil and Anton Lozhkov and Pedro Cuenca and Nathan Lambert and Kashif Rasul and Mishig Davaadorj and Thomas Wolf},
|
||||
author = {Patrick von Platen and Suraj Patil and Anton Lozhkov and Pedro Cuenca and Nathan Lambert and Kashif Rasul and Mishig Davaadorj and Dhruv Nair and Sayak Paul and William Berman and Yiyi Xu and Steven Liu and Thomas Wolf},
|
||||
title = {Diffusers: State-of-the-art diffusion models},
|
||||
year = {2022},
|
||||
publisher = {GitHub},
|
||||
|
||||
@@ -141,6 +141,7 @@ class LCMLoRATextToImageBenchmark(TextToImageBenchmark):
|
||||
super().__init__(args)
|
||||
self.pipe.load_lora_weights(self.lora_id)
|
||||
self.pipe.fuse_lora()
|
||||
self.pipe.unload_lora_weights()
|
||||
self.pipe.scheduler = LCMScheduler.from_config(self.pipe.scheduler.config)
|
||||
|
||||
def get_result_filepath(self, args):
|
||||
@@ -235,6 +236,35 @@ class InpaintingBenchmark(ImageToImageBenchmark):
|
||||
)
|
||||
|
||||
|
||||
class IPAdapterTextToImageBenchmark(TextToImageBenchmark):
|
||||
url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/load_neg_embed.png"
|
||||
image = load_image(url)
|
||||
|
||||
def __init__(self, args):
|
||||
pipe = self.pipeline_class.from_pretrained(args.ckpt, torch_dtype=torch.float16).to("cuda")
|
||||
pipe.load_ip_adapter(
|
||||
args.ip_adapter_id[0],
|
||||
subfolder="models" if "sdxl" not in args.ip_adapter_id[1] else "sdxl_models",
|
||||
weight_name=args.ip_adapter_id[1],
|
||||
)
|
||||
|
||||
if args.run_compile:
|
||||
pipe.unet.to(memory_format=torch.channels_last)
|
||||
print("Run torch compile")
|
||||
pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
|
||||
|
||||
pipe.set_progress_bar_config(disable=True)
|
||||
self.pipe = pipe
|
||||
|
||||
def run_inference(self, pipe, args):
|
||||
_ = pipe(
|
||||
prompt=PROMPT,
|
||||
ip_adapter_image=self.image,
|
||||
num_inference_steps=args.num_inference_steps,
|
||||
num_images_per_prompt=args.batch_size,
|
||||
)
|
||||
|
||||
|
||||
class ControlNetBenchmark(TextToImageBenchmark):
|
||||
pipeline_class = StableDiffusionControlNetPipeline
|
||||
aux_network_class = ControlNetModel
|
||||
|
||||
32
benchmarks/benchmark_ip_adapters.py
Normal file
32
benchmarks/benchmark_ip_adapters.py
Normal file
@@ -0,0 +1,32 @@
|
||||
import argparse
|
||||
import sys
|
||||
|
||||
|
||||
sys.path.append(".")
|
||||
from base_classes import IPAdapterTextToImageBenchmark # noqa: E402
|
||||
|
||||
|
||||
IP_ADAPTER_CKPTS = {
|
||||
"runwayml/stable-diffusion-v1-5": ("h94/IP-Adapter", "ip-adapter_sd15.bin"),
|
||||
"stabilityai/stable-diffusion-xl-base-1.0": ("h94/IP-Adapter", "ip-adapter_sdxl.bin"),
|
||||
}
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
"--ckpt",
|
||||
type=str,
|
||||
default="runwayml/stable-diffusion-v1-5",
|
||||
choices=list(IP_ADAPTER_CKPTS.keys()),
|
||||
)
|
||||
parser.add_argument("--batch_size", type=int, default=1)
|
||||
parser.add_argument("--num_inference_steps", type=int, default=50)
|
||||
parser.add_argument("--model_cpu_offload", action="store_true")
|
||||
parser.add_argument("--run_compile", action="store_true")
|
||||
args = parser.parse_args()
|
||||
|
||||
args.ip_adapter_id = IP_ADAPTER_CKPTS[args.ckpt]
|
||||
benchmark_pipe = IPAdapterTextToImageBenchmark(args)
|
||||
args.ckpt = f"{args.ckpt} (IP-Adapter)"
|
||||
benchmark_pipe.benchmark(args)
|
||||
@@ -72,7 +72,7 @@ def main():
|
||||
command += " --run_compile"
|
||||
run_command(command.split())
|
||||
|
||||
elif file == "benchmark_sd_inpainting.py":
|
||||
elif file in ["benchmark_sd_inpainting.py", "benchmark_ip_adapters.py"]:
|
||||
sdxl_ckpt = "stabilityai/stable-diffusion-xl-base-1.0"
|
||||
command = f"python {file} --ckpt {sdxl_ckpt}"
|
||||
run_command(command.split())
|
||||
|
||||
@@ -4,32 +4,36 @@ LABEL repository="diffusers"
|
||||
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
|
||||
RUN apt update && \
|
||||
apt install -y bash \
|
||||
build-essential \
|
||||
git \
|
||||
git-lfs \
|
||||
curl \
|
||||
ca-certificates \
|
||||
libsndfile1-dev \
|
||||
python3.8 \
|
||||
python3-pip \
|
||||
python3.8-venv && \
|
||||
RUN apt-get -y update \
|
||||
&& apt-get install -y software-properties-common \
|
||||
&& add-apt-repository ppa:deadsnakes/ppa
|
||||
|
||||
RUN apt install -y bash \
|
||||
build-essential \
|
||||
git \
|
||||
git-lfs \
|
||||
curl \
|
||||
ca-certificates \
|
||||
libsndfile1-dev \
|
||||
libgl1 \
|
||||
python3.10 \
|
||||
python3-pip \
|
||||
python3.10-venv && \
|
||||
rm -rf /var/lib/apt/lists
|
||||
|
||||
# make sure to use venv
|
||||
RUN python3 -m venv /opt/venv
|
||||
RUN python3.10 -m venv /opt/venv
|
||||
ENV PATH="/opt/venv/bin:$PATH"
|
||||
|
||||
# pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
|
||||
# follow the instructions here: https://cloud.google.com/tpu/docs/run-in-container#train_a_jax_model_in_a_docker_container
|
||||
RUN python3 -m pip install --no-cache-dir --upgrade pip && \
|
||||
python3 -m pip install --upgrade --no-cache-dir \
|
||||
RUN python3 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
|
||||
python3 -m uv pip install --upgrade --no-cache-dir \
|
||||
clu \
|
||||
"jax[cpu]>=0.2.16,!=0.3.2" \
|
||||
"flax>=0.4.1" \
|
||||
"jaxlib>=0.1.65" && \
|
||||
python3 -m pip install --no-cache-dir \
|
||||
python3 -m uv pip install --no-cache-dir \
|
||||
accelerate \
|
||||
datasets \
|
||||
hf-doc-builder \
|
||||
|
||||
@@ -4,34 +4,38 @@ LABEL repository="diffusers"
|
||||
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
|
||||
RUN apt update && \
|
||||
apt install -y bash \
|
||||
RUN apt-get -y update \
|
||||
&& apt-get install -y software-properties-common \
|
||||
&& add-apt-repository ppa:deadsnakes/ppa
|
||||
|
||||
RUN apt install -y bash \
|
||||
build-essential \
|
||||
git \
|
||||
git-lfs \
|
||||
curl \
|
||||
ca-certificates \
|
||||
libsndfile1-dev \
|
||||
python3.8 \
|
||||
libgl1 \
|
||||
python3.10 \
|
||||
python3-pip \
|
||||
python3.8-venv && \
|
||||
python3.10-venv && \
|
||||
rm -rf /var/lib/apt/lists
|
||||
|
||||
# make sure to use venv
|
||||
RUN python3 -m venv /opt/venv
|
||||
RUN python3.10 -m venv /opt/venv
|
||||
ENV PATH="/opt/venv/bin:$PATH"
|
||||
|
||||
# pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
|
||||
# follow the instructions here: https://cloud.google.com/tpu/docs/run-in-container#train_a_jax_model_in_a_docker_container
|
||||
RUN python3 -m pip install --no-cache-dir --upgrade pip && \
|
||||
RUN python3 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
|
||||
python3 -m pip install --no-cache-dir \
|
||||
"jax[tpu]>=0.2.16,!=0.3.2" \
|
||||
-f https://storage.googleapis.com/jax-releases/libtpu_releases.html && \
|
||||
python3 -m pip install --upgrade --no-cache-dir \
|
||||
python3 -m uv pip install --upgrade --no-cache-dir \
|
||||
clu \
|
||||
"flax>=0.4.1" \
|
||||
"jaxlib>=0.1.65" && \
|
||||
python3 -m pip install --no-cache-dir \
|
||||
python3 -m uv pip install --no-cache-dir \
|
||||
accelerate \
|
||||
datasets \
|
||||
hf-doc-builder \
|
||||
|
||||
@@ -4,32 +4,36 @@ LABEL repository="diffusers"
|
||||
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
|
||||
RUN apt update && \
|
||||
apt install -y bash \
|
||||
RUN apt-get -y update \
|
||||
&& apt-get install -y software-properties-common \
|
||||
&& add-apt-repository ppa:deadsnakes/ppa
|
||||
|
||||
RUN apt install -y bash \
|
||||
build-essential \
|
||||
git \
|
||||
git-lfs \
|
||||
curl \
|
||||
ca-certificates \
|
||||
libsndfile1-dev \
|
||||
python3.8 \
|
||||
libgl1 \
|
||||
python3.10 \
|
||||
python3-pip \
|
||||
python3.8-venv && \
|
||||
python3.10-venv && \
|
||||
rm -rf /var/lib/apt/lists
|
||||
|
||||
# make sure to use venv
|
||||
RUN python3 -m venv /opt/venv
|
||||
RUN python3.10 -m venv /opt/venv
|
||||
ENV PATH="/opt/venv/bin:$PATH"
|
||||
|
||||
# pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
|
||||
RUN python3 -m pip install --no-cache-dir --upgrade pip && \
|
||||
python3 -m pip install --no-cache-dir \
|
||||
RUN python3 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
|
||||
python3 -m uv pip install --no-cache-dir \
|
||||
torch==2.1.2 \
|
||||
torchvision==0.16.2 \
|
||||
torchaudio==2.1.2 \
|
||||
onnxruntime \
|
||||
--extra-index-url https://download.pytorch.org/whl/cpu && \
|
||||
python3 -m pip install --no-cache-dir \
|
||||
python3 -m uv pip install --no-cache-dir \
|
||||
accelerate \
|
||||
datasets \
|
||||
hf-doc-builder \
|
||||
|
||||
@@ -1,35 +1,39 @@
|
||||
FROM nvidia/cuda:11.6.2-cudnn8-devel-ubuntu20.04
|
||||
FROM nvidia/cuda:12.1.0-runtime-ubuntu20.04
|
||||
LABEL maintainer="Hugging Face"
|
||||
LABEL repository="diffusers"
|
||||
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
|
||||
RUN apt update && \
|
||||
apt install -y bash \
|
||||
RUN apt-get -y update \
|
||||
&& apt-get install -y software-properties-common \
|
||||
&& add-apt-repository ppa:deadsnakes/ppa
|
||||
|
||||
RUN apt install -y bash \
|
||||
build-essential \
|
||||
git \
|
||||
git-lfs \
|
||||
curl \
|
||||
ca-certificates \
|
||||
libsndfile1-dev \
|
||||
python3.8 \
|
||||
libgl1 \
|
||||
python3.10 \
|
||||
python3-pip \
|
||||
python3.8-venv && \
|
||||
python3.10-venv && \
|
||||
rm -rf /var/lib/apt/lists
|
||||
|
||||
# make sure to use venv
|
||||
RUN python3 -m venv /opt/venv
|
||||
RUN python3.10 -m venv /opt/venv
|
||||
ENV PATH="/opt/venv/bin:$PATH"
|
||||
|
||||
# pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
|
||||
RUN python3 -m pip install --no-cache-dir --upgrade pip && \
|
||||
python3 -m pip install --no-cache-dir \
|
||||
torch==2.1.2 \
|
||||
torchvision==0.16.2 \
|
||||
torchaudio==2.1.2 \
|
||||
RUN python3.10 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
|
||||
python3.10 -m uv pip install --no-cache-dir \
|
||||
torch \
|
||||
torchvision \
|
||||
torchaudio \
|
||||
"onnxruntime-gpu>=1.13.1" \
|
||||
--extra-index-url https://download.pytorch.org/whl/cu117 && \
|
||||
python3 -m pip install --no-cache-dir \
|
||||
python3.10 -m uv pip install --no-cache-dir \
|
||||
accelerate \
|
||||
datasets \
|
||||
hf-doc-builder \
|
||||
|
||||
@@ -4,8 +4,11 @@ LABEL repository="diffusers"
|
||||
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
|
||||
RUN apt update && \
|
||||
apt install -y bash \
|
||||
RUN apt-get -y update \
|
||||
&& apt-get install -y software-properties-common \
|
||||
&& add-apt-repository ppa:deadsnakes/ppa
|
||||
|
||||
RUN apt install -y bash \
|
||||
build-essential \
|
||||
git \
|
||||
git-lfs \
|
||||
@@ -13,24 +16,23 @@ RUN apt update && \
|
||||
ca-certificates \
|
||||
libsndfile1-dev \
|
||||
libgl1 \
|
||||
python3.9 \
|
||||
python3.9-dev \
|
||||
python3.10 \
|
||||
python3-pip \
|
||||
python3.9-venv && \
|
||||
python3.10-venv && \
|
||||
rm -rf /var/lib/apt/lists
|
||||
|
||||
# make sure to use venv
|
||||
RUN python3.9 -m venv /opt/venv
|
||||
RUN python3.10 -m venv /opt/venv
|
||||
ENV PATH="/opt/venv/bin:$PATH"
|
||||
|
||||
# pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
|
||||
RUN python3.9 -m pip install --no-cache-dir --upgrade pip && \
|
||||
python3.9 -m pip install --no-cache-dir \
|
||||
RUN python3.10 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
|
||||
python3.10 -m uv pip install --no-cache-dir \
|
||||
torch \
|
||||
torchvision \
|
||||
torchaudio \
|
||||
invisible_watermark && \
|
||||
python3.9 -m pip install --no-cache-dir \
|
||||
python3.10 -m pip install --no-cache-dir \
|
||||
accelerate \
|
||||
datasets \
|
||||
hf-doc-builder \
|
||||
|
||||
@@ -4,33 +4,36 @@ LABEL repository="diffusers"
|
||||
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
|
||||
RUN apt update && \
|
||||
apt install -y bash \
|
||||
RUN apt-get -y update \
|
||||
&& apt-get install -y software-properties-common \
|
||||
&& add-apt-repository ppa:deadsnakes/ppa
|
||||
|
||||
RUN apt install -y bash \
|
||||
build-essential \
|
||||
git \
|
||||
git-lfs \
|
||||
curl \
|
||||
ca-certificates \
|
||||
libsndfile1-dev \
|
||||
python3.8 \
|
||||
python3.10 \
|
||||
python3-pip \
|
||||
libgl1 \
|
||||
python3.8-venv && \
|
||||
python3.10-venv && \
|
||||
rm -rf /var/lib/apt/lists
|
||||
|
||||
# make sure to use venv
|
||||
RUN python3 -m venv /opt/venv
|
||||
RUN python3.10 -m venv /opt/venv
|
||||
ENV PATH="/opt/venv/bin:$PATH"
|
||||
|
||||
# pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
|
||||
RUN python3 -m pip install --no-cache-dir --upgrade pip && \
|
||||
python3 -m pip install --no-cache-dir \
|
||||
RUN python3.10 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
|
||||
python3.10 -m uv pip install --no-cache-dir \
|
||||
torch \
|
||||
torchvision \
|
||||
torchaudio \
|
||||
invisible_watermark \
|
||||
--extra-index-url https://download.pytorch.org/whl/cpu && \
|
||||
python3 -m pip install --no-cache-dir \
|
||||
python3.10 -m uv pip install --no-cache-dir \
|
||||
accelerate \
|
||||
datasets \
|
||||
hf-doc-builder \
|
||||
@@ -40,6 +43,6 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip && \
|
||||
numpy \
|
||||
scipy \
|
||||
tensorboard \
|
||||
transformers
|
||||
transformers matplotlib
|
||||
|
||||
CMD ["/bin/bash"]
|
||||
|
||||
@@ -4,8 +4,11 @@ LABEL repository="diffusers"
|
||||
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
|
||||
RUN apt update && \
|
||||
apt install -y bash \
|
||||
RUN apt-get -y update \
|
||||
&& apt-get install -y software-properties-common \
|
||||
&& add-apt-repository ppa:deadsnakes/ppa
|
||||
|
||||
RUN apt install -y bash \
|
||||
build-essential \
|
||||
git \
|
||||
git-lfs \
|
||||
@@ -13,23 +16,23 @@ RUN apt update && \
|
||||
ca-certificates \
|
||||
libsndfile1-dev \
|
||||
libgl1 \
|
||||
python3.8 \
|
||||
python3.10 \
|
||||
python3-pip \
|
||||
python3.8-venv && \
|
||||
python3.10-venv && \
|
||||
rm -rf /var/lib/apt/lists
|
||||
|
||||
# make sure to use venv
|
||||
RUN python3 -m venv /opt/venv
|
||||
RUN python3.10 -m venv /opt/venv
|
||||
ENV PATH="/opt/venv/bin:$PATH"
|
||||
|
||||
# pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
|
||||
RUN python3 -m pip install --no-cache-dir --upgrade pip && \
|
||||
python3 -m pip install --no-cache-dir \
|
||||
RUN python3.10 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
|
||||
python3.10 -m uv pip install --no-cache-dir \
|
||||
torch \
|
||||
torchvision \
|
||||
torchaudio \
|
||||
invisible_watermark && \
|
||||
python3 -m pip install --no-cache-dir \
|
||||
python3.10 -m pip install --no-cache-dir \
|
||||
accelerate \
|
||||
datasets \
|
||||
hf-doc-builder \
|
||||
|
||||
@@ -4,8 +4,11 @@ LABEL repository="diffusers"
|
||||
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
|
||||
RUN apt update && \
|
||||
apt install -y bash \
|
||||
RUN apt-get -y update \
|
||||
&& apt-get install -y software-properties-common \
|
||||
&& add-apt-repository ppa:deadsnakes/ppa
|
||||
|
||||
RUN apt install -y bash \
|
||||
build-essential \
|
||||
git \
|
||||
git-lfs \
|
||||
@@ -13,23 +16,23 @@ RUN apt update && \
|
||||
ca-certificates \
|
||||
libsndfile1-dev \
|
||||
libgl1 \
|
||||
python3.8 \
|
||||
python3.10 \
|
||||
python3-pip \
|
||||
python3.8-venv && \
|
||||
python3.10-venv && \
|
||||
rm -rf /var/lib/apt/lists
|
||||
|
||||
# make sure to use venv
|
||||
RUN python3 -m venv /opt/venv
|
||||
RUN python3.10 -m venv /opt/venv
|
||||
ENV PATH="/opt/venv/bin:$PATH"
|
||||
|
||||
# pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
|
||||
RUN python3 -m pip install --no-cache-dir --upgrade pip && \
|
||||
python3 -m pip install --no-cache-dir \
|
||||
RUN python3.10 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
|
||||
python3.10 -m pip install --no-cache-dir \
|
||||
torch \
|
||||
torchvision \
|
||||
torchaudio \
|
||||
invisible_watermark && \
|
||||
python3 -m pip install --no-cache-dir \
|
||||
python3.10 -m uv pip install --no-cache-dir \
|
||||
accelerate \
|
||||
datasets \
|
||||
hf-doc-builder \
|
||||
|
||||
@@ -242,10 +242,10 @@ Here's an example of a tuple return, comprising several objects:
|
||||
|
||||
```
|
||||
Returns:
|
||||
`tuple(torch.FloatTensor)` comprising various elements depending on the configuration ([`BertConfig`]) and inputs:
|
||||
- ** loss** (*optional*, returned when `masked_lm_labels` is provided) `torch.FloatTensor` of shape `(1,)` --
|
||||
`tuple(torch.Tensor)` comprising various elements depending on the configuration ([`BertConfig`]) and inputs:
|
||||
- ** loss** (*optional*, returned when `masked_lm_labels` is provided) `torch.Tensor` of shape `(1,)` --
|
||||
Total loss is the sum of the masked language modeling loss and the next sequence prediction (classification) loss.
|
||||
- **prediction_scores** (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`) --
|
||||
- **prediction_scores** (`torch.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`) --
|
||||
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
|
||||
```
|
||||
|
||||
|
||||
@@ -18,151 +18,139 @@
|
||||
- local: tutorials/basic_training
|
||||
title: Train a diffusion model
|
||||
- local: tutorials/using_peft_for_inference
|
||||
title: Inference with PEFT
|
||||
title: Load LoRAs for inference
|
||||
- local: tutorials/fast_diffusion
|
||||
title: Accelerate inference of text-to-image diffusion models
|
||||
title: Tutorials
|
||||
- sections:
|
||||
- sections:
|
||||
- local: using-diffusers/loading_overview
|
||||
title: Overview
|
||||
- local: using-diffusers/loading
|
||||
title: Load pipelines, models, and schedulers
|
||||
- local: using-diffusers/schedulers
|
||||
title: Load and compare different schedulers
|
||||
- local: using-diffusers/custom_pipeline_overview
|
||||
title: Load community pipelines and components
|
||||
- local: using-diffusers/using_safetensors
|
||||
title: Load safetensors
|
||||
- local: using-diffusers/other-formats
|
||||
title: Load different Stable Diffusion formats
|
||||
- local: using-diffusers/loading_adapters
|
||||
title: Load adapters
|
||||
- local: using-diffusers/push_to_hub
|
||||
title: Push files to the Hub
|
||||
title: Loading & Hub
|
||||
- sections:
|
||||
- local: using-diffusers/pipeline_overview
|
||||
title: Overview
|
||||
- local: using-diffusers/unconditional_image_generation
|
||||
title: Unconditional image generation
|
||||
- local: using-diffusers/conditional_image_generation
|
||||
title: Text-to-image
|
||||
- local: using-diffusers/img2img
|
||||
title: Image-to-image
|
||||
- local: using-diffusers/inpaint
|
||||
title: Inpainting
|
||||
- local: using-diffusers/depth2img
|
||||
title: Depth-to-image
|
||||
title: Tasks
|
||||
- sections:
|
||||
- local: using-diffusers/textual_inversion_inference
|
||||
title: Textual inversion
|
||||
- local: training/distributed_inference
|
||||
title: Distributed inference with multiple GPUs
|
||||
- local: using-diffusers/reusing_seeds
|
||||
title: Improve image quality with deterministic generation
|
||||
- local: using-diffusers/control_brightness
|
||||
title: Control image brightness
|
||||
- local: using-diffusers/weighted_prompts
|
||||
title: Prompt weighting
|
||||
- local: using-diffusers/freeu
|
||||
title: Improve generation quality with FreeU
|
||||
title: Techniques
|
||||
- sections:
|
||||
- local: using-diffusers/pipeline_overview
|
||||
title: Overview
|
||||
- local: using-diffusers/sdxl
|
||||
title: Stable Diffusion XL
|
||||
- local: using-diffusers/sdxl_turbo
|
||||
title: SDXL Turbo
|
||||
- local: using-diffusers/kandinsky
|
||||
title: Kandinsky
|
||||
- local: using-diffusers/controlnet
|
||||
title: ControlNet
|
||||
- local: using-diffusers/shap-e
|
||||
title: Shap-E
|
||||
- local: using-diffusers/diffedit
|
||||
title: DiffEdit
|
||||
- local: using-diffusers/distilled_sd
|
||||
title: Distilled Stable Diffusion inference
|
||||
- local: using-diffusers/callback
|
||||
title: Pipeline callbacks
|
||||
- local: using-diffusers/reproducibility
|
||||
title: Create reproducible pipelines
|
||||
- local: using-diffusers/custom_pipeline_examples
|
||||
title: Community pipelines
|
||||
- local: using-diffusers/contribute_pipeline
|
||||
title: Contribute a community pipeline
|
||||
- local: using-diffusers/inference_with_lcm_lora
|
||||
title: Latent Consistency Model-LoRA
|
||||
- local: using-diffusers/inference_with_lcm
|
||||
title: Latent Consistency Model
|
||||
- local: using-diffusers/svd
|
||||
title: Stable Video Diffusion
|
||||
title: Specific pipeline examples
|
||||
- sections:
|
||||
- local: training/overview
|
||||
title: Overview
|
||||
- local: training/create_dataset
|
||||
title: Create a dataset for training
|
||||
- local: training/adapt_a_model
|
||||
title: Adapt a model to a new task
|
||||
- sections:
|
||||
- local: training/unconditional_training
|
||||
title: Unconditional image generation
|
||||
- local: training/text2image
|
||||
title: Text-to-image
|
||||
- local: training/sdxl
|
||||
title: Stable Diffusion XL
|
||||
- local: training/kandinsky
|
||||
title: Kandinsky 2.2
|
||||
- local: training/wuerstchen
|
||||
title: Wuerstchen
|
||||
- local: training/controlnet
|
||||
title: ControlNet
|
||||
- local: training/t2i_adapters
|
||||
title: T2I-Adapters
|
||||
- local: training/instructpix2pix
|
||||
title: InstructPix2Pix
|
||||
title: Models
|
||||
- sections:
|
||||
- local: training/text_inversion
|
||||
title: Textual Inversion
|
||||
- local: training/dreambooth
|
||||
title: DreamBooth
|
||||
- local: training/lora
|
||||
title: LoRA
|
||||
- local: training/custom_diffusion
|
||||
title: Custom Diffusion
|
||||
- local: training/lcm_distill
|
||||
title: Latent Consistency Distillation
|
||||
- local: training/ddpo
|
||||
title: Reinforcement learning training with DDPO
|
||||
title: Methods
|
||||
title: Training
|
||||
- sections:
|
||||
- local: using-diffusers/other-modalities
|
||||
title: Other Modalities
|
||||
title: Taking Diffusers Beyond Images
|
||||
title: Using Diffusers
|
||||
- local: using-diffusers/loading
|
||||
title: Load pipelines
|
||||
- local: using-diffusers/custom_pipeline_overview
|
||||
title: Load community pipelines and components
|
||||
- local: using-diffusers/schedulers
|
||||
title: Load schedulers and models
|
||||
- local: using-diffusers/using_safetensors
|
||||
title: Load safetensors
|
||||
- local: using-diffusers/other-formats
|
||||
title: Load different Stable Diffusion formats
|
||||
- local: using-diffusers/loading_adapters
|
||||
title: Load adapters
|
||||
- local: using-diffusers/push_to_hub
|
||||
title: Push files to the Hub
|
||||
title: Load pipelines and adapters
|
||||
- sections:
|
||||
- local: optimization/opt_overview
|
||||
- local: using-diffusers/unconditional_image_generation
|
||||
title: Unconditional image generation
|
||||
- local: using-diffusers/conditional_image_generation
|
||||
title: Text-to-image
|
||||
- local: using-diffusers/img2img
|
||||
title: Image-to-image
|
||||
- local: using-diffusers/inpaint
|
||||
title: Inpainting
|
||||
- local: using-diffusers/text-img2vid
|
||||
title: Text or image-to-video
|
||||
- local: using-diffusers/depth2img
|
||||
title: Depth-to-image
|
||||
title: Generative tasks
|
||||
- sections:
|
||||
- local: using-diffusers/overview_techniques
|
||||
title: Overview
|
||||
- local: training/distributed_inference
|
||||
title: Distributed inference with multiple GPUs
|
||||
- local: using-diffusers/merge_loras
|
||||
title: Merge LoRAs
|
||||
- local: using-diffusers/callback
|
||||
title: Pipeline callbacks
|
||||
- local: using-diffusers/reusing_seeds
|
||||
title: Reproducible pipelines
|
||||
- local: using-diffusers/image_quality
|
||||
title: Controlling image quality
|
||||
- local: using-diffusers/weighted_prompts
|
||||
title: Prompt techniques
|
||||
title: Inference techniques
|
||||
- sections:
|
||||
- local: using-diffusers/sdxl
|
||||
title: Stable Diffusion XL
|
||||
- local: using-diffusers/sdxl_turbo
|
||||
title: SDXL Turbo
|
||||
- local: using-diffusers/kandinsky
|
||||
title: Kandinsky
|
||||
- local: using-diffusers/ip_adapter
|
||||
title: IP-Adapter
|
||||
- local: using-diffusers/controlnet
|
||||
title: ControlNet
|
||||
- local: using-diffusers/t2i_adapter
|
||||
title: T2I-Adapter
|
||||
- local: using-diffusers/inference_with_lcm
|
||||
title: Latent Consistency Model
|
||||
- local: using-diffusers/textual_inversion_inference
|
||||
title: Textual inversion
|
||||
- local: using-diffusers/shap-e
|
||||
title: Shap-E
|
||||
- local: using-diffusers/diffedit
|
||||
title: DiffEdit
|
||||
- local: using-diffusers/inference_with_tcd_lora
|
||||
title: Trajectory Consistency Distillation-LoRA
|
||||
- local: using-diffusers/svd
|
||||
title: Stable Video Diffusion
|
||||
title: Specific pipeline examples
|
||||
- sections:
|
||||
- local: training/overview
|
||||
title: Overview
|
||||
- local: training/create_dataset
|
||||
title: Create a dataset for training
|
||||
- local: training/adapt_a_model
|
||||
title: Adapt a model to a new task
|
||||
- sections:
|
||||
- local: optimization/fp16
|
||||
title: Speed up inference
|
||||
- local: optimization/memory
|
||||
title: Reduce memory usage
|
||||
- local: optimization/torch2.0
|
||||
title: PyTorch 2.0
|
||||
- local: optimization/xformers
|
||||
title: xFormers
|
||||
- local: optimization/tome
|
||||
title: Token merging
|
||||
- local: optimization/deepcache
|
||||
title: DeepCache
|
||||
title: General optimizations
|
||||
- local: training/unconditional_training
|
||||
title: Unconditional image generation
|
||||
- local: training/text2image
|
||||
title: Text-to-image
|
||||
- local: training/sdxl
|
||||
title: Stable Diffusion XL
|
||||
- local: training/kandinsky
|
||||
title: Kandinsky 2.2
|
||||
- local: training/wuerstchen
|
||||
title: Wuerstchen
|
||||
- local: training/controlnet
|
||||
title: ControlNet
|
||||
- local: training/t2i_adapters
|
||||
title: T2I-Adapters
|
||||
- local: training/instructpix2pix
|
||||
title: InstructPix2Pix
|
||||
title: Models
|
||||
isExpanded: false
|
||||
- sections:
|
||||
- local: training/text_inversion
|
||||
title: Textual Inversion
|
||||
- local: training/dreambooth
|
||||
title: DreamBooth
|
||||
- local: training/lora
|
||||
title: LoRA
|
||||
- local: training/custom_diffusion
|
||||
title: Custom Diffusion
|
||||
- local: training/lcm_distill
|
||||
title: Latent Consistency Distillation
|
||||
- local: training/ddpo
|
||||
title: Reinforcement learning training with DDPO
|
||||
title: Methods
|
||||
isExpanded: false
|
||||
title: Training
|
||||
- sections:
|
||||
- local: optimization/fp16
|
||||
title: Speed up inference
|
||||
- local: optimization/memory
|
||||
title: Reduce memory usage
|
||||
- local: optimization/torch2.0
|
||||
title: PyTorch 2.0
|
||||
- local: optimization/xformers
|
||||
title: xFormers
|
||||
- local: optimization/tome
|
||||
title: Token merging
|
||||
- local: optimization/deepcache
|
||||
title: DeepCache
|
||||
- local: optimization/tgate
|
||||
title: TGATE
|
||||
- sections:
|
||||
- local: using-diffusers/stable_diffusion_jax_how_to
|
||||
title: JAX/Flax
|
||||
@@ -172,14 +160,14 @@
|
||||
title: OpenVINO
|
||||
- local: optimization/coreml
|
||||
title: Core ML
|
||||
title: Optimized model types
|
||||
title: Optimized model formats
|
||||
- sections:
|
||||
- local: optimization/mps
|
||||
title: Metal Performance Shaders (MPS)
|
||||
- local: optimization/habana
|
||||
title: Habana Gaudi
|
||||
title: Optimized hardware
|
||||
title: Optimization
|
||||
title: Accelerate inference and reduce memory
|
||||
- sections:
|
||||
- local: conceptual/philosophy
|
||||
title: Philosophy
|
||||
@@ -201,6 +189,7 @@
|
||||
- local: api/outputs
|
||||
title: Outputs
|
||||
title: Main Classes
|
||||
isExpanded: false
|
||||
- sections:
|
||||
- local: api/loaders/ip_adapter
|
||||
title: IP-Adapter
|
||||
@@ -215,6 +204,7 @@
|
||||
- local: api/loaders/peft
|
||||
title: PEFT
|
||||
title: Loaders
|
||||
isExpanded: false
|
||||
- sections:
|
||||
- local: api/models/overview
|
||||
title: Overview
|
||||
@@ -249,6 +239,7 @@
|
||||
- local: api/models/controlnet
|
||||
title: ControlNet
|
||||
title: Models
|
||||
isExpanded: false
|
||||
- sections:
|
||||
- local: api/pipelines/overview
|
||||
title: Overview
|
||||
@@ -272,6 +263,10 @@
|
||||
title: ControlNet
|
||||
- local: api/pipelines/controlnet_sdxl
|
||||
title: ControlNet with Stable Diffusion XL
|
||||
- local: api/pipelines/controlnetxs
|
||||
title: ControlNet-XS
|
||||
- local: api/pipelines/controlnetxs_sdxl
|
||||
title: ControlNet-XS with Stable Diffusion XL
|
||||
- local: api/pipelines/dance_diffusion
|
||||
title: Dance Diffusion
|
||||
- local: api/pipelines/ddim
|
||||
@@ -298,6 +293,8 @@
|
||||
title: Latent Consistency Models
|
||||
- local: api/pipelines/latent_diffusion
|
||||
title: Latent Diffusion
|
||||
- local: api/pipelines/ledits_pp
|
||||
title: LEDITS++
|
||||
- local: api/pipelines/panorama
|
||||
title: MultiDiffusion
|
||||
- local: api/pipelines/musicldm
|
||||
@@ -314,6 +311,8 @@
|
||||
title: Semantic Guidance
|
||||
- local: api/pipelines/shap_e
|
||||
title: Shap-E
|
||||
- local: api/pipelines/stable_cascade
|
||||
title: Stable Cascade
|
||||
- sections:
|
||||
- local: api/pipelines/stable_diffusion/overview
|
||||
title: Overview
|
||||
@@ -321,6 +320,8 @@
|
||||
title: Text-to-image
|
||||
- local: api/pipelines/stable_diffusion/img2img
|
||||
title: Image-to-image
|
||||
- local: api/pipelines/stable_diffusion/svd
|
||||
title: Image-to-video
|
||||
- local: api/pipelines/stable_diffusion/inpaint
|
||||
title: Inpainting
|
||||
- local: api/pipelines/stable_diffusion/depth2img
|
||||
@@ -344,7 +345,7 @@
|
||||
- local: api/pipelines/stable_diffusion/ldm3d_diffusion
|
||||
title: LDM3D Text-to-(RGB, Depth), Text-to-(RGB-pano, Depth-pano), LDM3D Upscaler
|
||||
- local: api/pipelines/stable_diffusion/adapter
|
||||
title: Stable Diffusion T2I-Adapter
|
||||
title: T2I-Adapter
|
||||
- local: api/pipelines/stable_diffusion/gligen
|
||||
title: GLIGEN (Grounded Language-to-Image Generation)
|
||||
title: Stable Diffusion
|
||||
@@ -363,6 +364,7 @@
|
||||
- local: api/pipelines/wuerstchen
|
||||
title: Wuerstchen
|
||||
title: Pipelines
|
||||
isExpanded: false
|
||||
- sections:
|
||||
- local: api/schedulers/overview
|
||||
title: Overview
|
||||
@@ -386,6 +388,10 @@
|
||||
title: DPMSolverSDEScheduler
|
||||
- local: api/schedulers/singlestep_dpm_solver
|
||||
title: DPMSolverSinglestepScheduler
|
||||
- local: api/schedulers/edm_multistep_dpm_solver
|
||||
title: EDMDPMSolverMultistepScheduler
|
||||
- local: api/schedulers/edm_euler
|
||||
title: EDMEulerScheduler
|
||||
- local: api/schedulers/euler_ancestral
|
||||
title: EulerAncestralDiscreteScheduler
|
||||
- local: api/schedulers/euler
|
||||
@@ -412,11 +418,14 @@
|
||||
title: ScoreSdeVeScheduler
|
||||
- local: api/schedulers/score_sde_vp
|
||||
title: ScoreSdeVpScheduler
|
||||
- local: api/schedulers/tcd
|
||||
title: TCDScheduler
|
||||
- local: api/schedulers/unipc
|
||||
title: UniPCMultistepScheduler
|
||||
- local: api/schedulers/vq_diffusion
|
||||
title: VQDiffusionScheduler
|
||||
title: Schedulers
|
||||
isExpanded: false
|
||||
- sections:
|
||||
- local: api/internal_classes_overview
|
||||
title: Overview
|
||||
@@ -430,5 +439,8 @@
|
||||
title: Utilities
|
||||
- local: api/image_processor
|
||||
title: VAE Image Processor
|
||||
- local: api/video_processor
|
||||
title: Video Processor
|
||||
title: Internal classes
|
||||
isExpanded: false
|
||||
title: API
|
||||
|
||||
@@ -20,14 +20,14 @@ An attention processor is a class for applying different types of attention mech
|
||||
## AttnProcessor2_0
|
||||
[[autodoc]] models.attention_processor.AttnProcessor2_0
|
||||
|
||||
## FusedAttnProcessor2_0
|
||||
[[autodoc]] models.attention_processor.FusedAttnProcessor2_0
|
||||
## AttnAddedKVProcessor
|
||||
[[autodoc]] models.attention_processor.AttnAddedKVProcessor
|
||||
|
||||
## LoRAAttnProcessor
|
||||
[[autodoc]] models.attention_processor.LoRAAttnProcessor
|
||||
## AttnAddedKVProcessor2_0
|
||||
[[autodoc]] models.attention_processor.AttnAddedKVProcessor2_0
|
||||
|
||||
## LoRAAttnProcessor2_0
|
||||
[[autodoc]] models.attention_processor.LoRAAttnProcessor2_0
|
||||
## CrossFrameAttnProcessor
|
||||
[[autodoc]] pipelines.text_to_video_synthesis.pipeline_text_to_video_zero.CrossFrameAttnProcessor
|
||||
|
||||
## CustomDiffusionAttnProcessor
|
||||
[[autodoc]] models.attention_processor.CustomDiffusionAttnProcessor
|
||||
@@ -35,26 +35,26 @@ An attention processor is a class for applying different types of attention mech
|
||||
## CustomDiffusionAttnProcessor2_0
|
||||
[[autodoc]] models.attention_processor.CustomDiffusionAttnProcessor2_0
|
||||
|
||||
## AttnAddedKVProcessor
|
||||
[[autodoc]] models.attention_processor.AttnAddedKVProcessor
|
||||
## CustomDiffusionXFormersAttnProcessor
|
||||
[[autodoc]] models.attention_processor.CustomDiffusionXFormersAttnProcessor
|
||||
|
||||
## AttnAddedKVProcessor2_0
|
||||
[[autodoc]] models.attention_processor.AttnAddedKVProcessor2_0
|
||||
## FusedAttnProcessor2_0
|
||||
[[autodoc]] models.attention_processor.FusedAttnProcessor2_0
|
||||
|
||||
## LoRAAttnAddedKVProcessor
|
||||
[[autodoc]] models.attention_processor.LoRAAttnAddedKVProcessor
|
||||
|
||||
## XFormersAttnProcessor
|
||||
[[autodoc]] models.attention_processor.XFormersAttnProcessor
|
||||
|
||||
## LoRAXFormersAttnProcessor
|
||||
[[autodoc]] models.attention_processor.LoRAXFormersAttnProcessor
|
||||
|
||||
## CustomDiffusionXFormersAttnProcessor
|
||||
[[autodoc]] models.attention_processor.CustomDiffusionXFormersAttnProcessor
|
||||
|
||||
## SlicedAttnProcessor
|
||||
[[autodoc]] models.attention_processor.SlicedAttnProcessor
|
||||
|
||||
## SlicedAttnAddedKVProcessor
|
||||
[[autodoc]] models.attention_processor.SlicedAttnAddedKVProcessor
|
||||
|
||||
## XFormersAttnProcessor
|
||||
[[autodoc]] models.attention_processor.XFormersAttnProcessor
|
||||
|
||||
## AttnProcessorNPU
|
||||
[[autodoc]] models.attention_processor.AttnProcessorNPU
|
||||
|
||||
@@ -25,3 +25,11 @@ All pipelines with [`VaeImageProcessor`] accept PIL Image, PyTorch tensor, or Nu
|
||||
The [`VaeImageProcessorLDM3D`] accepts RGB and depth inputs and returns RGB and depth outputs.
|
||||
|
||||
[[autodoc]] image_processor.VaeImageProcessorLDM3D
|
||||
|
||||
## PixArtImageProcessor
|
||||
|
||||
[[autodoc]] image_processor.PixArtImageProcessor
|
||||
|
||||
## IPAdapterMaskProcessor
|
||||
|
||||
[[autodoc]] image_processor.IPAdapterMaskProcessor
|
||||
|
||||
@@ -12,14 +12,18 @@ specific language governing permissions and limitations under the License.
|
||||
|
||||
# IP-Adapter
|
||||
|
||||
[IP-Adapter](https://hf.co/papers/2308.06721) is a lightweight adapter that enables prompting a diffusion model with an image. This method decouples the cross-attention layers of the image and text features. The image features are generated from an image encoder. Files generated from IP-Adapter are only ~100MBs.
|
||||
[IP-Adapter](https://hf.co/papers/2308.06721) is a lightweight adapter that enables prompting a diffusion model with an image. This method decouples the cross-attention layers of the image and text features. The image features are generated from an image encoder.
|
||||
|
||||
<Tip>
|
||||
|
||||
Learn how to load an IP-Adapter checkpoint and image in the [IP-Adapter](../../using-diffusers/loading_adapters#ip-adapter) loading guide.
|
||||
Learn how to load an IP-Adapter checkpoint and image in the IP-Adapter [loading](../../using-diffusers/loading_adapters#ip-adapter) guide, and you can see how to use it in the [usage](../../using-diffusers/ip_adapter) guide.
|
||||
|
||||
</Tip>
|
||||
|
||||
## IPAdapterMixin
|
||||
|
||||
[[autodoc]] loaders.ip_adapter.IPAdapterMixin
|
||||
|
||||
## IPAdapterMaskProcessor
|
||||
|
||||
[[autodoc]] image_processor.IPAdapterMaskProcessor
|
||||
@@ -10,13 +10,124 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Single files
|
||||
# Loading Pipelines and Models via `from_single_file`
|
||||
|
||||
Diffusers supports loading pretrained pipeline (or model) weights stored in a single file, such as a `ckpt` or `safetensors` file. These single file types are typically produced from community trained models. There are three classes for loading single file weights:
|
||||
The `from_single_file` method allows you to load supported pipelines using a single checkpoint file as opposed to the folder format used by Diffusers. This is useful if you are working with many of the Stable Diffusion Web UI's (such as A1111) that extensively rely on a single file to distribute all the components of a diffusion model.
|
||||
|
||||
- [`FromSingleFileMixin`] supports loading pretrained pipeline weights stored in a single file, which can either be a `ckpt` or `safetensors` file.
|
||||
- [`FromOriginalVAEMixin`] supports loading a pretrained [`AutoencoderKL`] from pretrained ControlNet weights stored in a single file, which can either be a `ckpt` or `safetensors` file.
|
||||
- [`FromOriginalControlnetMixin`] supports loading pretrained ControlNet weights stored in a single file, which can either be a `ckpt` or `safetensors` file.
|
||||
The `from_single_file` method also supports loading models in their originally distributed format. This means that supported models that have been finetuned with other services can be loaded directly into supported Diffusers model objects and pipelines.
|
||||
|
||||
## Pipelines that currently support `from_single_file` loading
|
||||
|
||||
- [`StableDiffusionPipeline`]
|
||||
- [`StableDiffusionImg2ImgPipeline`]
|
||||
- [`StableDiffusionInpaintPipeline`]
|
||||
- [`StableDiffusionControlNetPipeline`]
|
||||
- [`StableDiffusionControlNetImg2ImgPipeline`]
|
||||
- [`StableDiffusionControlNetInpaintPipeline`]
|
||||
- [`StableDiffusionUpscalePipeline`]
|
||||
- [`StableDiffusionXLPipeline`]
|
||||
- [`StableDiffusionXLImg2ImgPipeline`]
|
||||
- [`StableDiffusionXLInpaintPipeline`]
|
||||
- [`StableDiffusionXLInstructPix2PixPipeline`]
|
||||
- [`StableDiffusionXLControlNetPipeline`]
|
||||
- [`StableDiffusionXLKDiffusionPipeline`]
|
||||
- [`LatentConsistencyModelPipeline`]
|
||||
- [`LatentConsistencyModelImg2ImgPipeline`]
|
||||
- [`StableDiffusionControlNetXSPipeline`]
|
||||
- [`StableDiffusionXLControlNetXSPipeline`]
|
||||
- [`LEditsPPPipelineStableDiffusion`]
|
||||
- [`LEditsPPPipelineStableDiffusionXL`]
|
||||
- [`PIAPipeline`]
|
||||
|
||||
## Models that currently support `from_single_file` loading
|
||||
|
||||
- [`UNet2DConditionModel`]
|
||||
- [`StableCascadeUNet`]
|
||||
- [`AutoencoderKL`]
|
||||
- [`ControlNetModel`]
|
||||
|
||||
## Usage Examples
|
||||
|
||||
## Loading a Pipeline using `from_single_file`
|
||||
|
||||
```python
|
||||
from diffusers import StableDiffusionXLPipeline
|
||||
|
||||
ckpt_path = "https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/blob/main/sd_xl_base_1.0_0.9vae.safetensors"
|
||||
pipe = StableDiffusionXLPipeline.from_single_file(ckpt_path)
|
||||
```
|
||||
|
||||
## Setting components in a Pipeline using `from_single_file`
|
||||
|
||||
Swap components of the pipeline by passing them directly to the `from_single_file` method. e.g If you would like use a different scheduler than the pipeline default.
|
||||
|
||||
```python
|
||||
from diffusers import StableDiffusionXLPipeline, DDIMScheduler
|
||||
|
||||
ckpt_path = "https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/blob/main/sd_xl_base_1.0_0.9vae.safetensors"
|
||||
|
||||
scheduler = DDIMScheduler()
|
||||
pipe = StableDiffusionXLPipeline.from_single_file(ckpt_path, scheduler=scheduler)
|
||||
|
||||
```
|
||||
|
||||
```python
|
||||
from diffusers import StableDiffusionPipeline, ControlNetModel
|
||||
|
||||
ckpt_path = "https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.safetensors"
|
||||
|
||||
controlnet = ControlNetModel.from_pretrained("https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.safetensors")
|
||||
pipe = StableDiffusionPipeline.from_single_file(ckpt_path, controlnet=controlnet)
|
||||
|
||||
```
|
||||
|
||||
## Loading a Model using `from_single_file`
|
||||
|
||||
```python
|
||||
from diffusers import StableCascadeUNet
|
||||
|
||||
ckpt_path = "https://huggingface.co/stabilityai/stable-cascade/blob/main/stage_b_lite.safetensors"
|
||||
model = StableCascadeUNet.from_single_file(ckpt_path)
|
||||
|
||||
```
|
||||
|
||||
## Using a Diffusers model repository to configure single file loading
|
||||
|
||||
Under the hood, `from_single_file` will try to determine a model repository to use to configure the components of the pipeline. You can also pass in a repository id to the `config` argument of the `from_single_file` method to explicitly set the repository to use.
|
||||
|
||||
```python
|
||||
from diffusers import StableDiffusionXLPipeline
|
||||
|
||||
ckpt_path = "https://huggingface.co/segmind/SSD-1B/blob/main/SSD-1B.safetensors"
|
||||
repo_id = "segmind/SSD-1B"
|
||||
|
||||
pipe = StableDiffusionXLPipeline.from_single_file(ckpt_path, config=repo_id)
|
||||
|
||||
```
|
||||
|
||||
## Override configuration options when using single file loading
|
||||
|
||||
Override the default model or pipeline configuration options when using `from_single_file` by passing in the relevant arguments directly to the `from_single_file` method. Any argument that is supported by the model or pipeline class can be configured in this way:
|
||||
|
||||
```python
|
||||
from diffusers import StableDiffusionXLInstructPix2PixPipeline
|
||||
|
||||
ckpt_path = "https://huggingface.co/stabilityai/cosxl/blob/main/cosxl_edit.safetensors"
|
||||
pipe = StableDiffusionXLInstructPix2PixPipeline.from_single_file(ckpt_path, config="diffusers/sdxl-instructpix2pix-768", is_cosxl_edit=True)
|
||||
|
||||
```
|
||||
|
||||
```python
|
||||
from diffusers import UNet2DConditionModel
|
||||
|
||||
ckpt_path = "https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/blob/main/sd_xl_base_1.0_0.9vae.safetensors"
|
||||
model = UNet2DConditionModel.from_single_file(ckpt_path, upcast_attention=True)
|
||||
|
||||
```
|
||||
|
||||
In the example above, since we explicitly passed `repo_id="segmind/SSD-1B"`, it will use this [configuration file](https://huggingface.co/segmind/SSD-1B/blob/main/unet/config.json) from the "unet" subfolder in `"segmind/SSD-1B"` to configure the unet component included in the checkpoint; Similarly, it will use the `config.json` file from `"vae"` subfolder to configure the vae model, `config.json` file from text_encoder folder to configure text_encoder and so on.
|
||||
|
||||
Note that most of the time you do not need to explicitly a `config` argument, `from_single_file` will automatically map the checkpoint to a repo id (we will discuss this in more details in next section). However, this can be useful in cases where model components might have been changed from what was originally distributed or in cases where a checkpoint file might not have the necessary metadata to correctly determine the configuration to use for the pipeline.
|
||||
|
||||
<Tip>
|
||||
|
||||
@@ -24,14 +135,114 @@ To learn more about how to load single file weights, see the [Load different Sta
|
||||
|
||||
</Tip>
|
||||
|
||||
## Working with local files
|
||||
|
||||
As of `diffusers>=0.28.0` the `from_single_file` method will attempt to configure a pipeline or model by first inferring the model type from the checkpoint file and then using the model type to determine the appropriate model repo configuration to use from the Hugging Face Hub. For example, any single file checkpoint based on the Stable Diffusion XL base model will use the [`stabilityai/stable-diffusion-xl-base-1.0`](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) model repo to configure the pipeline.
|
||||
|
||||
If you are working in an environment with restricted internet access, it is recommended to download the config files and checkpoints for the model to your preferred directory and pass the local paths to the `pretrained_model_link_or_path` and `config` arguments of the `from_single_file` method.
|
||||
|
||||
```python
|
||||
from huggingface_hub import hf_hub_download, snapshot_download
|
||||
|
||||
my_local_checkpoint_path = hf_hub_download(
|
||||
repo_id="segmind/SSD-1B",
|
||||
filename="SSD-1B.safetensors"
|
||||
)
|
||||
|
||||
my_local_config_path = snapshot_download(
|
||||
repo_id="segmind/SSD-1B",
|
||||
allowed_patterns=["*.json", "**/*.json", "*.txt", "**/*.txt"]
|
||||
)
|
||||
|
||||
pipe = StableDiffusionXLPipeline.from_single_file(my_local_checkpoint_path, config=my_local_config_path, local_files_only=True)
|
||||
|
||||
```
|
||||
|
||||
By default this will download the checkpoints and config files to the [Hugging Face Hub cache directory](https://huggingface.co/docs/huggingface_hub/en/guides/manage-cache). You can also specify a local directory to download the files to by passing the `local_dir` argument to the `hf_hub_download` and `snapshot_download` functions.
|
||||
|
||||
```python
|
||||
from huggingface_hub import hf_hub_download, snapshot_download
|
||||
|
||||
my_local_checkpoint_path = hf_hub_download(
|
||||
repo_id="segmind/SSD-1B",
|
||||
filename="SSD-1B.safetensors"
|
||||
local_dir="my_local_checkpoints"
|
||||
)
|
||||
|
||||
my_local_config_path = snapshot_download(
|
||||
repo_id="segmind/SSD-1B",
|
||||
allowed_patterns=["*.json", "**/*.json", "*.txt", "**/*.txt"]
|
||||
local_dir="my_local_config"
|
||||
)
|
||||
|
||||
pipe = StableDiffusionXLPipeline.from_single_file(my_local_checkpoint_path, config=my_local_config_path, local_files_only=True)
|
||||
|
||||
```
|
||||
|
||||
## Working with local files on file systems that do not support symlinking
|
||||
|
||||
By default the `from_single_file` method relies on the `huggingface_hub` caching mechanism to fetch and store checkpoints and config files for models and pipelines. If you are working with a file system that does not support symlinking, it is recommended that you first download the checkpoint file to a local directory and disable symlinking by passing the `local_dir_use_symlink=False` argument to the `hf_hub_download` and `snapshot_download` functions.
|
||||
|
||||
```python
|
||||
from huggingface_hub import hf_hub_download, snapshot_download
|
||||
|
||||
my_local_checkpoint_path = hf_hub_download(
|
||||
repo_id="segmind/SSD-1B",
|
||||
filename="SSD-1B.safetensors"
|
||||
local_dir="my_local_checkpoints",
|
||||
local_dir_use_symlinks=False
|
||||
)
|
||||
print("My local checkpoint: ", my_local_checkpoint_path)
|
||||
|
||||
my_local_config_path = snapshot_download(
|
||||
repo_id="segmind/SSD-1B",
|
||||
allowed_patterns=["*.json", "**/*.json", "*.txt", "**/*.txt"]
|
||||
local_dir_use_symlinks=False,
|
||||
)
|
||||
print("My local config: ", my_local_config_path)
|
||||
|
||||
```
|
||||
|
||||
Then pass the local paths to the `pretrained_model_link_or_path` and `config` arguments of the `from_single_file` method.
|
||||
|
||||
```python
|
||||
pipe = StableDiffusionXLPipeline.from_single_file(my_local_checkpoint_path, config=my_local_config_path, local_files_only=True)
|
||||
|
||||
```
|
||||
|
||||
<Tip>
|
||||
Disabling symlinking means that the `huggingface_hub` caching mechanism has no way to determine whether a file has already been downloaded to the local directory. This means that the `hf_hub_download` and `snapshot_download` functions will download files to the local directory each time they are executed. If you are disabling symlinking, it is recommended that you separate the model download and loading steps to avoid downloading the same file multiple times.
|
||||
|
||||
</Tip>
|
||||
|
||||
## Using the original configuration file of a model
|
||||
|
||||
If you would like to configure the parameters of the model components in the pipeline using the orignal YAML configuration file, you can pass a local path or url to the original configuration file to the `original_config` argument of the `from_single_file` method.
|
||||
|
||||
```python
|
||||
from diffusers import StableDiffusionXLPipeline
|
||||
|
||||
ckpt_path = "https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/blob/main/sd_xl_base_1.0_0.9vae.safetensors"
|
||||
repo_id = "stabilityai/stable-diffusion-xl-base-1.0"
|
||||
original_config = "https://raw.githubusercontent.com/Stability-AI/generative-models/main/configs/inference/sd_xl_base.yaml"
|
||||
|
||||
pipe = StableDiffusionXLPipeline.from_single_file(ckpt_path, original_config=original_config)
|
||||
```
|
||||
|
||||
In the example above, the `original_config` file is only used to configure the parameters of the individual model components of the pipeline. For example it will be used to configure parameters such as the `in_channels` of the `vae` model and `unet` model. It is not used to determine the type of component objects in the pipeline.
|
||||
|
||||
|
||||
<Tip>
|
||||
When using `original_config` with local_files_only=True`, Diffusers will attempt to infer the components based on the type signatures of pipeline class, rather than attempting to fetch the pipeline config from the Hugging Face Hub. This is to prevent backwards breaking changes in existing code that might not be able to connect to the internet to fetch the necessary pipeline config files.
|
||||
|
||||
This is not as reliable as providing a path to a local config repo and might lead to errors when configuring the pipeline. To avoid this, please run the pipeline with `local_files_only=False` once to download the appropriate pipeline config files to the local cache.
|
||||
</Tip>
|
||||
|
||||
|
||||
## FromSingleFileMixin
|
||||
|
||||
[[autodoc]] loaders.single_file.FromSingleFileMixin
|
||||
|
||||
## FromOriginalVAEMixin
|
||||
## FromOriginalModelMixin
|
||||
|
||||
[[autodoc]] loaders.autoencoder.FromOriginalVAEMixin
|
||||
|
||||
## FromOriginalControlnetMixin
|
||||
|
||||
[[autodoc]] loaders.controlnet.FromOriginalControlNetMixin
|
||||
[[autodoc]] loaders.single_file_model.FromOriginalModelMixin
|
||||
|
||||
@@ -1,6 +1,18 @@
|
||||
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Consistency Decoder
|
||||
|
||||
Consistency decoder can be used to decode the latents from the denoising UNet in the [`StableDiffusionPipeline`]. This decoder was introduced in the [DALL-E 3 technical report](https://openai.com/dall-e-3).
|
||||
Consistency decoder can be used to decode the latents from the denoising UNet in the [`StableDiffusionPipeline`]. This decoder was introduced in the [DALL-E 3 technical report](https://openai.com/dall-e-3).
|
||||
|
||||
The original codebase can be found at [openai/consistencydecoder](https://github.com/openai/consistencydecoder).
|
||||
|
||||
|
||||
@@ -101,6 +101,53 @@ AnimateDiff tends to work better with finetuned Stable Diffusion models. If you
|
||||
|
||||
</Tip>
|
||||
|
||||
### AnimateDiffSDXLPipeline
|
||||
|
||||
AnimateDiff can also be used with SDXL models. This is currently an experimental feature as only a beta release of the motion adapter checkpoint is available.
|
||||
|
||||
```python
|
||||
import torch
|
||||
from diffusers.models import MotionAdapter
|
||||
from diffusers import AnimateDiffSDXLPipeline, DDIMScheduler
|
||||
from diffusers.utils import export_to_gif
|
||||
|
||||
adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-sdxl-beta", torch_dtype=torch.float16)
|
||||
|
||||
model_id = "stabilityai/stable-diffusion-xl-base-1.0"
|
||||
scheduler = DDIMScheduler.from_pretrained(
|
||||
model_id,
|
||||
subfolder="scheduler",
|
||||
clip_sample=False,
|
||||
timestep_spacing="linspace",
|
||||
beta_schedule="linear",
|
||||
steps_offset=1,
|
||||
)
|
||||
pipe = AnimateDiffSDXLPipeline.from_pretrained(
|
||||
model_id,
|
||||
motion_adapter=adapter,
|
||||
scheduler=scheduler,
|
||||
torch_dtype=torch.float16,
|
||||
variant="fp16",
|
||||
).to("cuda")
|
||||
|
||||
# enable memory savings
|
||||
pipe.enable_vae_slicing()
|
||||
pipe.enable_vae_tiling()
|
||||
|
||||
output = pipe(
|
||||
prompt="a panda surfing in the ocean, realistic, high quality",
|
||||
negative_prompt="low quality, worst quality",
|
||||
num_inference_steps=20,
|
||||
guidance_scale=8,
|
||||
width=1024,
|
||||
height=1024,
|
||||
num_frames=16,
|
||||
)
|
||||
|
||||
frames = output.frames[0]
|
||||
export_to_gif(frames, "animation.gif")
|
||||
```
|
||||
|
||||
### AnimateDiffVideoToVideoPipeline
|
||||
|
||||
AnimateDiff can also be used to generate visually similar videos or enable style/character/background or other edits starting from an initial video, allowing you to seamlessly explore creative possibilities.
|
||||
@@ -408,12 +455,126 @@ Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers)
|
||||
|
||||
</Tip>
|
||||
|
||||
<table>
|
||||
<tr>
|
||||
<th align=center>Without FreeInit enabled</th>
|
||||
<th align=center>With FreeInit enabled</th>
|
||||
</tr>
|
||||
<tr>
|
||||
<td align=center>
|
||||
panda playing a guitar
|
||||
<br />
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-no-freeinit.gif"
|
||||
alt="panda playing a guitar"
|
||||
style="width: 300px;" />
|
||||
</td>
|
||||
<td align=center>
|
||||
panda playing a guitar
|
||||
<br/>
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-freeinit.gif"
|
||||
alt="panda playing a guitar"
|
||||
style="width: 300px;" />
|
||||
</td>
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
## Using AnimateLCM
|
||||
|
||||
[AnimateLCM](https://animatelcm.github.io/) is a motion module checkpoint and an [LCM LoRA](https://huggingface.co/docs/diffusers/using-diffusers/inference_with_lcm_lora) that have been created using a consistency learning strategy that decouples the distillation of the image generation priors and the motion generation priors.
|
||||
|
||||
```python
|
||||
import torch
|
||||
from diffusers import AnimateDiffPipeline, LCMScheduler, MotionAdapter
|
||||
from diffusers.utils import export_to_gif
|
||||
|
||||
adapter = MotionAdapter.from_pretrained("wangfuyun/AnimateLCM")
|
||||
pipe = AnimateDiffPipeline.from_pretrained("emilianJR/epiCRealism", motion_adapter=adapter)
|
||||
pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config, beta_schedule="linear")
|
||||
|
||||
pipe.load_lora_weights("wangfuyun/AnimateLCM", weight_name="sd15_lora_beta.safetensors", adapter_name="lcm-lora")
|
||||
|
||||
pipe.enable_vae_slicing()
|
||||
pipe.enable_model_cpu_offload()
|
||||
|
||||
output = pipe(
|
||||
prompt="A space rocket with trails of smoke behind it launching into space from the desert, 4k, high resolution",
|
||||
negative_prompt="bad quality, worse quality, low resolution",
|
||||
num_frames=16,
|
||||
guidance_scale=1.5,
|
||||
num_inference_steps=6,
|
||||
generator=torch.Generator("cpu").manual_seed(0),
|
||||
)
|
||||
frames = output.frames[0]
|
||||
export_to_gif(frames, "animatelcm.gif")
|
||||
```
|
||||
|
||||
<table>
|
||||
<tr>
|
||||
<td><center>
|
||||
A space rocket, 4K.
|
||||
<br>
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatelcm-output.gif"
|
||||
alt="A space rocket, 4K"
|
||||
style="width: 300px;" />
|
||||
</center></td>
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
AnimateLCM is also compatible with existing [Motion LoRAs](https://huggingface.co/collections/dn6/animatediff-motion-loras-654cb8ad732b9e3cf4d3c17e).
|
||||
|
||||
```python
|
||||
import torch
|
||||
from diffusers import AnimateDiffPipeline, LCMScheduler, MotionAdapter
|
||||
from diffusers.utils import export_to_gif
|
||||
|
||||
adapter = MotionAdapter.from_pretrained("wangfuyun/AnimateLCM")
|
||||
pipe = AnimateDiffPipeline.from_pretrained("emilianJR/epiCRealism", motion_adapter=adapter)
|
||||
pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config, beta_schedule="linear")
|
||||
|
||||
pipe.load_lora_weights("wangfuyun/AnimateLCM", weight_name="sd15_lora_beta.safetensors", adapter_name="lcm-lora")
|
||||
pipe.load_lora_weights("guoyww/animatediff-motion-lora-tilt-up", adapter_name="tilt-up")
|
||||
|
||||
pipe.set_adapters(["lcm-lora", "tilt-up"], [1.0, 0.8])
|
||||
pipe.enable_vae_slicing()
|
||||
pipe.enable_model_cpu_offload()
|
||||
|
||||
output = pipe(
|
||||
prompt="A space rocket with trails of smoke behind it launching into space from the desert, 4k, high resolution",
|
||||
negative_prompt="bad quality, worse quality, low resolution",
|
||||
num_frames=16,
|
||||
guidance_scale=1.5,
|
||||
num_inference_steps=6,
|
||||
generator=torch.Generator("cpu").manual_seed(0),
|
||||
)
|
||||
frames = output.frames[0]
|
||||
export_to_gif(frames, "animatelcm-motion-lora.gif")
|
||||
```
|
||||
|
||||
<table>
|
||||
<tr>
|
||||
<td><center>
|
||||
A space rocket, 4K.
|
||||
<br>
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatelcm-motion-lora.gif"
|
||||
alt="A space rocket, 4K"
|
||||
style="width: 300px;" />
|
||||
</center></td>
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
|
||||
## AnimateDiffPipeline
|
||||
|
||||
[[autodoc]] AnimateDiffPipeline
|
||||
- all
|
||||
- __call__
|
||||
|
||||
## AnimateDiffSDXLPipeline
|
||||
|
||||
[[autodoc]] AnimateDiffSDXLPipeline
|
||||
- all
|
||||
- __call__
|
||||
|
||||
## AnimateDiffVideoToVideoPipeline
|
||||
|
||||
[[autodoc]] AnimateDiffVideoToVideoPipeline
|
||||
|
||||
@@ -20,7 +20,8 @@ The abstract of the paper is the following:
|
||||
|
||||
*Although audio generation shares commonalities across different types of audio, such as speech, music, and sound effects, designing models for each type requires careful consideration of specific objectives and biases that can significantly differ from those of other types. To bring us closer to a unified perspective of audio generation, this paper proposes a framework that utilizes the same learning method for speech, music, and sound effect generation. Our framework introduces a general representation of audio, called "language of audio" (LOA). Any audio can be translated into LOA based on AudioMAE, a self-supervised pre-trained representation learning model. In the generation process, we translate any modalities into LOA by using a GPT-2 model, and we perform self-supervised audio generation learning with a latent diffusion model conditioned on LOA. The proposed framework naturally brings advantages such as in-context learning abilities and reusable self-supervised pretrained AudioMAE and latent diffusion models. Experiments on the major benchmarks of text-to-audio, text-to-music, and text-to-speech demonstrate state-of-the-art or competitive performance against previous approaches. Our code, pretrained model, and demo are available at [this https URL](https://audioldm.github.io/audioldm2).*
|
||||
|
||||
This pipeline was contributed by [sanchit-gandhi](https://huggingface.co/sanchit-gandhi). The original codebase can be found at [haoheliu/audioldm2](https://github.com/haoheliu/audioldm2).
|
||||
This pipeline was contributed by [sanchit-gandhi](https://huggingface.co/sanchit-gandhi) and [Nguyễn Công Tú Anh](https://github.com/tuanh123789). The original codebase can be
|
||||
found at [haoheliu/audioldm2](https://github.com/haoheliu/audioldm2).
|
||||
|
||||
## Tips
|
||||
|
||||
@@ -36,6 +37,8 @@ See table below for details on the three checkpoints:
|
||||
| [audioldm2](https://huggingface.co/cvssp/audioldm2) | Text-to-audio | 350M | 1.1B | 1150k |
|
||||
| [audioldm2-large](https://huggingface.co/cvssp/audioldm2-large) | Text-to-audio | 750M | 1.5B | 1150k |
|
||||
| [audioldm2-music](https://huggingface.co/cvssp/audioldm2-music) | Text-to-music | 350M | 1.1B | 665k |
|
||||
| [audioldm2-gigaspeech](https://huggingface.co/anhnct/audioldm2_gigaspeech) | Text-to-speech | 350M | 1.1B |10k |
|
||||
| [audioldm2-ljspeech](https://huggingface.co/anhnct/audioldm2_ljspeech) | Text-to-speech | 350M | 1.1B | |
|
||||
|
||||
### Constructing a prompt
|
||||
|
||||
@@ -53,7 +56,7 @@ See table below for details on the three checkpoints:
|
||||
* The quality of the generated waveforms can vary significantly based on the seed. Try generating with different seeds until you find a satisfactory generation.
|
||||
* Multiple waveforms can be generated in one go: set `num_waveforms_per_prompt` to a value greater than 1. Automatic scoring will be performed between the generated waveforms and prompt text, and the audios ranked from best to worst accordingly.
|
||||
|
||||
The following example demonstrates how to construct good music generation using the aforementioned tips: [example](https://huggingface.co/docs/diffusers/main/en/api/pipelines/audioldm2#diffusers.AudioLDM2Pipeline.__call__.example).
|
||||
The following example demonstrates how to construct good music and speech generation using the aforementioned tips: [example](https://huggingface.co/docs/diffusers/main/en/api/pipelines/audioldm2#diffusers.AudioLDM2Pipeline.__call__.example).
|
||||
|
||||
<Tip>
|
||||
|
||||
|
||||
@@ -12,42 +12,10 @@ specific language governing permissions and limitations under the License.
|
||||
|
||||
# AutoPipeline
|
||||
|
||||
`AutoPipeline` is designed to:
|
||||
|
||||
1. make it easy for you to load a checkpoint for a task without knowing the specific pipeline class to use
|
||||
2. use multiple pipelines in your workflow
|
||||
|
||||
Based on the task, the `AutoPipeline` class automatically retrieves the relevant pipeline given the name or path to the pretrained weights with the `from_pretrained()` method.
|
||||
|
||||
To seamlessly switch between tasks with the same checkpoint without reallocating additional memory, use the `from_pipe()` method to transfer the components from the original pipeline to the new one.
|
||||
|
||||
```py
|
||||
from diffusers import AutoPipelineForText2Image
|
||||
import torch
|
||||
|
||||
pipeline = AutoPipelineForText2Image.from_pretrained(
|
||||
"runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True
|
||||
).to("cuda")
|
||||
prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
|
||||
|
||||
image = pipeline(prompt, num_inference_steps=25).images[0]
|
||||
```
|
||||
|
||||
<Tip>
|
||||
|
||||
Check out the [AutoPipeline](../../tutorials/autopipeline) tutorial to learn how to use this API!
|
||||
|
||||
</Tip>
|
||||
|
||||
`AutoPipeline` supports text-to-image, image-to-image, and inpainting for the following diffusion models:
|
||||
|
||||
- [Stable Diffusion](./stable_diffusion/overview)
|
||||
- [ControlNet](./controlnet)
|
||||
- [Stable Diffusion XL (SDXL)](./stable_diffusion/stable_diffusion_xl)
|
||||
- [DeepFloyd IF](./deepfloyd_if)
|
||||
- [Kandinsky 2.1](./kandinsky)
|
||||
- [Kandinsky 2.2](./kandinsky_v22)
|
||||
The `AutoPipeline` is designed to make it easy to load a checkpoint for a task without needing to know the specific pipeline class. Based on the task, the `AutoPipeline` automatically retrieves the correct pipeline class from the checkpoint `model_index.json` file.
|
||||
|
||||
> [!TIP]
|
||||
> Check out the [AutoPipeline](../../tutorials/autopipeline) tutorial to learn how to use this API!
|
||||
|
||||
## AutoPipelineForText2Image
|
||||
|
||||
|
||||
@@ -1,3 +1,15 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# ControlNet-XS
|
||||
|
||||
ControlNet-XS was introduced in [ControlNet-XS](https://vislearn.github.io/ControlNet-XS/) by Denis Zavadski and Carsten Rother. It is based on the observation that the control model in the [original ControlNet](https://huggingface.co/papers/2302.05543) can be made much smaller and still produce good results.
|
||||
@@ -12,5 +24,16 @@ Here's the overview from the [project page](https://vislearn.github.io/ControlNe
|
||||
|
||||
This model was contributed by [UmerHA](https://twitter.com/UmerHAdil). ❤️
|
||||
|
||||
<Tip>
|
||||
|
||||
> 🧠 Make sure to check out the Schedulers [guide](https://huggingface.co/docs/diffusers/main/en/using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](https://huggingface.co/docs/diffusers/main/en/using-diffusers/loading#reuse-components-across-pipelines) section to learn how to efficiently load the same components into multiple pipelines.
|
||||
Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-components-across-pipelines) section to learn how to efficiently load the same components into multiple pipelines.
|
||||
|
||||
</Tip>
|
||||
|
||||
## StableDiffusionControlNetXSPipeline
|
||||
[[autodoc]] StableDiffusionControlNetXSPipeline
|
||||
- all
|
||||
- __call__
|
||||
|
||||
## StableDiffusionPipelineOutput
|
||||
[[autodoc]] pipelines.stable_diffusion.StableDiffusionPipelineOutput
|
||||
@@ -1,3 +1,15 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# ControlNet-XS with Stable Diffusion XL
|
||||
|
||||
ControlNet-XS was introduced in [ControlNet-XS](https://vislearn.github.io/ControlNet-XS/) by Denis Zavadski and Carsten Rother. It is based on the observation that the control model in the [original ControlNet](https://huggingface.co/papers/2302.05543) can be made much smaller and still produce good results.
|
||||
@@ -12,4 +24,22 @@ Here's the overview from the [project page](https://vislearn.github.io/ControlNe
|
||||
|
||||
This model was contributed by [UmerHA](https://twitter.com/UmerHAdil). ❤️
|
||||
|
||||
> 🧠 Make sure to check out the Schedulers [guide](https://huggingface.co/docs/diffusers/main/en/using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](https://huggingface.co/docs/diffusers/main/en/using-diffusers/loading#reuse-components-across-pipelines) section to learn how to efficiently load the same components into multiple pipelines.
|
||||
<Tip warning={true}>
|
||||
|
||||
🧪 Many of the SDXL ControlNet checkpoints are experimental, and there is a lot of room for improvement. Feel free to open an [Issue](https://github.com/huggingface/diffusers/issues/new/choose) and leave us feedback on how we can improve!
|
||||
|
||||
</Tip>
|
||||
|
||||
<Tip>
|
||||
|
||||
Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-components-across-pipelines) section to learn how to efficiently load the same components into multiple pipelines.
|
||||
|
||||
</Tip>
|
||||
|
||||
## StableDiffusionXLControlNetXSPipeline
|
||||
[[autodoc]] StableDiffusionXLControlNetXSPipeline
|
||||
- all
|
||||
- __call__
|
||||
|
||||
## StableDiffusionPipelineOutput
|
||||
[[autodoc]] pipelines.stable_diffusion.StableDiffusionPipelineOutput
|
||||
54
docs/source/en/api/pipelines/ledits_pp.md
Normal file
54
docs/source/en/api/pipelines/ledits_pp.md
Normal file
@@ -0,0 +1,54 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# LEDITS++
|
||||
|
||||
LEDITS++ was proposed in [LEDITS++: Limitless Image Editing using Text-to-Image Models](https://huggingface.co/papers/2311.16711) by Manuel Brack, Felix Friedrich, Katharina Kornmeier, Linoy Tsaban, Patrick Schramowski, Kristian Kersting, Apolinário Passos.
|
||||
|
||||
The abstract from the paper is:
|
||||
|
||||
*Text-to-image diffusion models have recently received increasing interest for their astonishing ability to produce high-fidelity images from solely text inputs. Subsequent research efforts aim to exploit and apply their capabilities to real image editing. However, existing image-to-image methods are often inefficient, imprecise, and of limited versatility. They either require time-consuming fine-tuning, deviate unnecessarily strongly from the input image, and/or lack support for multiple, simultaneous edits. To address these issues, we introduce LEDITS++, an efficient yet versatile and precise textual image manipulation technique. LEDITS++'s novel inversion approach requires no tuning nor optimization and produces high-fidelity results with a few diffusion steps. Second, our methodology supports multiple simultaneous edits and is architecture-agnostic. Third, we use a novel implicit masking technique that limits changes to relevant image regions. We propose the novel TEdBench++ benchmark as part of our exhaustive evaluation. Our results demonstrate the capabilities of LEDITS++ and its improvements over previous methods. The project page is available at https://leditsplusplus-project.static.hf.space .*
|
||||
|
||||
<Tip>
|
||||
|
||||
You can find additional information about LEDITS++ on the [project page](https://leditsplusplus-project.static.hf.space/index.html) and try it out in a [demo](https://huggingface.co/spaces/editing-images/leditsplusplus).
|
||||
|
||||
</Tip>
|
||||
|
||||
<Tip warning={true}>
|
||||
Due to some backward compatability issues with the current diffusers implementation of [`~schedulers.DPMSolverMultistepScheduler`] this implementation of LEdits++ can no longer guarantee perfect inversion.
|
||||
This issue is unlikely to have any noticeable effects on applied use-cases. However, we provide an alternative implementation that guarantees perfect inversion in a dedicated [GitHub repo](https://github.com/ml-research/ledits_pp).
|
||||
</Tip>
|
||||
|
||||
We provide two distinct pipelines based on different pre-trained models.
|
||||
|
||||
## LEditsPPPipelineStableDiffusion
|
||||
[[autodoc]] pipelines.ledits_pp.LEditsPPPipelineStableDiffusion
|
||||
- all
|
||||
- __call__
|
||||
- invert
|
||||
|
||||
## LEditsPPPipelineStableDiffusionXL
|
||||
[[autodoc]] pipelines.ledits_pp.LEditsPPPipelineStableDiffusionXL
|
||||
- all
|
||||
- __call__
|
||||
- invert
|
||||
|
||||
|
||||
|
||||
## LEditsPPDiffusionPipelineOutput
|
||||
[[autodoc]] pipelines.ledits_pp.pipeline_output.LEditsPPDiffusionPipelineOutput
|
||||
- all
|
||||
|
||||
## LEditsPPInversionPipelineOutput
|
||||
[[autodoc]] pipelines.ledits_pp.pipeline_output.LEditsPPInversionPipelineOutput
|
||||
- all
|
||||
@@ -57,6 +57,7 @@ The table below lists all the pipelines currently available in 🤗 Diffusers an
|
||||
| [Latent Consistency Models](latent_consistency_models) | text2image |
|
||||
| [Latent Diffusion](latent_diffusion) | text2image, super-resolution |
|
||||
| [LDM3D](stable_diffusion/ldm3d_diffusion) | text2image, text-to-3D, text-to-pano, upscaling |
|
||||
| [LEDITS++](ledits_pp) | image editing |
|
||||
| [MultiDiffusion](panorama) | text2image |
|
||||
| [MusicLDM](musicldm) | text2audio |
|
||||
| [Paint by Example](paint_by_example) | inpainting |
|
||||
@@ -96,6 +97,11 @@ The table below lists all the pipelines currently available in 🤗 Diffusers an
|
||||
- to
|
||||
- components
|
||||
|
||||
|
||||
[[autodoc]] pipelines.StableDiffusionMixin.enable_freeu
|
||||
|
||||
[[autodoc]] pipelines.StableDiffusionMixin.disable_freeu
|
||||
|
||||
## FlaxDiffusionPipeline
|
||||
|
||||
[[autodoc]] pipelines.pipeline_flax_utils.FlaxDiffusionPipeline
|
||||
|
||||
@@ -30,6 +30,6 @@ Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers)
|
||||
- all
|
||||
- __call__
|
||||
|
||||
## StableDiffusionSafePipelineOutput
|
||||
## SemanticStableDiffusionPipelineOutput
|
||||
[[autodoc]] pipelines.semantic_stable_diffusion.pipeline_output.SemanticStableDiffusionPipelineOutput
|
||||
- all
|
||||
|
||||
229
docs/source/en/api/pipelines/stable_cascade.md
Normal file
229
docs/source/en/api/pipelines/stable_cascade.md
Normal file
@@ -0,0 +1,229 @@
|
||||
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Stable Cascade
|
||||
|
||||
This model is built upon the [Würstchen](https://openreview.net/forum?id=gU58d5QeGv) architecture and its main
|
||||
difference to other models like Stable Diffusion is that it is working at a much smaller latent space. Why is this
|
||||
important? The smaller the latent space, the **faster** you can run inference and the **cheaper** the training becomes.
|
||||
How small is the latent space? Stable Diffusion uses a compression factor of 8, resulting in a 1024x1024 image being
|
||||
encoded to 128x128. Stable Cascade achieves a compression factor of 42, meaning that it is possible to encode a
|
||||
1024x1024 image to 24x24, while maintaining crisp reconstructions. The text-conditional model is then trained in the
|
||||
highly compressed latent space. Previous versions of this architecture, achieved a 16x cost reduction over Stable
|
||||
Diffusion 1.5.
|
||||
|
||||
Therefore, this kind of model is well suited for usages where efficiency is important. Furthermore, all known extensions
|
||||
like finetuning, LoRA, ControlNet, IP-Adapter, LCM etc. are possible with this method as well.
|
||||
|
||||
The original codebase can be found at [Stability-AI/StableCascade](https://github.com/Stability-AI/StableCascade).
|
||||
|
||||
## Model Overview
|
||||
Stable Cascade consists of three models: Stage A, Stage B and Stage C, representing a cascade to generate images,
|
||||
hence the name "Stable Cascade".
|
||||
|
||||
Stage A & B are used to compress images, similar to what the job of the VAE is in Stable Diffusion.
|
||||
However, with this setup, a much higher compression of images can be achieved. While the Stable Diffusion models use a
|
||||
spatial compression factor of 8, encoding an image with resolution of 1024 x 1024 to 128 x 128, Stable Cascade achieves
|
||||
a compression factor of 42. This encodes a 1024 x 1024 image to 24 x 24, while being able to accurately decode the
|
||||
image. This comes with the great benefit of cheaper training and inference. Furthermore, Stage C is responsible
|
||||
for generating the small 24 x 24 latents given a text prompt.
|
||||
|
||||
The Stage C model operates on the small 24 x 24 latents and denoises the latents conditioned on text prompts. The model is also the largest component in the Cascade pipeline and is meant to be used with the `StableCascadePriorPipeline`
|
||||
|
||||
The Stage B and Stage A models are used with the `StableCascadeDecoderPipeline` and are responsible for generating the final image given the small 24 x 24 latents.
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
There are some restrictions on data types that can be used with the Stable Cascade models. The official checkpoints for the `StableCascadePriorPipeline` do not support the `torch.float16` data type. Please use `torch.bfloat16` instead.
|
||||
|
||||
In order to use the `torch.bfloat16` data type with the `StableCascadeDecoderPipeline` you need to have PyTorch 2.2.0 or higher installed. This also means that using the `StableCascadeCombinedPipeline` with `torch.bfloat16` requires PyTorch 2.2.0 or higher, since it calls the `StableCascadeDecoderPipeline` internally.
|
||||
|
||||
If it is not possible to install PyTorch 2.2.0 or higher in your environment, the `StableCascadeDecoderPipeline` can be used on its own with the `torch.float16` data type. You can download the full precision or `bf16` variant weights for the pipeline and cast the weights to `torch.float16`.
|
||||
|
||||
</Tip>
|
||||
|
||||
## Usage example
|
||||
|
||||
```python
|
||||
import torch
|
||||
from diffusers import StableCascadeDecoderPipeline, StableCascadePriorPipeline
|
||||
|
||||
prompt = "an image of a shiba inu, donning a spacesuit and helmet"
|
||||
negative_prompt = ""
|
||||
|
||||
prior = StableCascadePriorPipeline.from_pretrained("stabilityai/stable-cascade-prior", variant="bf16", torch_dtype=torch.bfloat16)
|
||||
decoder = StableCascadeDecoderPipeline.from_pretrained("stabilityai/stable-cascade", variant="bf16", torch_dtype=torch.float16)
|
||||
|
||||
prior.enable_model_cpu_offload()
|
||||
prior_output = prior(
|
||||
prompt=prompt,
|
||||
height=1024,
|
||||
width=1024,
|
||||
negative_prompt=negative_prompt,
|
||||
guidance_scale=4.0,
|
||||
num_images_per_prompt=1,
|
||||
num_inference_steps=20
|
||||
)
|
||||
|
||||
decoder.enable_model_cpu_offload()
|
||||
decoder_output = decoder(
|
||||
image_embeddings=prior_output.image_embeddings.to(torch.float16),
|
||||
prompt=prompt,
|
||||
negative_prompt=negative_prompt,
|
||||
guidance_scale=0.0,
|
||||
output_type="pil",
|
||||
num_inference_steps=10
|
||||
).images[0]
|
||||
decoder_output.save("cascade.png")
|
||||
```
|
||||
|
||||
## Using the Lite Versions of the Stage B and Stage C models
|
||||
|
||||
```python
|
||||
import torch
|
||||
from diffusers import (
|
||||
StableCascadeDecoderPipeline,
|
||||
StableCascadePriorPipeline,
|
||||
StableCascadeUNet,
|
||||
)
|
||||
|
||||
prompt = "an image of a shiba inu, donning a spacesuit and helmet"
|
||||
negative_prompt = ""
|
||||
|
||||
prior_unet = StableCascadeUNet.from_pretrained("stabilityai/stable-cascade-prior", subfolder="prior_lite")
|
||||
decoder_unet = StableCascadeUNet.from_pretrained("stabilityai/stable-cascade", subfolder="decoder_lite")
|
||||
|
||||
prior = StableCascadePriorPipeline.from_pretrained("stabilityai/stable-cascade-prior", prior=prior_unet)
|
||||
decoder = StableCascadeDecoderPipeline.from_pretrained("stabilityai/stable-cascade", decoder=decoder_unet)
|
||||
|
||||
prior.enable_model_cpu_offload()
|
||||
prior_output = prior(
|
||||
prompt=prompt,
|
||||
height=1024,
|
||||
width=1024,
|
||||
negative_prompt=negative_prompt,
|
||||
guidance_scale=4.0,
|
||||
num_images_per_prompt=1,
|
||||
num_inference_steps=20
|
||||
)
|
||||
|
||||
decoder.enable_model_cpu_offload()
|
||||
decoder_output = decoder(
|
||||
image_embeddings=prior_output.image_embeddings,
|
||||
prompt=prompt,
|
||||
negative_prompt=negative_prompt,
|
||||
guidance_scale=0.0,
|
||||
output_type="pil",
|
||||
num_inference_steps=10
|
||||
).images[0]
|
||||
decoder_output.save("cascade.png")
|
||||
```
|
||||
|
||||
## Loading original checkpoints with `from_single_file`
|
||||
|
||||
Loading the original format checkpoints is supported via `from_single_file` method in the StableCascadeUNet.
|
||||
|
||||
```python
|
||||
import torch
|
||||
from diffusers import (
|
||||
StableCascadeDecoderPipeline,
|
||||
StableCascadePriorPipeline,
|
||||
StableCascadeUNet,
|
||||
)
|
||||
|
||||
prompt = "an image of a shiba inu, donning a spacesuit and helmet"
|
||||
negative_prompt = ""
|
||||
|
||||
prior_unet = StableCascadeUNet.from_single_file(
|
||||
"https://huggingface.co/stabilityai/stable-cascade/resolve/main/stage_c_bf16.safetensors",
|
||||
torch_dtype=torch.bfloat16
|
||||
)
|
||||
decoder_unet = StableCascadeUNet.from_single_file(
|
||||
"https://huggingface.co/stabilityai/stable-cascade/blob/main/stage_b_bf16.safetensors",
|
||||
torch_dtype=torch.bfloat16
|
||||
)
|
||||
|
||||
prior = StableCascadePriorPipeline.from_pretrained("stabilityai/stable-cascade-prior", prior=prior_unet, torch_dtype=torch.bfloat16)
|
||||
decoder = StableCascadeDecoderPipeline.from_pretrained("stabilityai/stable-cascade", decoder=decoder_unet, torch_dtype=torch.bfloat16)
|
||||
|
||||
prior.enable_model_cpu_offload()
|
||||
prior_output = prior(
|
||||
prompt=prompt,
|
||||
height=1024,
|
||||
width=1024,
|
||||
negative_prompt=negative_prompt,
|
||||
guidance_scale=4.0,
|
||||
num_images_per_prompt=1,
|
||||
num_inference_steps=20
|
||||
)
|
||||
|
||||
decoder.enable_model_cpu_offload()
|
||||
decoder_output = decoder(
|
||||
image_embeddings=prior_output.image_embeddings,
|
||||
prompt=prompt,
|
||||
negative_prompt=negative_prompt,
|
||||
guidance_scale=0.0,
|
||||
output_type="pil",
|
||||
num_inference_steps=10
|
||||
).images[0]
|
||||
decoder_output.save("cascade-single-file.png")
|
||||
```
|
||||
|
||||
## Uses
|
||||
|
||||
### Direct Use
|
||||
|
||||
The model is intended for research purposes for now. Possible research areas and tasks include
|
||||
|
||||
- Research on generative models.
|
||||
- Safe deployment of models which have the potential to generate harmful content.
|
||||
- Probing and understanding the limitations and biases of generative models.
|
||||
- Generation of artworks and use in design and other artistic processes.
|
||||
- Applications in educational or creative tools.
|
||||
|
||||
Excluded uses are described below.
|
||||
|
||||
### Out-of-Scope Use
|
||||
|
||||
The model was not trained to be factual or true representations of people or events,
|
||||
and therefore using the model to generate such content is out-of-scope for the abilities of this model.
|
||||
The model should not be used in any way that violates Stability AI's [Acceptable Use Policy](https://stability.ai/use-policy).
|
||||
|
||||
## Limitations and Bias
|
||||
|
||||
### Limitations
|
||||
- Faces and people in general may not be generated properly.
|
||||
- The autoencoding part of the model is lossy.
|
||||
|
||||
|
||||
## StableCascadeCombinedPipeline
|
||||
|
||||
[[autodoc]] StableCascadeCombinedPipeline
|
||||
- all
|
||||
- __call__
|
||||
|
||||
## StableCascadePriorPipeline
|
||||
|
||||
[[autodoc]] StableCascadePriorPipeline
|
||||
- all
|
||||
- __call__
|
||||
|
||||
## StableCascadePriorPipelineOutput
|
||||
|
||||
[[autodoc]] pipelines.stable_cascade.pipeline_stable_cascade_prior.StableCascadePriorPipelineOutput
|
||||
|
||||
## StableCascadeDecoderPipeline
|
||||
|
||||
[[autodoc]] StableCascadeDecoderPipeline
|
||||
- all
|
||||
- __call__
|
||||
|
||||
@@ -10,9 +10,7 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Text-to-Image Generation with Adapter Conditioning
|
||||
|
||||
## Overview
|
||||
# T2I-Adapter
|
||||
|
||||
[T2I-Adapter: Learning Adapters to Dig out More Controllable Ability for Text-to-Image Diffusion Models](https://arxiv.org/abs/2302.08453) by Chong Mou, Xintao Wang, Liangbin Xie, Jian Zhang, Zhongang Qi, Ying Shan, Xiaohu Qie.
|
||||
|
||||
@@ -24,236 +22,26 @@ The abstract of the paper is the following:
|
||||
|
||||
This model was contributed by the community contributor [HimariO](https://github.com/HimariO) ❤️ .
|
||||
|
||||
## Available Pipelines:
|
||||
|
||||
| Pipeline | Tasks | Demo
|
||||
|---|---|:---:|
|
||||
| [StableDiffusionAdapterPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py) | *Text-to-Image Generation with T2I-Adapter Conditioning* | -
|
||||
| [StableDiffusionXLAdapterPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py) | *Text-to-Image Generation with T2I-Adapter Conditioning on StableDiffusion-XL* | -
|
||||
|
||||
## Usage example with the base model of StableDiffusion-1.4/1.5
|
||||
|
||||
In the following we give a simple example of how to use a *T2I-Adapter* checkpoint with Diffusers for inference based on StableDiffusion-1.4/1.5.
|
||||
All adapters use the same pipeline.
|
||||
|
||||
1. Images are first converted into the appropriate *control image* format.
|
||||
2. The *control image* and *prompt* are passed to the [`StableDiffusionAdapterPipeline`].
|
||||
|
||||
Let's have a look at a simple example using the [Color Adapter](https://huggingface.co/TencentARC/t2iadapter_color_sd14v1).
|
||||
|
||||
```python
|
||||
from diffusers.utils import load_image, make_image_grid
|
||||
|
||||
image = load_image("https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/color_ref.png")
|
||||
```
|
||||
|
||||

|
||||
|
||||
|
||||
Then we can create our color palette by simply resizing it to 8 by 8 pixels and then scaling it back to original size.
|
||||
|
||||
```python
|
||||
from PIL import Image
|
||||
|
||||
color_palette = image.resize((8, 8))
|
||||
color_palette = color_palette.resize((512, 512), resample=Image.Resampling.NEAREST)
|
||||
```
|
||||
|
||||
Let's take a look at the processed image.
|
||||
|
||||

|
||||
|
||||
|
||||
Next, create the adapter pipeline
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import StableDiffusionAdapterPipeline, T2IAdapter
|
||||
|
||||
adapter = T2IAdapter.from_pretrained("TencentARC/t2iadapter_color_sd14v1", torch_dtype=torch.float16)
|
||||
pipe = StableDiffusionAdapterPipeline.from_pretrained(
|
||||
"CompVis/stable-diffusion-v1-4",
|
||||
adapter=adapter,
|
||||
torch_dtype=torch.float16,
|
||||
)
|
||||
pipe.to("cuda")
|
||||
```
|
||||
|
||||
Finally, pass the prompt and control image to the pipeline
|
||||
|
||||
```py
|
||||
# fix the random seed, so you will get the same result as the example
|
||||
generator = torch.Generator("cuda").manual_seed(7)
|
||||
|
||||
out_image = pipe(
|
||||
"At night, glowing cubes in front of the beach",
|
||||
image=color_palette,
|
||||
generator=generator,
|
||||
).images[0]
|
||||
make_image_grid([image, color_palette, out_image], rows=1, cols=3)
|
||||
```
|
||||
|
||||

|
||||
|
||||
## Usage example with the base model of StableDiffusion-XL
|
||||
|
||||
In the following we give a simple example of how to use a *T2I-Adapter* checkpoint with Diffusers for inference based on StableDiffusion-XL.
|
||||
All adapters use the same pipeline.
|
||||
|
||||
1. Images are first downloaded into the appropriate *control image* format.
|
||||
2. The *control image* and *prompt* are passed to the [`StableDiffusionXLAdapterPipeline`].
|
||||
|
||||
Let's have a look at a simple example using the [Sketch Adapter](https://huggingface.co/Adapter/t2iadapter/tree/main/sketch_sdxl_1.0).
|
||||
|
||||
```python
|
||||
from diffusers.utils import load_image, make_image_grid
|
||||
|
||||
sketch_image = load_image("https://huggingface.co/Adapter/t2iadapter/resolve/main/sketch.png").convert("L")
|
||||
```
|
||||
|
||||

|
||||
|
||||
Then, create the adapter pipeline
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import (
|
||||
T2IAdapter,
|
||||
StableDiffusionXLAdapterPipeline,
|
||||
DDPMScheduler
|
||||
)
|
||||
|
||||
model_id = "stabilityai/stable-diffusion-xl-base-1.0"
|
||||
adapter = T2IAdapter.from_pretrained("Adapter/t2iadapter", subfolder="sketch_sdxl_1.0", torch_dtype=torch.float16, adapter_type="full_adapter_xl")
|
||||
scheduler = DDPMScheduler.from_pretrained(model_id, subfolder="scheduler")
|
||||
|
||||
pipe = StableDiffusionXLAdapterPipeline.from_pretrained(
|
||||
model_id, adapter=adapter, safety_checker=None, torch_dtype=torch.float16, variant="fp16", scheduler=scheduler
|
||||
)
|
||||
|
||||
pipe.to("cuda")
|
||||
```
|
||||
|
||||
Finally, pass the prompt and control image to the pipeline
|
||||
|
||||
```py
|
||||
# fix the random seed, so you will get the same result as the example
|
||||
generator = torch.Generator().manual_seed(42)
|
||||
|
||||
sketch_image_out = pipe(
|
||||
prompt="a photo of a dog in real world, high quality",
|
||||
negative_prompt="extra digit, fewer digits, cropped, worst quality, low quality",
|
||||
image=sketch_image,
|
||||
generator=generator,
|
||||
guidance_scale=7.5
|
||||
).images[0]
|
||||
make_image_grid([sketch_image, sketch_image_out], rows=1, cols=2)
|
||||
```
|
||||
|
||||

|
||||
|
||||
## Available checkpoints
|
||||
|
||||
Non-diffusers checkpoints can be found under [TencentARC/T2I-Adapter](https://huggingface.co/TencentARC/T2I-Adapter/tree/main/models).
|
||||
|
||||
### T2I-Adapter with Stable Diffusion 1.4
|
||||
|
||||
| Model Name | Control Image Overview| Control Image Example | Generated Image Example |
|
||||
|---|---|---|---|
|
||||
|[TencentARC/t2iadapter_color_sd14v1](https://huggingface.co/TencentARC/t2iadapter_color_sd14v1)<br/> *Trained with spatial color palette* | An image with 8x8 color palette.|<a href="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/color_sample_input.png"><img width="64" style="margin:0;padding:0;" src="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/color_sample_input.png"/></a>|<a href="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/color_sample_output.png"><img width="64" src="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/color_sample_output.png"/></a>|
|
||||
|[TencentARC/t2iadapter_canny_sd14v1](https://huggingface.co/TencentARC/t2iadapter_canny_sd14v1)<br/> *Trained with canny edge detection* | A monochrome image with white edges on a black background.|<a href="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/canny_sample_input.png"><img width="64" style="margin:0;padding:0;" src="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/canny_sample_input.png"/></a>|<a href="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/canny_sample_output.png"><img width="64" src="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/canny_sample_output.png"/></a>|
|
||||
|[TencentARC/t2iadapter_sketch_sd14v1](https://huggingface.co/TencentARC/t2iadapter_sketch_sd14v1)<br/> *Trained with [PidiNet](https://github.com/zhuoinoulu/pidinet) edge detection* | A hand-drawn monochrome image with white outlines on a black background.|<a href="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/sketch_sample_input.png"><img width="64" style="margin:0;padding:0;" src="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/sketch_sample_input.png"/></a>|<a href="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/sketch_sample_output.png"><img width="64" src="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/sketch_sample_output.png"/></a>|
|
||||
|[TencentARC/t2iadapter_depth_sd14v1](https://huggingface.co/TencentARC/t2iadapter_depth_sd14v1)<br/> *Trained with Midas depth estimation* | A grayscale image with black representing deep areas and white representing shallow areas.|<a href="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/depth_sample_input.png"><img width="64" src="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/depth_sample_input.png"/></a>|<a href="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/depth_sample_output.png"><img width="64" src="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/depth_sample_output.png"/></a>|
|
||||
|[TencentARC/t2iadapter_openpose_sd14v1](https://huggingface.co/TencentARC/t2iadapter_openpose_sd14v1)<br/> *Trained with OpenPose bone image* | A [OpenPose bone](https://github.com/CMU-Perceptual-Computing-Lab/openpose) image.|<a href="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/openpose_sample_input.png"><img width="64" src="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/openpose_sample_input.png"/></a>|<a href="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/openpose_sample_output.png"><img width="64" src="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/openpose_sample_output.png"/></a>|
|
||||
|[TencentARC/t2iadapter_keypose_sd14v1](https://huggingface.co/TencentARC/t2iadapter_keypose_sd14v1)<br/> *Trained with mmpose skeleton image* | A [mmpose skeleton](https://github.com/open-mmlab/mmpose) image.|<a href="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/keypose_sample_input.png"><img width="64" src="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/keypose_sample_input.png"/></a>|<a href="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/keypose_sample_output.png"><img width="64" src="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/keypose_sample_output.png"/></a>|
|
||||
|[TencentARC/t2iadapter_seg_sd14v1](https://huggingface.co/TencentARC/t2iadapter_seg_sd14v1)<br/>*Trained with semantic segmentation* | An [custom](https://github.com/TencentARC/T2I-Adapter/discussions/25) segmentation protocol image.|<a href="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/seg_sample_input.png"><img width="64" src="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/seg_sample_input.png"/></a>|<a href="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/seg_sample_output.png"><img width="64" src="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/seg_sample_output.png"/></a> |
|
||||
|[TencentARC/t2iadapter_canny_sd15v2](https://huggingface.co/TencentARC/t2iadapter_canny_sd15v2)||
|
||||
|[TencentARC/t2iadapter_depth_sd15v2](https://huggingface.co/TencentARC/t2iadapter_depth_sd15v2)||
|
||||
|[TencentARC/t2iadapter_sketch_sd15v2](https://huggingface.co/TencentARC/t2iadapter_sketch_sd15v2)||
|
||||
|[TencentARC/t2iadapter_zoedepth_sd15v1](https://huggingface.co/TencentARC/t2iadapter_zoedepth_sd15v1)||
|
||||
|[Adapter/t2iadapter, subfolder='sketch_sdxl_1.0'](https://huggingface.co/Adapter/t2iadapter/tree/main/sketch_sdxl_1.0)||
|
||||
|[Adapter/t2iadapter, subfolder='canny_sdxl_1.0'](https://huggingface.co/Adapter/t2iadapter/tree/main/canny_sdxl_1.0)||
|
||||
|[Adapter/t2iadapter, subfolder='openpose_sdxl_1.0'](https://huggingface.co/Adapter/t2iadapter/tree/main/openpose_sdxl_1.0)||
|
||||
|
||||
## Combining multiple adapters
|
||||
|
||||
[`MultiAdapter`] can be used for applying multiple conditionings at once.
|
||||
|
||||
Here we use the keypose adapter for the character posture and the depth adapter for creating the scene.
|
||||
|
||||
```py
|
||||
from diffusers.utils import load_image, make_image_grid
|
||||
|
||||
cond_keypose = load_image(
|
||||
"https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/keypose_sample_input.png"
|
||||
)
|
||||
cond_depth = load_image(
|
||||
"https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/depth_sample_input.png"
|
||||
)
|
||||
cond = [cond_keypose, cond_depth]
|
||||
|
||||
prompt = ["A man walking in an office room with a nice view"]
|
||||
```
|
||||
|
||||
The two control images look as such:
|
||||
|
||||

|
||||

|
||||
|
||||
|
||||
`MultiAdapter` combines keypose and depth adapters.
|
||||
|
||||
`adapter_conditioning_scale` balances the relative influence of the different adapters.
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import StableDiffusionAdapterPipeline, MultiAdapter, T2IAdapter
|
||||
|
||||
adapters = MultiAdapter(
|
||||
[
|
||||
T2IAdapter.from_pretrained("TencentARC/t2iadapter_keypose_sd14v1"),
|
||||
T2IAdapter.from_pretrained("TencentARC/t2iadapter_depth_sd14v1"),
|
||||
]
|
||||
)
|
||||
adapters = adapters.to(torch.float16)
|
||||
|
||||
pipe = StableDiffusionAdapterPipeline.from_pretrained(
|
||||
"CompVis/stable-diffusion-v1-4",
|
||||
torch_dtype=torch.float16,
|
||||
adapter=adapters,
|
||||
).to("cuda")
|
||||
|
||||
image = pipe(prompt, cond, adapter_conditioning_scale=[0.8, 0.8]).images[0]
|
||||
make_image_grid([cond_keypose, cond_depth, image], rows=1, cols=3)
|
||||
```
|
||||
|
||||

|
||||
|
||||
|
||||
## T2I-Adapter vs ControlNet
|
||||
|
||||
T2I-Adapter is similar to [ControlNet](https://huggingface.co/docs/diffusers/main/en/api/pipelines/controlnet).
|
||||
T2I-Adapter uses a smaller auxiliary network which is only run once for the entire diffusion process.
|
||||
However, T2I-Adapter performs slightly worse than ControlNet.
|
||||
|
||||
## StableDiffusionAdapterPipeline
|
||||
|
||||
[[autodoc]] StableDiffusionAdapterPipeline
|
||||
- all
|
||||
- __call__
|
||||
- enable_attention_slicing
|
||||
- disable_attention_slicing
|
||||
- enable_vae_slicing
|
||||
- disable_vae_slicing
|
||||
- enable_xformers_memory_efficient_attention
|
||||
- disable_xformers_memory_efficient_attention
|
||||
- all
|
||||
- __call__
|
||||
- enable_attention_slicing
|
||||
- disable_attention_slicing
|
||||
- enable_vae_slicing
|
||||
- disable_vae_slicing
|
||||
- enable_xformers_memory_efficient_attention
|
||||
- disable_xformers_memory_efficient_attention
|
||||
|
||||
## StableDiffusionXLAdapterPipeline
|
||||
|
||||
[[autodoc]] StableDiffusionXLAdapterPipeline
|
||||
- all
|
||||
- __call__
|
||||
- enable_attention_slicing
|
||||
- disable_attention_slicing
|
||||
- enable_vae_slicing
|
||||
- disable_vae_slicing
|
||||
- enable_xformers_memory_efficient_attention
|
||||
- disable_xformers_memory_efficient_attention
|
||||
- all
|
||||
- __call__
|
||||
- enable_attention_slicing
|
||||
- disable_attention_slicing
|
||||
- enable_vae_slicing
|
||||
- disable_vae_slicing
|
||||
- enable_xformers_memory_efficient_attention
|
||||
- disable_xformers_memory_efficient_attention
|
||||
|
||||
@@ -172,3 +172,41 @@ inpaint = StableDiffusionInpaintPipeline(**text2img.components)
|
||||
|
||||
# now you can use text2img(...), img2img(...), inpaint(...) just like the call methods of each respective pipeline
|
||||
```
|
||||
|
||||
### Create web demos using `gradio`
|
||||
|
||||
The Stable Diffusion pipelines are automatically supported in [Gradio](https://github.com/gradio-app/gradio/), a library that makes creating beautiful and user-friendly machine learning apps on the web a breeze. First, make sure you have Gradio installed:
|
||||
|
||||
```
|
||||
pip install -U gradio
|
||||
```
|
||||
|
||||
Then, create a web demo around any Stable Diffusion-based pipeline. For example, you can create an image generation pipeline in a single line of code with Gradio's [`Interface.from_pipeline`](https://www.gradio.app/docs/interface#interface-from-pipeline) function:
|
||||
|
||||
```py
|
||||
from diffusers import StableDiffusionPipeline
|
||||
import gradio as gr
|
||||
|
||||
pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
|
||||
|
||||
gr.Interface.from_pipeline(pipe).launch()
|
||||
```
|
||||
|
||||
which opens an intuitive drag-and-drop interface in your browser:
|
||||
|
||||

|
||||
|
||||
Similarly, you could create a demo for an image-to-image pipeline with:
|
||||
|
||||
```py
|
||||
from diffusers import StableDiffusionImg2ImgPipeline
|
||||
import gradio as gr
|
||||
|
||||
|
||||
pipe = StableDiffusionImg2ImgPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
|
||||
|
||||
gr.Interface.from_pipeline(pipe).launch()
|
||||
```
|
||||
|
||||
By default, the web demo runs on a local server. If you'd like to share it with others, you can generate a temporary public
|
||||
link by setting `share=True` in `launch()`. Or, you can host your demo on [Hugging Face Spaces](https://huggingface.co/spaces)https://huggingface.co/spaces for a permanent link.
|
||||
@@ -21,7 +21,7 @@ The abstract from the paper is:
|
||||
## Tips
|
||||
|
||||
- SDXL Turbo uses the exact same architecture as [SDXL](./stable_diffusion_xl), which means it also has the same API. Please refer to the [SDXL](./stable_diffusion_xl) API reference for more details.
|
||||
- SDXL Turbo should disable guidance scale by setting `guidance_scale=0.0`
|
||||
- SDXL Turbo should disable guidance scale by setting `guidance_scale=0.0`.
|
||||
- SDXL Turbo should use `timestep_spacing='trailing'` for the scheduler and use between 1 and 4 steps.
|
||||
- SDXL Turbo has been trained to generate images of size 512x512.
|
||||
- SDXL Turbo is open-access, but not open-source meaning that one might have to buy a model license in order to use it for commercial applications. Make sure to read the [official model card](https://huggingface.co/stabilityai/sdxl-turbo) to learn more.
|
||||
|
||||
43
docs/source/en/api/pipelines/stable_diffusion/svd.md
Normal file
43
docs/source/en/api/pipelines/stable_diffusion/svd.md
Normal file
@@ -0,0 +1,43 @@
|
||||
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Stable Video Diffusion
|
||||
|
||||
Stable Video Diffusion was proposed in [Stable Video Diffusion: Scaling Latent Video Diffusion Models to Large Datasets](https://hf.co/papers/2311.15127) by Andreas Blattmann, Tim Dockhorn, Sumith Kulal, Daniel Mendelevitch, Maciej Kilian, Dominik Lorenz, Yam Levi, Zion English, Vikram Voleti, Adam Letts, Varun Jampani, Robin Rombach.
|
||||
|
||||
The abstract from the paper is:
|
||||
|
||||
*We present Stable Video Diffusion - a latent video diffusion model for high-resolution, state-of-the-art text-to-video and image-to-video generation. Recently, latent diffusion models trained for 2D image synthesis have been turned into generative video models by inserting temporal layers and finetuning them on small, high-quality video datasets. However, training methods in the literature vary widely, and the field has yet to agree on a unified strategy for curating video data. In this paper, we identify and evaluate three different stages for successful training of video LDMs: text-to-image pretraining, video pretraining, and high-quality video finetuning. Furthermore, we demonstrate the necessity of a well-curated pretraining dataset for generating high-quality videos and present a systematic curation process to train a strong base model, including captioning and filtering strategies. We then explore the impact of finetuning our base model on high-quality data and train a text-to-video model that is competitive with closed-source video generation. We also show that our base model provides a powerful motion representation for downstream tasks such as image-to-video generation and adaptability to camera motion-specific LoRA modules. Finally, we demonstrate that our model provides a strong multi-view 3D-prior and can serve as a base to finetune a multi-view diffusion model that jointly generates multiple views of objects in a feedforward fashion, outperforming image-based methods at a fraction of their compute budget. We release code and model weights at this https URL.*
|
||||
|
||||
<Tip>
|
||||
|
||||
To learn how to use Stable Video Diffusion, take a look at the [Stable Video Diffusion](../../../using-diffusers/svd) guide.
|
||||
|
||||
<br>
|
||||
|
||||
Check out the [Stability AI](https://huggingface.co/stabilityai) Hub organization for the [base](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid) and [extended frame](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid-xt) checkpoints!
|
||||
|
||||
</Tip>
|
||||
|
||||
## Tips
|
||||
|
||||
Video generation is memory-intensive and one way to reduce your memory usage is to set `enable_forward_chunking` on the pipeline's UNet so you don't run the entire feedforward layer at once. Breaking it up into chunks in a loop is more efficient.
|
||||
|
||||
Check out the [Text or image-to-video](text-img2vid) guide for more details about how certain parameters can affect video generation and how to optimize inference by reducing memory usage.
|
||||
|
||||
## StableVideoDiffusionPipeline
|
||||
|
||||
[[autodoc]] StableVideoDiffusionPipeline
|
||||
|
||||
## StableVideoDiffusionPipelineOutput
|
||||
|
||||
[[autodoc]] pipelines.stable_video_diffusion.StableVideoDiffusionPipelineOutput
|
||||
@@ -167,6 +167,12 @@ Here are some sample outputs:
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
## Tips
|
||||
|
||||
Video generation is memory-intensive and one way to reduce your memory usage is to set `enable_forward_chunking` on the pipeline's UNet so you don't run the entire feedforward layer at once. Breaking it up into chunks in a loop is more efficient.
|
||||
|
||||
Check out the [Text or image-to-video](text-img2vid) guide for more details about how certain parameters can affect video generation and how to optimize inference by reducing memory usage.
|
||||
|
||||
<Tip>
|
||||
|
||||
Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-components-across-pipelines) section to learn how to efficiently load the same components into multiple pipelines.
|
||||
|
||||
@@ -1,9 +1,21 @@
|
||||
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# ConsistencyDecoderScheduler
|
||||
|
||||
This scheduler is a part of the [`ConsistencyDecoderPipeline`] and was introduced in [DALL-E 3](https://openai.com/dall-e-3).
|
||||
This scheduler is a part of the [`ConsistencyDecoderPipeline`] and was introduced in [DALL-E 3](https://openai.com/dall-e-3).
|
||||
|
||||
The original codebase can be found at [openai/consistency_models](https://github.com/openai/consistency_models).
|
||||
|
||||
|
||||
## ConsistencyDecoderScheduler
|
||||
[[autodoc]] schedulers.scheduling_consistency_decoder.ConsistencyDecoderScheduler
|
||||
[[autodoc]] schedulers.scheduling_consistency_decoder.ConsistencyDecoderScheduler
|
||||
|
||||
22
docs/source/en/api/schedulers/edm_euler.md
Normal file
22
docs/source/en/api/schedulers/edm_euler.md
Normal file
@@ -0,0 +1,22 @@
|
||||
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# EDMEulerScheduler
|
||||
|
||||
The Karras formulation of the Euler scheduler (Algorithm 2) from the [Elucidating the Design Space of Diffusion-Based Generative Models](https://huggingface.co/papers/2206.00364) paper by Karras et al. This is a fast scheduler which can often generate good outputs in 20-30 steps. The scheduler is based on the original [k-diffusion](https://github.com/crowsonkb/k-diffusion/blob/481677d114f6ea445aa009cf5bd7a9cdee909e47/k_diffusion/sampling.py#L51) implementation by [Katherine Crowson](https://github.com/crowsonkb/).
|
||||
|
||||
|
||||
## EDMEulerScheduler
|
||||
[[autodoc]] EDMEulerScheduler
|
||||
|
||||
## EDMEulerSchedulerOutput
|
||||
[[autodoc]] schedulers.scheduling_edm_euler.EDMEulerSchedulerOutput
|
||||
24
docs/source/en/api/schedulers/edm_multistep_dpm_solver.md
Normal file
24
docs/source/en/api/schedulers/edm_multistep_dpm_solver.md
Normal file
@@ -0,0 +1,24 @@
|
||||
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# EDMDPMSolverMultistepScheduler
|
||||
|
||||
`EDMDPMSolverMultistepScheduler` is a [Karras formulation](https://huggingface.co/papers/2206.00364) of `DPMSolverMultistep`, a multistep scheduler from [DPM-Solver: A Fast ODE Solver for Diffusion Probabilistic Model Sampling in Around 10 Steps](https://huggingface.co/papers/2206.00927) and [DPM-Solver++: Fast Solver for Guided Sampling of Diffusion Probabilistic Models](https://huggingface.co/papers/2211.01095) by Cheng Lu, Yuhao Zhou, Fan Bao, Jianfei Chen, Chongxuan Li, and Jun Zhu.
|
||||
|
||||
DPMSolver (and the improved version DPMSolver++) is a fast dedicated high-order solver for diffusion ODEs with convergence order guarantee. Empirically, DPMSolver sampling with only 20 steps can generate high-quality
|
||||
samples, and it can generate quite good samples even in 10 steps.
|
||||
|
||||
## EDMDPMSolverMultistepScheduler
|
||||
[[autodoc]] EDMDPMSolverMultistepScheduler
|
||||
|
||||
## SchedulerOutput
|
||||
[[autodoc]] schedulers.scheduling_utils.SchedulerOutput
|
||||
29
docs/source/en/api/schedulers/tcd.md
Normal file
29
docs/source/en/api/schedulers/tcd.md
Normal file
@@ -0,0 +1,29 @@
|
||||
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# TCDScheduler
|
||||
|
||||
[Trajectory Consistency Distillation](https://huggingface.co/papers/2402.19159) by Jianbin Zheng, Minghui Hu, Zhongyi Fan, Chaoyue Wang, Changxing Ding, Dacheng Tao and Tat-Jen Cham introduced a Strategic Stochastic Sampling (Algorithm 4) that is capable of generating good samples in a small number of steps. Distinguishing it as an advanced iteration of the multistep scheduler (Algorithm 1) in the [Consistency Models](https://huggingface.co/papers/2303.01469), Strategic Stochastic Sampling specifically tailored for the trajectory consistency function.
|
||||
|
||||
The abstract from the paper is:
|
||||
|
||||
*Latent Consistency Model (LCM) extends the Consistency Model to the latent space and leverages the guided consistency distillation technique to achieve impressive performance in accelerating text-to-image synthesis. However, we observed that LCM struggles to generate images with both clarity and detailed intricacy. To address this limitation, we initially delve into and elucidate the underlying causes. Our investigation identifies that the primary issue stems from errors in three distinct areas. Consequently, we introduce Trajectory Consistency Distillation (TCD), which encompasses trajectory consistency function and strategic stochastic sampling. The trajectory consistency function diminishes the distillation errors by broadening the scope of the self-consistency boundary condition and endowing the TCD with the ability to accurately trace the entire trajectory of the Probability Flow ODE. Additionally, strategic stochastic sampling is specifically designed to circumvent the accumulated errors inherent in multi-step consistency sampling, which is meticulously tailored to complement the TCD model. Experiments demonstrate that TCD not only significantly enhances image quality at low NFEs but also yields more detailed results compared to the teacher model at high NFEs.*
|
||||
|
||||
The original codebase can be found at [jabir-zheng/TCD](https://github.com/jabir-zheng/TCD).
|
||||
|
||||
## TCDScheduler
|
||||
[[autodoc]] TCDScheduler
|
||||
|
||||
|
||||
## TCDSchedulerOutput
|
||||
[[autodoc]] schedulers.scheduling_tcd.TCDSchedulerOutput
|
||||
|
||||
@@ -37,3 +37,7 @@ Utility and helper functions for working with 🤗 Diffusers.
|
||||
## make_image_grid
|
||||
|
||||
[[autodoc]] utils.make_image_grid
|
||||
|
||||
## randn_tensor
|
||||
|
||||
[[autodoc]] utils.torch_utils.randn_tensor
|
||||
|
||||
15
docs/source/en/api/video_processor.md
Normal file
15
docs/source/en/api/video_processor.md
Normal file
@@ -0,0 +1,15 @@
|
||||
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Video Processor
|
||||
|
||||
The `VideoProcessor` provides a unified API for video pipelines to prepare inputs for VAE encoding and post-processing outputs once they're decoded. The class inherits [`VaeImageProcessor`] so it includes transformations such as resizing, normalization, and conversion between PIL Image, PyTorch, and NumPy arrays.
|
||||
@@ -198,38 +198,81 @@ Anything displayed on [the official Diffusers doc page](https://huggingface.co/d
|
||||
|
||||
Please have a look at [this page](https://github.com/huggingface/diffusers/tree/main/docs) on how to verify changes made to the documentation locally.
|
||||
|
||||
|
||||
### 6. Contribute a community pipeline
|
||||
|
||||
[Pipelines](https://huggingface.co/docs/diffusers/api/pipelines/overview) are usually the first point of contact between the Diffusers library and the user.
|
||||
Pipelines are examples of how to use Diffusers [models](https://huggingface.co/docs/diffusers/api/models/overview) and [schedulers](https://huggingface.co/docs/diffusers/api/schedulers/overview).
|
||||
We support two types of pipelines:
|
||||
> [!TIP]
|
||||
> Read the [Community pipelines](../using-diffusers/custom_pipeline_overview#community-pipelines) guide to learn more about the difference between a GitHub and Hugging Face Hub community pipeline. If you're interested in why we have community pipelines, take a look at GitHub Issue [#841](https://github.com/huggingface/diffusers/issues/841) (basically, we can't maintain all the possible ways diffusion models can be used for inference but we also don't want to prevent the community from building them).
|
||||
|
||||
- Official Pipelines
|
||||
- Community Pipelines
|
||||
Contributing a community pipeline is a great way to share your creativity and work with the community. It lets you build on top of the [`DiffusionPipeline`] so that anyone can load and use it by setting the `custom_pipeline` parameter. This section will walk you through how to create a simple pipeline where the UNet only does a single forward pass and calls the scheduler once (a "one-step" pipeline).
|
||||
|
||||
Both official and community pipelines follow the same design and consist of the same type of components.
|
||||
1. Create a one_step_unet.py file for your community pipeline. This file can contain whatever package you want to use as long as it's installed by the user. Make sure you only have one pipeline class that inherits from [`DiffusionPipeline`] to load model weights and the scheduler configuration from the Hub. Add a UNet and scheduler to the `__init__` function.
|
||||
|
||||
Official pipelines are tested and maintained by the core maintainers of Diffusers. Their code
|
||||
resides in [src/diffusers/pipelines](https://github.com/huggingface/diffusers/tree/main/src/diffusers/pipelines).
|
||||
In contrast, community pipelines are contributed and maintained purely by the **community** and are **not** tested.
|
||||
They reside in [examples/community](https://github.com/huggingface/diffusers/tree/main/examples/community) and while they can be accessed via the [PyPI diffusers package](https://pypi.org/project/diffusers/), their code is not part of the PyPI distribution.
|
||||
You should also add the `register_modules` function to ensure your pipeline and its components can be saved with [`~DiffusionPipeline.save_pretrained`].
|
||||
|
||||
The reason for the distinction is that the core maintainers of the Diffusers library cannot maintain and test all
|
||||
possible ways diffusion models can be used for inference, but some of them may be of interest to the community.
|
||||
Officially released diffusion pipelines,
|
||||
such as Stable Diffusion are added to the core src/diffusers/pipelines package which ensures
|
||||
high quality of maintenance, no backward-breaking code changes, and testing.
|
||||
More bleeding edge pipelines should be added as community pipelines. If usage for a community pipeline is high, the pipeline can be moved to the official pipelines upon request from the community. This is one of the ways we strive to be a community-driven library.
|
||||
```py
|
||||
from diffusers import DiffusionPipeline
|
||||
import torch
|
||||
|
||||
To add a community pipeline, one should add a <name-of-the-community>.py file to [examples/community](https://github.com/huggingface/diffusers/tree/main/examples/community) and adapt the [examples/community/README.md](https://github.com/huggingface/diffusers/tree/main/examples/community/README.md) to include an example of the new pipeline.
|
||||
class UnetSchedulerOneForwardPipeline(DiffusionPipeline):
|
||||
def __init__(self, unet, scheduler):
|
||||
super().__init__()
|
||||
|
||||
An example can be seen [here](https://github.com/huggingface/diffusers/pull/2400).
|
||||
self.register_modules(unet=unet, scheduler=scheduler)
|
||||
```
|
||||
|
||||
Community pipeline PRs are only checked at a superficial level and ideally they should be maintained by their original authors.
|
||||
1. In the forward pass (which we recommend defining as `__call__`), you can add any feature you'd like. For the "one-step" pipeline, create a random image and call the UNet and scheduler once by setting `timestep=1`.
|
||||
|
||||
Contributing a community pipeline is a great way to understand how Diffusers models and schedulers work. Having contributed a community pipeline is usually the first stepping stone to contributing an official pipeline to the
|
||||
core package.
|
||||
```py
|
||||
from diffusers import DiffusionPipeline
|
||||
import torch
|
||||
|
||||
class UnetSchedulerOneForwardPipeline(DiffusionPipeline):
|
||||
def __init__(self, unet, scheduler):
|
||||
super().__init__()
|
||||
|
||||
self.register_modules(unet=unet, scheduler=scheduler)
|
||||
|
||||
def __call__(self):
|
||||
image = torch.randn(
|
||||
(1, self.unet.config.in_channels, self.unet.config.sample_size, self.unet.config.sample_size),
|
||||
)
|
||||
timestep = 1
|
||||
|
||||
model_output = self.unet(image, timestep).sample
|
||||
scheduler_output = self.scheduler.step(model_output, timestep, image).prev_sample
|
||||
|
||||
return scheduler_output
|
||||
```
|
||||
|
||||
Now you can run the pipeline by passing a UNet and scheduler to it or load pretrained weights if the pipeline structure is identical.
|
||||
|
||||
```py
|
||||
from diffusers import DDPMScheduler, UNet2DModel
|
||||
|
||||
scheduler = DDPMScheduler()
|
||||
unet = UNet2DModel()
|
||||
|
||||
pipeline = UnetSchedulerOneForwardPipeline(unet=unet, scheduler=scheduler)
|
||||
output = pipeline()
|
||||
# load pretrained weights
|
||||
pipeline = UnetSchedulerOneForwardPipeline.from_pretrained("google/ddpm-cifar10-32", use_safetensors=True)
|
||||
output = pipeline()
|
||||
```
|
||||
|
||||
You can either share your pipeline as a GitHub community pipeline or Hub community pipeline.
|
||||
|
||||
<hfoptions id="pipeline type">
|
||||
<hfoption id="GitHub pipeline">
|
||||
|
||||
Share your GitHub pipeline by opening a pull request on the Diffusers [repository](https://github.com/huggingface/diffusers) and add the one_step_unet.py file to the [examples/community](https://github.com/huggingface/diffusers/tree/main/examples/community) subfolder.
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="Hub pipeline">
|
||||
|
||||
Share your Hub pipeline by creating a model repository on the Hub and uploading the one_step_unet.py file to it.
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
### 7. Contribute to training examples
|
||||
|
||||
|
||||
@@ -112,7 +112,7 @@ pip install -e ".[flax]"
|
||||
|
||||
These commands will link the folder you cloned the repository to and your Python library paths.
|
||||
Python will now look inside the folder you cloned to in addition to the normal library paths.
|
||||
For example, if your Python packages are typically installed in `~/anaconda3/envs/main/lib/python3.8/site-packages/`, Python will also search the `~/diffusers/` folder you cloned to.
|
||||
For example, if your Python packages are typically installed in `~/anaconda3/envs/main/lib/python3.10/site-packages/`, Python will also search the `~/diffusers/` folder you cloned to.
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
|
||||
@@ -12,27 +12,23 @@ specific language governing permissions and limitations under the License.
|
||||
|
||||
# Speed up inference
|
||||
|
||||
There are several ways to optimize 🤗 Diffusers for inference speed. As a general rule of thumb, we recommend using either [xFormers](xformers) or `torch.nn.functional.scaled_dot_product_attention` in PyTorch 2.0 for their memory-efficient attention.
|
||||
There are several ways to optimize Diffusers for inference speed, such as reducing the computational burden by lowering the data precision or using a lightweight distilled model. There are also memory-efficient attention implementations, [xFormers](xformers) and [scaled dot product attetntion](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html) in PyTorch 2.0, that reduce memory usage which also indirectly speeds up inference. Different speed optimizations can be stacked together to get the fastest inference times.
|
||||
|
||||
<Tip>
|
||||
> [!TIP]
|
||||
> Optimizing for inference speed or reduced memory usage can lead to improved performance in the other category, so you should try to optimize for both whenever you can. This guide focuses on inference speed, but you can learn more about lowering memory usage in the [Reduce memory usage](memory) guide.
|
||||
|
||||
In many cases, optimizing for speed or memory leads to improved performance in the other, so you should try to optimize for both whenever you can. This guide focuses on inference speed, but you can learn more about preserving memory in the [Reduce memory usage](memory) guide.
|
||||
The inference times below are obtained from generating a single 512x512 image from the prompt "a photo of an astronaut riding a horse on mars" with 50 DDIM steps on a NVIDIA A100.
|
||||
|
||||
</Tip>
|
||||
| setup | latency | speed-up |
|
||||
|----------|---------|----------|
|
||||
| baseline | 5.27s | x1 |
|
||||
| tf32 | 4.14s | x1.27 |
|
||||
| fp16 | 3.51s | x1.50 |
|
||||
| combined | 3.41s | x1.54 |
|
||||
|
||||
The results below are obtained from generating a single 512x512 image from the prompt `a photo of an astronaut riding a horse on mars` with 50 DDIM steps on a Nvidia Titan RTX, demonstrating the speed-up you can expect.
|
||||
## TensorFloat-32
|
||||
|
||||
| | latency | speed-up |
|
||||
| ---------------- | ------- | ------- |
|
||||
| original | 9.50s | x1 |
|
||||
| fp16 | 3.61s | x2.63 |
|
||||
| channels last | 3.30s | x2.88 |
|
||||
| traced UNet | 3.21s | x2.96 |
|
||||
| memory efficient attention | 2.63s | x3.61 |
|
||||
|
||||
## Use TensorFloat-32
|
||||
|
||||
On Ampere and later CUDA devices, matrix multiplications and convolutions can use the [TensorFloat-32 (TF32)](https://blogs.nvidia.com/blog/2020/05/14/tensorfloat-32-precision-format/) mode for faster, but slightly less accurate computations. By default, PyTorch enables TF32 mode for convolutions but not matrix multiplications. Unless your network requires full float32 precision, we recommend enabling TF32 for matrix multiplications. It can significantly speeds up computations with typically negligible loss in numerical accuracy.
|
||||
On Ampere and later CUDA devices, matrix multiplications and convolutions can use the [TensorFloat-32 (tf32)](https://blogs.nvidia.com/blog/2020/05/14/tensorfloat-32-precision-format/) mode for faster, but slightly less accurate computations. By default, PyTorch enables tf32 mode for convolutions but not matrix multiplications. Unless your network requires full float32 precision, we recommend enabling tf32 for matrix multiplications. It can significantly speed up computations with typically negligible loss in numerical accuracy.
|
||||
|
||||
```python
|
||||
import torch
|
||||
@@ -40,11 +36,11 @@ import torch
|
||||
torch.backends.cuda.matmul.allow_tf32 = True
|
||||
```
|
||||
|
||||
You can learn more about TF32 in the [Mixed precision training](https://huggingface.co/docs/transformers/en/perf_train_gpu_one#tf32) guide.
|
||||
Learn more about tf32 in the [Mixed precision training](https://huggingface.co/docs/transformers/en/perf_train_gpu_one#tf32) guide.
|
||||
|
||||
## Half-precision weights
|
||||
|
||||
To save GPU memory and get more speed, try loading and running the model weights directly in half-precision or float16:
|
||||
To save GPU memory and get more speed, set `torch_dtype=torch.float16` to load and run the model weights directly with half-precision weights.
|
||||
|
||||
```Python
|
||||
import torch
|
||||
@@ -56,13 +52,76 @@ pipe = DiffusionPipeline.from_pretrained(
|
||||
use_safetensors=True,
|
||||
)
|
||||
pipe = pipe.to("cuda")
|
||||
|
||||
prompt = "a photo of an astronaut riding a horse on mars"
|
||||
image = pipe(prompt).images[0]
|
||||
```
|
||||
|
||||
<Tip warning={true}>
|
||||
> [!WARNING]
|
||||
> Don't use [torch.autocast](https://pytorch.org/docs/stable/amp.html#torch.autocast) in any of the pipelines as it can lead to black images and is always slower than pure float16 precision.
|
||||
|
||||
Don't use [`torch.autocast`](https://pytorch.org/docs/stable/amp.html#torch.autocast) in any of the pipelines as it can lead to black images and is always slower than pure float16 precision.
|
||||
## Distilled model
|
||||
|
||||
</Tip>
|
||||
You could also use a distilled Stable Diffusion model and autoencoder to speed up inference. During distillation, many of the UNet's residual and attention blocks are shed to reduce the model size by 51% and improve latency on CPU/GPU by 43%. The distilled model is faster and uses less memory while generating images of comparable quality to the full Stable Diffusion model.
|
||||
|
||||
> [!TIP]
|
||||
> Read the [Open-sourcing Knowledge Distillation Code and Weights of SD-Small and SD-Tiny](https://huggingface.co/blog/sd_distillation) blog post to learn more about how knowledge distillation training works to produce a faster, smaller, and cheaper generative model.
|
||||
|
||||
The inference times below are obtained from generating 4 images from the prompt "a photo of an astronaut riding a horse on mars" with 25 PNDM steps on a NVIDIA A100. Each generation is repeated 3 times with the distilled Stable Diffusion v1.4 model by [Nota AI](https://hf.co/nota-ai).
|
||||
|
||||
| setup | latency | speed-up |
|
||||
|------------------------------|---------|----------|
|
||||
| baseline | 6.37s | x1 |
|
||||
| distilled | 4.18s | x1.52 |
|
||||
| distilled + tiny autoencoder | 3.83s | x1.66 |
|
||||
|
||||
Let's load the distilled Stable Diffusion model and compare it against the original Stable Diffusion model.
|
||||
|
||||
```py
|
||||
from diffusers import StableDiffusionPipeline
|
||||
import torch
|
||||
|
||||
distilled = StableDiffusionPipeline.from_pretrained(
|
||||
"nota-ai/bk-sdm-small", torch_dtype=torch.float16, use_safetensors=True,
|
||||
).to("cuda")
|
||||
prompt = "a golden vase with different flowers"
|
||||
generator = torch.manual_seed(2023)
|
||||
image = distilled("a golden vase with different flowers", num_inference_steps=25, generator=generator).images[0]
|
||||
image
|
||||
```
|
||||
|
||||
<div class="flex gap-4">
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/original_sd.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">original Stable Diffusion</figcaption>
|
||||
</div>
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/distilled_sd.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">distilled Stable Diffusion</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
### Tiny AutoEncoder
|
||||
|
||||
To speed inference up even more, replace the autoencoder with a [distilled version](https://huggingface.co/sayakpaul/taesdxl-diffusers) of it.
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import AutoencoderTiny, StableDiffusionPipeline
|
||||
|
||||
distilled = StableDiffusionPipeline.from_pretrained(
|
||||
"nota-ai/bk-sdm-small", torch_dtype=torch.float16, use_safetensors=True,
|
||||
).to("cuda")
|
||||
distilled.vae = AutoencoderTiny.from_pretrained(
|
||||
"sayakpaul/taesd-diffusers", torch_dtype=torch.float16, use_safetensors=True,
|
||||
).to("cuda")
|
||||
|
||||
prompt = "a golden vase with different flowers"
|
||||
generator = torch.manual_seed(2023)
|
||||
image = distilled("a golden vase with different flowers", num_inference_steps=25, generator=generator).images[0]
|
||||
image
|
||||
```
|
||||
|
||||
<div class="flex justify-center">
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/distilled_sd_vae.png" />
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">distilled Stable Diffusion + Tiny AutoEncoder</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
@@ -261,7 +261,7 @@ from dataclasses import dataclass
|
||||
|
||||
@dataclass
|
||||
class UNet2DConditionOutput:
|
||||
sample: torch.FloatTensor
|
||||
sample: torch.Tensor
|
||||
|
||||
|
||||
pipe = StableDiffusionPipeline.from_pretrained(
|
||||
|
||||
@@ -1,17 +0,0 @@
|
||||
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Overview
|
||||
|
||||
Generating high-quality outputs is computationally intensive, especially during each iterative step where you go from a noisy output to a less noisy output. One of 🤗 Diffuser's goals is to make this technology widely accessible to everyone, which includes enabling fast inference on consumer and specialized hardware.
|
||||
|
||||
This section will cover tips and tricks - like half-precision weights and sliced attention - for optimizing inference speed and reducing memory-consumption. You'll also learn how to speed up your PyTorch code with [`torch.compile`](https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html) or [ONNX Runtime](https://onnxruntime.ai/docs/), and enable memory-efficient attention with [xFormers](https://facebookresearch.github.io/xformers/). There are also guides for running inference on specific hardware like Apple Silicon, and Intel or Habana processors.
|
||||
182
docs/source/en/optimization/tgate.md
Normal file
182
docs/source/en/optimization/tgate.md
Normal file
@@ -0,0 +1,182 @@
|
||||
# T-GATE
|
||||
|
||||
[T-GATE](https://github.com/HaozheLiu-ST/T-GATE/tree/main) accelerates inference for [Stable Diffusion](../api/pipelines/stable_diffusion/overview), [PixArt](../api/pipelines/pixart), and [Latency Consistency Model](../api/pipelines/latent_consistency_models.md) pipelines by skipping the cross-attention calculation once it converges. This method doesn't require any additional training and it can speed up inference from 10-50%. T-GATE is also compatible with other optimization methods like [DeepCache](./deepcache).
|
||||
|
||||
Before you begin, make sure you install T-GATE.
|
||||
|
||||
```bash
|
||||
pip install tgate
|
||||
pip install -U pytorch diffusers transformers accelerate DeepCache
|
||||
```
|
||||
|
||||
|
||||
To use T-GATE with a pipeline, you need to use its corresponding loader.
|
||||
|
||||
| Pipeline | T-GATE Loader |
|
||||
|---|---|
|
||||
| PixArt | TgatePixArtLoader |
|
||||
| Stable Diffusion XL | TgateSDXLLoader |
|
||||
| Stable Diffusion XL + DeepCache | TgateSDXLDeepCacheLoader |
|
||||
| Stable Diffusion | TgateSDLoader |
|
||||
| Stable Diffusion + DeepCache | TgateSDDeepCacheLoader |
|
||||
|
||||
Next, create a `TgateLoader` with a pipeline, the gate step (the time step to stop calculating the cross attention), and the number of inference steps. Then call the `tgate` method on the pipeline with a prompt, gate step, and the number of inference steps.
|
||||
|
||||
Let's see how to enable this for several different pipelines.
|
||||
|
||||
<hfoptions id="pipelines">
|
||||
<hfoption id="PixArt">
|
||||
|
||||
Accelerate `PixArtAlphaPipeline` with T-GATE:
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import PixArtAlphaPipeline
|
||||
from tgate import TgatePixArtLoader
|
||||
|
||||
pipe = PixArtAlphaPipeline.from_pretrained("PixArt-alpha/PixArt-XL-2-1024-MS", torch_dtype=torch.float16)
|
||||
|
||||
gate_step = 8
|
||||
inference_step = 25
|
||||
pipe = TgatePixArtLoader(
|
||||
pipe,
|
||||
gate_step=gate_step,
|
||||
num_inference_steps=inference_step,
|
||||
).to("cuda")
|
||||
|
||||
image = pipe.tgate(
|
||||
"An alpaca made of colorful building blocks, cyberpunk.",
|
||||
gate_step=gate_step,
|
||||
num_inference_steps=inference_step,
|
||||
).images[0]
|
||||
```
|
||||
</hfoption>
|
||||
<hfoption id="Stable Diffusion XL">
|
||||
|
||||
Accelerate `StableDiffusionXLPipeline` with T-GATE:
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import StableDiffusionXLPipeline
|
||||
from diffusers import DPMSolverMultistepScheduler
|
||||
from tgate import TgateSDXLLoader
|
||||
|
||||
pipe = StableDiffusionXLPipeline.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0",
|
||||
torch_dtype=torch.float16,
|
||||
variant="fp16",
|
||||
use_safetensors=True,
|
||||
)
|
||||
pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
|
||||
|
||||
gate_step = 10
|
||||
inference_step = 25
|
||||
pipe = TgateSDXLLoader(
|
||||
pipe,
|
||||
gate_step=gate_step,
|
||||
num_inference_steps=inference_step,
|
||||
).to("cuda")
|
||||
|
||||
image = pipe.tgate(
|
||||
"Astronaut in a jungle, cold color palette, muted colors, detailed, 8k.",
|
||||
gate_step=gate_step,
|
||||
num_inference_steps=inference_step
|
||||
).images[0]
|
||||
```
|
||||
</hfoption>
|
||||
<hfoption id="StableDiffusionXL with DeepCache">
|
||||
|
||||
Accelerate `StableDiffusionXLPipeline` with [DeepCache](https://github.com/horseee/DeepCache) and T-GATE:
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import StableDiffusionXLPipeline
|
||||
from diffusers import DPMSolverMultistepScheduler
|
||||
from tgate import TgateSDXLDeepCacheLoader
|
||||
|
||||
pipe = StableDiffusionXLPipeline.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0",
|
||||
torch_dtype=torch.float16,
|
||||
variant="fp16",
|
||||
use_safetensors=True,
|
||||
)
|
||||
pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
|
||||
|
||||
gate_step = 10
|
||||
inference_step = 25
|
||||
pipe = TgateSDXLDeepCacheLoader(
|
||||
pipe,
|
||||
cache_interval=3,
|
||||
cache_branch_id=0,
|
||||
).to("cuda")
|
||||
|
||||
image = pipe.tgate(
|
||||
"Astronaut in a jungle, cold color palette, muted colors, detailed, 8k.",
|
||||
gate_step=gate_step,
|
||||
num_inference_steps=inference_step
|
||||
).images[0]
|
||||
```
|
||||
</hfoption>
|
||||
<hfoption id="Latent Consistency Model">
|
||||
|
||||
Accelerate `latent-consistency/lcm-sdxl` with T-GATE:
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import StableDiffusionXLPipeline
|
||||
from diffusers import UNet2DConditionModel, LCMScheduler
|
||||
from diffusers import DPMSolverMultistepScheduler
|
||||
from tgate import TgateSDXLLoader
|
||||
|
||||
unet = UNet2DConditionModel.from_pretrained(
|
||||
"latent-consistency/lcm-sdxl",
|
||||
torch_dtype=torch.float16,
|
||||
variant="fp16",
|
||||
)
|
||||
pipe = StableDiffusionXLPipeline.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0",
|
||||
unet=unet,
|
||||
torch_dtype=torch.float16,
|
||||
variant="fp16",
|
||||
)
|
||||
pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)
|
||||
|
||||
gate_step = 1
|
||||
inference_step = 4
|
||||
pipe = TgateSDXLLoader(
|
||||
pipe,
|
||||
gate_step=gate_step,
|
||||
num_inference_steps=inference_step,
|
||||
lcm=True
|
||||
).to("cuda")
|
||||
|
||||
image = pipe.tgate(
|
||||
"Astronaut in a jungle, cold color palette, muted colors, detailed, 8k.",
|
||||
gate_step=gate_step,
|
||||
num_inference_steps=inference_step
|
||||
).images[0]
|
||||
```
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
T-GATE also supports [`StableDiffusionPipeline`] and [PixArt-alpha/PixArt-LCM-XL-2-1024-MS](https://hf.co/PixArt-alpha/PixArt-LCM-XL-2-1024-MS).
|
||||
|
||||
## Benchmarks
|
||||
| Model | MACs | Param | Latency | Zero-shot 10K-FID on MS-COCO |
|
||||
|-----------------------|----------|-----------|---------|---------------------------|
|
||||
| SD-1.5 | 16.938T | 859.520M | 7.032s | 23.927 |
|
||||
| SD-1.5 w/ T-GATE | 9.875T | 815.557M | 4.313s | 20.789 |
|
||||
| SD-2.1 | 38.041T | 865.785M | 16.121s | 22.609 |
|
||||
| SD-2.1 w/ T-GATE | 22.208T | 815.433 M | 9.878s | 19.940 |
|
||||
| SD-XL | 149.438T | 2.570B | 53.187s | 24.628 |
|
||||
| SD-XL w/ T-GATE | 84.438T | 2.024B | 27.932s | 22.738 |
|
||||
| Pixart-Alpha | 107.031T | 611.350M | 61.502s | 38.669 |
|
||||
| Pixart-Alpha w/ T-GATE | 65.318T | 462.585M | 37.867s | 35.825 |
|
||||
| DeepCache (SD-XL) | 57.888T | - | 19.931s | 23.755 |
|
||||
| DeepCache w/ T-GATE | 43.868T | - | 14.666s | 23.999 |
|
||||
| LCM (SD-XL) | 11.955T | 2.570B | 3.805s | 25.044 |
|
||||
| LCM w/ T-GATE | 11.171T | 2.024B | 3.533s | 25.028 |
|
||||
| LCM (Pixart-Alpha) | 8.563T | 611.350M | 4.733s | 36.086 |
|
||||
| LCM w/ T-GATE | 7.623T | 462.585M | 4.543s | 37.048 |
|
||||
|
||||
The latency is tested on an NVIDIA 1080TI, MACs and Params are calculated with [calflops](https://github.com/MrYxJ/calculate-flops.pytorch), and the FID is calculated with [PytorchFID](https://github.com/mseitzer/pytorch-fid).
|
||||
@@ -75,6 +75,9 @@ Compilation requires some time to complete, so it is best suited for situations
|
||||
|
||||
For more information and different options about `torch.compile`, refer to the [`torch_compile`](https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html) tutorial.
|
||||
|
||||
> [!TIP]
|
||||
> Learn more about other ways PyTorch 2.0 can help optimize your model in the [Accelerate inference of text-to-image diffusion models](../tutorials/fast_diffusion) tutorial.
|
||||
|
||||
## Benchmark
|
||||
|
||||
We conducted a comprehensive benchmark with PyTorch 2.0's efficient attention implementation and `torch.compile` across different GPUs and batch sizes for five of our most used pipelines. The code is benchmarked on 🤗 Diffusers v0.17.0.dev0 to optimize `torch.compile` usage (see [here](https://github.com/huggingface/diffusers/pull/3313) for more details).
|
||||
|
||||
@@ -49,7 +49,7 @@ One of the simplest ways to speed up inference is to place the pipeline on a GPU
|
||||
pipeline = pipeline.to("cuda")
|
||||
```
|
||||
|
||||
To make sure you can use the same image and improve on it, use a [`Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) and set a seed for [reproducibility](./using-diffusers/reproducibility):
|
||||
To make sure you can use the same image and improve on it, use a [`Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) and set a seed for [reproducibility](./using-diffusers/reusing_seeds):
|
||||
|
||||
```python
|
||||
import torch
|
||||
|
||||
@@ -88,7 +88,7 @@ accelerate config default
|
||||
|
||||
Or if your environment doesn't support an interactive shell, like a notebook, you can use:
|
||||
|
||||
```bash
|
||||
```py
|
||||
from accelerate.utils import write_basic_config
|
||||
|
||||
write_basic_config()
|
||||
|
||||
@@ -54,7 +54,7 @@ accelerate config default
|
||||
|
||||
Or if your environment doesn't support an interactive shell, like a notebook, you can use:
|
||||
|
||||
```bash
|
||||
```py
|
||||
from accelerate.utils import write_basic_config
|
||||
|
||||
write_basic_config()
|
||||
@@ -84,7 +84,7 @@ Many of the basic parameters are described in the [DreamBooth](dreambooth#script
|
||||
- `--freeze_model`: freezes the key and value parameters in the cross-attention layer; the default is `crossattn_kv`, but you can set it to `crossattn` to train all the parameters in the cross-attention layer
|
||||
- `--concepts_list`: to learn multiple concepts, provide a path to a JSON file containing the concepts
|
||||
- `--modifier_token`: a special word used to represent the learned concept
|
||||
- `--initializer_token`:
|
||||
- `--initializer_token`: a special word used to initialize the embeddings of the `modifier_token`
|
||||
|
||||
### Prior preservation loss
|
||||
|
||||
|
||||
@@ -52,6 +52,76 @@ To learn more, take a look at the [Distributed Inference with 🤗 Accelerate](h
|
||||
|
||||
</Tip>
|
||||
|
||||
### Device placement
|
||||
|
||||
> [!WARNING]
|
||||
> This feature is experimental and its APIs might change in the future.
|
||||
|
||||
With Accelerate, you can use the `device_map` to determine how to distribute the models of a pipeline across multiple devices. This is useful in situations where you have more than one GPU.
|
||||
|
||||
For example, if you have two 8GB GPUs, then using [`~DiffusionPipeline.enable_model_cpu_offload`] may not work so well because:
|
||||
|
||||
* it only works on a single GPU
|
||||
* a single model might not fit on a single GPU ([`~DiffusionPipeline.enable_sequential_cpu_offload`] might work but it will be extremely slow and it is also limited to a single GPU)
|
||||
|
||||
To make use of both GPUs, you can use the "balanced" device placement strategy which splits the models across all available GPUs.
|
||||
|
||||
> [!WARNING]
|
||||
> Only the "balanced" strategy is supported at the moment, and we plan to support additional mapping strategies in the future.
|
||||
|
||||
```diff
|
||||
from diffusers import DiffusionPipeline
|
||||
import torch
|
||||
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
- "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True,
|
||||
+ "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True, device_map="balanced"
|
||||
)
|
||||
image = pipeline("a dog").images[0]
|
||||
image
|
||||
```
|
||||
|
||||
You can also pass a dictionary to enforce the maximum GPU memory that can be used on each device:
|
||||
|
||||
```diff
|
||||
from diffusers import DiffusionPipeline
|
||||
import torch
|
||||
|
||||
max_memory = {0:"1GB", 1:"1GB"}
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
"runwayml/stable-diffusion-v1-5",
|
||||
torch_dtype=torch.float16,
|
||||
use_safetensors=True,
|
||||
device_map="balanced",
|
||||
+ max_memory=max_memory
|
||||
)
|
||||
image = pipeline("a dog").images[0]
|
||||
image
|
||||
```
|
||||
|
||||
If a device is not present in `max_memory`, then it will be completely ignored and will not participate in the device placement.
|
||||
|
||||
By default, Diffusers uses the maximum memory of all devices. If the models don't fit on the GPUs, they are offloaded to the CPU. If the CPU doesn't have enough memory, then you might see an error. In that case, you could defer to using [`~DiffusionPipeline.enable_sequential_cpu_offload`] and [`~DiffusionPipeline.enable_model_cpu_offload`].
|
||||
|
||||
Call [`~DiffusionPipeline.reset_device_map`] to reset the `device_map` of a pipeline. This is also necessary if you want to use methods like `to()`, [`~DiffusionPipeline.enable_sequential_cpu_offload`], and [`~DiffusionPipeline.enable_model_cpu_offload`] on a pipeline that was device-mapped.
|
||||
|
||||
```py
|
||||
pipeline.reset_device_map()
|
||||
```
|
||||
|
||||
Once a pipeline has been device-mapped, you can also access its device map via `hf_device_map`:
|
||||
|
||||
```py
|
||||
print(pipeline.hf_device_map)
|
||||
```
|
||||
|
||||
An example device map would look like so:
|
||||
|
||||
|
||||
```bash
|
||||
{'unet': 1, 'vae': 1, 'safety_checker': 0, 'text_encoder': 0}
|
||||
```
|
||||
|
||||
## PyTorch Distributed
|
||||
|
||||
PyTorch supports [`DistributedDataParallel`](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html) which enables data parallelism.
|
||||
|
||||
@@ -67,7 +67,7 @@ accelerate config default
|
||||
|
||||
Or if your environment doesn't support an interactive shell, like a notebook, you can use:
|
||||
|
||||
```bash
|
||||
```py
|
||||
from accelerate.utils import write_basic_config
|
||||
|
||||
write_basic_config()
|
||||
@@ -180,7 +180,7 @@ elif args.pretrained_model_name_or_path:
|
||||
revision=args.revision,
|
||||
use_fast=False,
|
||||
)
|
||||
|
||||
|
||||
# Load scheduler and models
|
||||
noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
|
||||
text_encoder = text_encoder_cls.from_pretrained(
|
||||
|
||||
@@ -51,7 +51,7 @@ accelerate config default
|
||||
|
||||
Or if your environment doesn't support an interactive shell, like a notebook, you can use:
|
||||
|
||||
```bash
|
||||
```py
|
||||
from accelerate.utils import write_basic_config
|
||||
|
||||
write_basic_config()
|
||||
@@ -89,7 +89,7 @@ The dataset preprocessing code and training loop are found in the [`main()`](htt
|
||||
|
||||
As with the script parameters, a walkthrough of the training script is provided in the [Text-to-image](text2image#training-script) training guide. Instead, this guide takes a look at the InstructPix2Pix relevant parts of the script.
|
||||
|
||||
The script begins by modifing the [number of input channels](https://github.com/huggingface/diffusers/blob/64603389da01082055a901f2883c4810d1144edb/examples/instruct_pix2pix/train_instruct_pix2pix.py#L445) in the first convolutional layer of the UNet to account for InstructPix2Pix's additional conditioning image:
|
||||
The script begins by modifying the [number of input channels](https://github.com/huggingface/diffusers/blob/64603389da01082055a901f2883c4810d1144edb/examples/instruct_pix2pix/train_instruct_pix2pix.py#L445) in the first convolutional layer of the UNet to account for InstructPix2Pix's additional conditioning image:
|
||||
|
||||
```py
|
||||
in_channels = 8
|
||||
|
||||
@@ -59,7 +59,7 @@ accelerate config default
|
||||
|
||||
Or if your environment doesn't support an interactive shell, like a notebook, you can use:
|
||||
|
||||
```bash
|
||||
```py
|
||||
from accelerate.utils import write_basic_config
|
||||
|
||||
write_basic_config()
|
||||
@@ -205,7 +205,7 @@ model_pred = unet(noisy_latents, timesteps, None, added_cond_kwargs=added_cond_k
|
||||
|
||||
Once you’ve made all your changes or you’re okay with the default configuration, you’re ready to launch the training script! 🚀
|
||||
|
||||
You'll train on the [Pokémon BLIP captions](https://huggingface.co/datasets/lambdalabs/pokemon-blip-captions) dataset to generate your own Pokémon, but you can also create and train on your own dataset by following the [Create a dataset for training](create_dataset) guide. Set the environment variable `DATASET_NAME` to the name of the dataset on the Hub or if you're training on your own files, set the environment variable `TRAIN_DIR` to a path to your dataset.
|
||||
You'll train on the [Naruto BLIP captions](https://huggingface.co/datasets/lambdalabs/naruto-blip-captions) dataset to generate your own Naruto characters, but you can also create and train on your own dataset by following the [Create a dataset for training](create_dataset) guide. Set the environment variable `DATASET_NAME` to the name of the dataset on the Hub or if you're training on your own files, set the environment variable `TRAIN_DIR` to a path to your dataset.
|
||||
|
||||
If you’re training on more than one GPU, add the `--multi_gpu` parameter to the `accelerate launch` command.
|
||||
|
||||
@@ -219,7 +219,7 @@ To monitor training progress with Weights & Biases, add the `--report_to=wandb`
|
||||
<hfoption id="prior model">
|
||||
|
||||
```bash
|
||||
export DATASET_NAME="lambdalabs/pokemon-blip-captions"
|
||||
export DATASET_NAME="lambdalabs/naruto-blip-captions"
|
||||
|
||||
accelerate launch --mixed_precision="fp16" train_text_to_image_prior.py \
|
||||
--dataset_name=$DATASET_NAME \
|
||||
@@ -232,17 +232,17 @@ accelerate launch --mixed_precision="fp16" train_text_to_image_prior.py \
|
||||
--checkpoints_total_limit=3 \
|
||||
--lr_scheduler="constant" \
|
||||
--lr_warmup_steps=0 \
|
||||
--validation_prompts="A robot pokemon, 4k photo" \
|
||||
--validation_prompts="A robot naruto, 4k photo" \
|
||||
--report_to="wandb" \
|
||||
--push_to_hub \
|
||||
--output_dir="kandi2-prior-pokemon-model"
|
||||
--output_dir="kandi2-prior-naruto-model"
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="decoder model">
|
||||
|
||||
```bash
|
||||
export DATASET_NAME="lambdalabs/pokemon-blip-captions"
|
||||
export DATASET_NAME="lambdalabs/naruto-blip-captions"
|
||||
|
||||
accelerate launch --mixed_precision="fp16" train_text_to_image_decoder.py \
|
||||
--dataset_name=$DATASET_NAME \
|
||||
@@ -256,10 +256,10 @@ accelerate launch --mixed_precision="fp16" train_text_to_image_decoder.py \
|
||||
--checkpoints_total_limit=3 \
|
||||
--lr_scheduler="constant" \
|
||||
--lr_warmup_steps=0 \
|
||||
--validation_prompts="A robot pokemon, 4k photo" \
|
||||
--validation_prompts="A robot naruto, 4k photo" \
|
||||
--report_to="wandb" \
|
||||
--push_to_hub \
|
||||
--output_dir="kandi2-decoder-pokemon-model"
|
||||
--output_dir="kandi2-decoder-naruto-model"
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
@@ -279,7 +279,7 @@ prior_components = {"prior_" + k: v for k,v in prior_pipeline.components.items()
|
||||
pipeline = AutoPipelineForText2Image.from_pretrained("kandinsky-community/kandinsky-2-2-decoder", **prior_components, torch_dtype=torch.float16)
|
||||
|
||||
pipe.enable_model_cpu_offload()
|
||||
prompt="A robot pokemon, 4k photo"
|
||||
prompt="A robot naruto, 4k photo"
|
||||
image = pipeline(prompt=prompt, negative_prompt=negative_prompt).images[0]
|
||||
```
|
||||
|
||||
@@ -299,7 +299,7 @@ import torch
|
||||
pipeline = AutoPipelineForText2Image.from_pretrained("path/to/saved/model", torch_dtype=torch.float16)
|
||||
pipeline.enable_model_cpu_offload()
|
||||
|
||||
prompt="A robot pokemon, 4k photo"
|
||||
prompt="A robot naruto, 4k photo"
|
||||
image = pipeline(prompt=prompt).images[0]
|
||||
```
|
||||
|
||||
@@ -313,7 +313,7 @@ unet = UNet2DConditionModel.from_pretrained("path/to/saved/model" + "/checkpoint
|
||||
pipeline = AutoPipelineForText2Image.from_pretrained("kandinsky-community/kandinsky-2-2-decoder", unet=unet, torch_dtype=torch.float16)
|
||||
pipeline.enable_model_cpu_offload()
|
||||
|
||||
image = pipeline(prompt="A robot pokemon, 4k photo").images[0]
|
||||
image = pipeline(prompt="A robot naruto, 4k photo").images[0]
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
|
||||
@@ -53,7 +53,7 @@ accelerate config default
|
||||
|
||||
Or if your environment doesn't support an interactive shell, like a notebook, you can use:
|
||||
|
||||
```bash
|
||||
```py
|
||||
from accelerate.utils import write_basic_config
|
||||
|
||||
write_basic_config()
|
||||
@@ -252,4 +252,4 @@ The SDXL training script is discussed in more detail in the [SDXL training](sdxl
|
||||
Congratulations on distilling a LCM model! To learn more about LCM, the following may be helpful:
|
||||
|
||||
- Learn how to use [LCMs for inference](../using-diffusers/lcm) for text-to-image, image-to-image, and with LoRA checkpoints.
|
||||
- Read the [SDXL in 4 steps with Latent Consistency LoRAs](https://huggingface.co/blog/lcm_lora) blog post to learn more about SDXL LCM-LoRA's for super fast inference, quality comparisons, benchmarks, and more.
|
||||
- Read the [SDXL in 4 steps with Latent Consistency LoRAs](https://huggingface.co/blog/lcm_lora) blog post to learn more about SDXL LCM-LoRA's for super fast inference, quality comparisons, benchmarks, and more.
|
||||
|
||||
@@ -77,7 +77,7 @@ accelerate config default
|
||||
|
||||
Or if your environment doesn't support an interactive shell, like a notebook, you can use:
|
||||
|
||||
```bash
|
||||
```py
|
||||
from accelerate.utils import write_basic_config
|
||||
|
||||
write_basic_config()
|
||||
@@ -113,36 +113,50 @@ The dataset preprocessing code and training loop are found in the [`main()`](htt
|
||||
|
||||
As with the script parameters, a walkthrough of the training script is provided in the [Text-to-image](text2image#training-script) training guide. Instead, this guide takes a look at the LoRA relevant parts of the script.
|
||||
|
||||
The script begins by adding the [new LoRA weights](https://github.com/huggingface/diffusers/blob/dd9a5caf61f04d11c0fa9f3947b69ab0010c9a0f/examples/text_to_image/train_text_to_image_lora.py#L447) to the attention layers. This involves correctly configuring the weight size for each block in the UNet. You'll see the `rank` parameter is used to create the [`~models.attention_processor.LoRAAttnProcessor`]:
|
||||
<hfoptions id="lora">
|
||||
<hfoption id="UNet">
|
||||
|
||||
Diffusers uses [`~peft.LoraConfig`] from the [PEFT](https://hf.co/docs/peft) library to set up the parameters of the LoRA adapter such as the rank, alpha, and which modules to insert the LoRA weights into. The adapter is added to the UNet, and only the LoRA layers are filtered for optimization in `lora_layers`.
|
||||
|
||||
```py
|
||||
lora_attn_procs = {}
|
||||
for name in unet.attn_processors.keys():
|
||||
cross_attention_dim = None if name.endswith("attn1.processor") else unet.config.cross_attention_dim
|
||||
if name.startswith("mid_block"):
|
||||
hidden_size = unet.config.block_out_channels[-1]
|
||||
elif name.startswith("up_blocks"):
|
||||
block_id = int(name[len("up_blocks.")])
|
||||
hidden_size = list(reversed(unet.config.block_out_channels))[block_id]
|
||||
elif name.startswith("down_blocks"):
|
||||
block_id = int(name[len("down_blocks.")])
|
||||
hidden_size = unet.config.block_out_channels[block_id]
|
||||
unet_lora_config = LoraConfig(
|
||||
r=args.rank,
|
||||
lora_alpha=args.rank,
|
||||
init_lora_weights="gaussian",
|
||||
target_modules=["to_k", "to_q", "to_v", "to_out.0"],
|
||||
)
|
||||
|
||||
lora_attn_procs[name] = LoRAAttnProcessor(
|
||||
hidden_size=hidden_size,
|
||||
cross_attention_dim=cross_attention_dim,
|
||||
rank=args.rank,
|
||||
)
|
||||
|
||||
unet.set_attn_processor(lora_attn_procs)
|
||||
lora_layers = AttnProcsLayers(unet.attn_processors)
|
||||
unet.add_adapter(unet_lora_config)
|
||||
lora_layers = filter(lambda p: p.requires_grad, unet.parameters())
|
||||
```
|
||||
|
||||
The [optimizer](https://github.com/huggingface/diffusers/blob/dd9a5caf61f04d11c0fa9f3947b69ab0010c9a0f/examples/text_to_image/train_text_to_image_lora.py#L519) is initialized with the `lora_layers` because these are the only weights that'll be optimized:
|
||||
</hfoption>
|
||||
<hfoption id="text encoder">
|
||||
|
||||
Diffusers also supports finetuning the text encoder with LoRA from the [PEFT](https://hf.co/docs/peft) library when necessary such as finetuning Stable Diffusion XL (SDXL). The [`~peft.LoraConfig`] is used to configure the parameters of the LoRA adapter which are then added to the text encoder, and only the LoRA layers are filtered for training.
|
||||
|
||||
```py
|
||||
text_lora_config = LoraConfig(
|
||||
r=args.rank,
|
||||
lora_alpha=args.rank,
|
||||
init_lora_weights="gaussian",
|
||||
target_modules=["q_proj", "k_proj", "v_proj", "out_proj"],
|
||||
)
|
||||
|
||||
text_encoder_one.add_adapter(text_lora_config)
|
||||
text_encoder_two.add_adapter(text_lora_config)
|
||||
text_lora_parameters_one = list(filter(lambda p: p.requires_grad, text_encoder_one.parameters()))
|
||||
text_lora_parameters_two = list(filter(lambda p: p.requires_grad, text_encoder_two.parameters()))
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
The [optimizer](https://github.com/huggingface/diffusers/blob/e4b8f173b97731686e290b2eb98e7f5df2b1b322/examples/text_to_image/train_text_to_image_lora.py#L529) is initialized with the `lora_layers` because these are the only weights that'll be optimized:
|
||||
|
||||
```py
|
||||
optimizer = optimizer_cls(
|
||||
lora_layers.parameters(),
|
||||
lora_layers,
|
||||
lr=args.learning_rate,
|
||||
betas=(args.adam_beta1, args.adam_beta2),
|
||||
weight_decay=args.adam_weight_decay,
|
||||
@@ -156,7 +170,7 @@ Aside from setting up the LoRA layers, the training script is more or less the s
|
||||
|
||||
Once you've made all your changes or you're okay with the default configuration, you're ready to launch the training script! 🚀
|
||||
|
||||
Let's train on the [Pokémon BLIP captions](https://huggingface.co/datasets/lambdalabs/pokemon-blip-captions) dataset to generate our yown Pokémon. Set the environment variables `MODEL_NAME` and `DATASET_NAME` to the model and dataset respectively. You should also specify where to save the model in `OUTPUT_DIR`, and the name of the model to save to on the Hub with `HUB_MODEL_ID`. The script creates and saves the following files to your repository:
|
||||
Let's train on the [Naruto BLIP captions](https://huggingface.co/datasets/lambdalabs/naruto-blip-captions) dataset to generate your own Naruto characters. Set the environment variables `MODEL_NAME` and `DATASET_NAME` to the model and dataset respectively. You should also specify where to save the model in `OUTPUT_DIR`, and the name of the model to save to on the Hub with `HUB_MODEL_ID`. The script creates and saves the following files to your repository:
|
||||
|
||||
- saved model checkpoints
|
||||
- `pytorch_lora_weights.safetensors` (the trained LoRA weights)
|
||||
@@ -171,9 +185,9 @@ A full training run takes ~5 hours on a 2080 Ti GPU with 11GB of VRAM.
|
||||
|
||||
```bash
|
||||
export MODEL_NAME="runwayml/stable-diffusion-v1-5"
|
||||
export OUTPUT_DIR="/sddata/finetune/lora/pokemon"
|
||||
export HUB_MODEL_ID="pokemon-lora"
|
||||
export DATASET_NAME="lambdalabs/pokemon-blip-captions"
|
||||
export OUTPUT_DIR="/sddata/finetune/lora/naruto"
|
||||
export HUB_MODEL_ID="naruto-lora"
|
||||
export DATASET_NAME="lambdalabs/naruto-blip-captions"
|
||||
|
||||
accelerate launch --mixed_precision="fp16" train_text_to_image_lora.py \
|
||||
--pretrained_model_name_or_path=$MODEL_NAME \
|
||||
@@ -194,7 +208,7 @@ accelerate launch --mixed_precision="fp16" train_text_to_image_lora.py \
|
||||
--hub_model_id=${HUB_MODEL_ID} \
|
||||
--report_to=wandb \
|
||||
--checkpointing_steps=500 \
|
||||
--validation_prompt="A pokemon with blue eyes." \
|
||||
--validation_prompt="A naruto with blue eyes." \
|
||||
--seed=1337
|
||||
```
|
||||
|
||||
@@ -206,7 +220,7 @@ import torch
|
||||
|
||||
pipeline = AutoPipelineForText2Image.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16).to("cuda")
|
||||
pipeline.load_lora_weights("path/to/lora/model", weight_name="pytorch_lora_weights.safetensors")
|
||||
image = pipeline("A pokemon with blue eyes").images[0]
|
||||
image = pipeline("A naruto with blue eyes").images[0]
|
||||
```
|
||||
|
||||
## Next steps
|
||||
|
||||
@@ -59,7 +59,7 @@ accelerate config default
|
||||
|
||||
Or if your environment doesn't support an interactive shell, like a notebook, you can use:
|
||||
|
||||
```bash
|
||||
```py
|
||||
from accelerate.utils import write_basic_config
|
||||
|
||||
write_basic_config()
|
||||
@@ -176,7 +176,7 @@ If you want to learn more about how the training loop works, check out the [Unde
|
||||
|
||||
Once you’ve made all your changes or you’re okay with the default configuration, you’re ready to launch the training script! 🚀
|
||||
|
||||
Let’s train on the [Pokémon BLIP captions](https://huggingface.co/datasets/lambdalabs/pokemon-blip-captions) dataset to generate your own Pokémon. Set the environment variables `MODEL_NAME` and `DATASET_NAME` to the model and the dataset (either from the Hub or a local path). You should also specify a VAE other than the SDXL VAE (either from the Hub or a local path) with `VAE_NAME` to avoid numerical instabilities.
|
||||
Let’s train on the [Naruto BLIP captions](https://huggingface.co/datasets/lambdalabs/naruto-blip-captions) dataset to generate your own Naruto characters. Set the environment variables `MODEL_NAME` and `DATASET_NAME` to the model and the dataset (either from the Hub or a local path). You should also specify a VAE other than the SDXL VAE (either from the Hub or a local path) with `VAE_NAME` to avoid numerical instabilities.
|
||||
|
||||
<Tip>
|
||||
|
||||
@@ -187,7 +187,7 @@ To monitor training progress with Weights & Biases, add the `--report_to=wandb`
|
||||
```bash
|
||||
export MODEL_NAME="stabilityai/stable-diffusion-xl-base-1.0"
|
||||
export VAE_NAME="madebyollin/sdxl-vae-fp16-fix"
|
||||
export DATASET_NAME="lambdalabs/pokemon-blip-captions"
|
||||
export DATASET_NAME="lambdalabs/naruto-blip-captions"
|
||||
|
||||
accelerate launch train_text_to_image_sdxl.py \
|
||||
--pretrained_model_name_or_path=$MODEL_NAME \
|
||||
@@ -211,7 +211,7 @@ accelerate launch train_text_to_image_sdxl.py \
|
||||
--validation_prompt="a cute Sundar Pichai creature" \
|
||||
--validation_epochs 5 \
|
||||
--checkpointing_steps=5000 \
|
||||
--output_dir="sdxl-pokemon-model" \
|
||||
--output_dir="sdxl-naruto-model" \
|
||||
--push_to_hub
|
||||
```
|
||||
|
||||
@@ -226,9 +226,9 @@ import torch
|
||||
|
||||
pipeline = DiffusionPipeline.from_pretrained("path/to/your/model", torch_dtype=torch.float16).to("cuda")
|
||||
|
||||
prompt = "A pokemon with green eyes and red legs."
|
||||
prompt = "A naruto with green eyes and red legs."
|
||||
image = pipeline(prompt, num_inference_steps=30, guidance_scale=7.5).images[0]
|
||||
image.save("pokemon.png")
|
||||
image.save("naruto.png")
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
@@ -244,11 +244,11 @@ import torch_xla.core.xla_model as xm
|
||||
device = xm.xla_device()
|
||||
pipeline = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0").to(device)
|
||||
|
||||
prompt = "A pokemon with green eyes and red legs."
|
||||
prompt = "A naruto with green eyes and red legs."
|
||||
start = time()
|
||||
image = pipeline(prompt, num_inference_steps=inference_steps).images[0]
|
||||
print(f'Compilation time is {time()-start} sec')
|
||||
image.save("pokemon.png")
|
||||
image.save("naruto.png")
|
||||
|
||||
start = time()
|
||||
image = pipeline(prompt, num_inference_steps=inference_steps).images[0]
|
||||
|
||||
@@ -53,7 +53,7 @@ accelerate config default
|
||||
|
||||
Or if your environment doesn't support an interactive shell, like a notebook, you can use:
|
||||
|
||||
```bash
|
||||
```py
|
||||
from accelerate.utils import write_basic_config
|
||||
|
||||
write_basic_config()
|
||||
|
||||
@@ -69,7 +69,7 @@ accelerate config default
|
||||
|
||||
Or if your environment doesn't support an interactive shell, like a notebook, you can use:
|
||||
|
||||
```bash
|
||||
```py
|
||||
from accelerate.utils import write_basic_config
|
||||
|
||||
write_basic_config()
|
||||
@@ -158,7 +158,7 @@ Once you've made all your changes or you're okay with the default configuration,
|
||||
<hfoptions id="training-inference">
|
||||
<hfoption id="PyTorch">
|
||||
|
||||
Let's train on the [Pokémon BLIP captions](https://huggingface.co/datasets/lambdalabs/pokemon-blip-captions) dataset to generate your own Pokémon. Set the environment variables `MODEL_NAME` and `dataset_name` to the model and the dataset (either from the Hub or a local path). If you're training on more than one GPU, add the `--multi_gpu` parameter to the `accelerate launch` command.
|
||||
Let's train on the [Naruto BLIP captions](https://huggingface.co/datasets/lambdalabs/naruto-blip-captions) dataset to generate your own Naruto characters. Set the environment variables `MODEL_NAME` and `dataset_name` to the model and the dataset (either from the Hub or a local path). If you're training on more than one GPU, add the `--multi_gpu` parameter to the `accelerate launch` command.
|
||||
|
||||
<Tip>
|
||||
|
||||
@@ -168,7 +168,7 @@ To train on a local dataset, set the `TRAIN_DIR` and `OUTPUT_DIR` environment va
|
||||
|
||||
```bash
|
||||
export MODEL_NAME="runwayml/stable-diffusion-v1-5"
|
||||
export dataset_name="lambdalabs/pokemon-blip-captions"
|
||||
export dataset_name="lambdalabs/naruto-blip-captions"
|
||||
|
||||
accelerate launch --mixed_precision="fp16" train_text_to_image.py \
|
||||
--pretrained_model_name_or_path=$MODEL_NAME \
|
||||
@@ -183,7 +183,7 @@ accelerate launch --mixed_precision="fp16" train_text_to_image.py \
|
||||
--max_grad_norm=1 \
|
||||
--enable_xformers_memory_efficient_attention
|
||||
--lr_scheduler="constant" --lr_warmup_steps=0 \
|
||||
--output_dir="sd-pokemon-model" \
|
||||
--output_dir="sd-naruto-model" \
|
||||
--push_to_hub
|
||||
```
|
||||
|
||||
@@ -202,7 +202,7 @@ To train on a local dataset, set the `TRAIN_DIR` and `OUTPUT_DIR` environment va
|
||||
|
||||
```bash
|
||||
export MODEL_NAME="runwayml/stable-diffusion-v1-5"
|
||||
export dataset_name="lambdalabs/pokemon-blip-captions"
|
||||
export dataset_name="lambdalabs/naruto-blip-captions"
|
||||
|
||||
python train_text_to_image_flax.py \
|
||||
--pretrained_model_name_or_path=$MODEL_NAME \
|
||||
@@ -212,7 +212,7 @@ python train_text_to_image_flax.py \
|
||||
--max_train_steps=15000 \
|
||||
--learning_rate=1e-05 \
|
||||
--max_grad_norm=1 \
|
||||
--output_dir="sd-pokemon-model" \
|
||||
--output_dir="sd-naruto-model" \
|
||||
--push_to_hub
|
||||
```
|
||||
|
||||
@@ -231,7 +231,7 @@ import torch
|
||||
pipeline = StableDiffusionPipeline.from_pretrained("path/to/saved_model", torch_dtype=torch.float16, use_safetensors=True).to("cuda")
|
||||
|
||||
image = pipeline(prompt="yoda").images[0]
|
||||
image.save("yoda-pokemon.png")
|
||||
image.save("yoda-naruto.png")
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
@@ -246,7 +246,7 @@ from diffusers import FlaxStableDiffusionPipeline
|
||||
|
||||
pipeline, params = FlaxStableDiffusionPipeline.from_pretrained("path/to/saved_model", dtype=jax.numpy.bfloat16)
|
||||
|
||||
prompt = "yoda pokemon"
|
||||
prompt = "yoda naruto"
|
||||
prng_seed = jax.random.PRNGKey(0)
|
||||
num_inference_steps = 50
|
||||
|
||||
@@ -261,7 +261,7 @@ prompt_ids = shard(prompt_ids)
|
||||
|
||||
images = pipeline(prompt_ids, params, prng_seed, num_inference_steps, jit=True).images
|
||||
images = pipeline.numpy_to_pil(np.asarray(images.reshape((num_samples,) + images.shape[-3:])))
|
||||
image.save("yoda-pokemon.png")
|
||||
image.save("yoda-naruto.png")
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
|
||||
@@ -67,7 +67,7 @@ accelerate config default
|
||||
|
||||
Or if your environment doesn't support an interactive shell, like a notebook, you can use:
|
||||
|
||||
```bash
|
||||
```py
|
||||
from accelerate.utils import write_basic_config
|
||||
|
||||
write_basic_config()
|
||||
|
||||
@@ -51,7 +51,7 @@ accelerate config default
|
||||
|
||||
Or if your environment doesn't support an interactive shell like a notebook, you can use:
|
||||
|
||||
```bash
|
||||
```py
|
||||
from accelerate.utils import write_basic_config
|
||||
|
||||
write_basic_config()
|
||||
|
||||
@@ -53,7 +53,7 @@ accelerate config default
|
||||
|
||||
Or if your environment doesn't support an interactive shell, like a notebook, you can use:
|
||||
|
||||
```bash
|
||||
```py
|
||||
from accelerate.utils import write_basic_config
|
||||
|
||||
write_basic_config()
|
||||
@@ -131,7 +131,7 @@ If you want to learn more about how the training loop works, check out the [Unde
|
||||
|
||||
Once you’ve made all your changes or you’re okay with the default configuration, you’re ready to launch the training script! 🚀
|
||||
|
||||
Set the `DATASET_NAME` environment variable to the dataset name from the Hub. This guide uses the [Pokémon BLIP captions](https://huggingface.co/datasets/lambdalabs/pokemon-blip-captions) dataset, but you can create and train on your own datasets as well (see the [Create a dataset for training](create_dataset) guide).
|
||||
Set the `DATASET_NAME` environment variable to the dataset name from the Hub. This guide uses the [Naruto BLIP captions](https://huggingface.co/datasets/lambdalabs/naruto-blip-captions) dataset, but you can create and train on your own datasets as well (see the [Create a dataset for training](create_dataset) guide).
|
||||
|
||||
<Tip>
|
||||
|
||||
@@ -140,7 +140,7 @@ To monitor training progress with Weights & Biases, add the `--report_to=wandb`
|
||||
</Tip>
|
||||
|
||||
```bash
|
||||
export DATASET_NAME="lambdalabs/pokemon-blip-captions"
|
||||
export DATASET_NAME="lambdalabs/naruto-blip-captions"
|
||||
|
||||
accelerate launch train_text_to_image_prior.py \
|
||||
--mixed_precision="fp16" \
|
||||
@@ -156,10 +156,10 @@ accelerate launch train_text_to_image_prior.py \
|
||||
--checkpoints_total_limit=3 \
|
||||
--lr_scheduler="constant" \
|
||||
--lr_warmup_steps=0 \
|
||||
--validation_prompts="A robot pokemon, 4k photo" \
|
||||
--validation_prompts="A robot naruto, 4k photo" \
|
||||
--report_to="wandb" \
|
||||
--push_to_hub \
|
||||
--output_dir="wuerstchen-prior-pokemon-model"
|
||||
--output_dir="wuerstchen-prior-naruto-model"
|
||||
```
|
||||
|
||||
Once training is complete, you can use your newly trained model for inference!
|
||||
@@ -171,9 +171,9 @@ from diffusers.pipelines.wuerstchen import DEFAULT_STAGE_C_TIMESTEPS
|
||||
|
||||
pipeline = AutoPipelineForText2Image.from_pretrained("path/to/saved/model", torch_dtype=torch.float16).to("cuda")
|
||||
|
||||
caption = "A cute bird pokemon holding a shield"
|
||||
caption = "A cute bird naruto holding a shield"
|
||||
images = pipeline(
|
||||
caption,
|
||||
caption,
|
||||
width=1024,
|
||||
height=1536,
|
||||
prior_timesteps=DEFAULT_STAGE_C_TIMESTEPS,
|
||||
|
||||
@@ -12,75 +12,74 @@ specific language governing permissions and limitations under the License.
|
||||
|
||||
# AutoPipeline
|
||||
|
||||
🤗 Diffusers is able to complete many different tasks, and you can often reuse the same pretrained weights for multiple tasks such as text-to-image, image-to-image, and inpainting. If you're new to the library and diffusion models though, it may be difficult to know which pipeline to use for a task. For example, if you're using the [runwayml/stable-diffusion-v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5) checkpoint for text-to-image, you might not know that you could also use it for image-to-image and inpainting by loading the checkpoint with the [`StableDiffusionImg2ImgPipeline`] and [`StableDiffusionInpaintPipeline`] classes respectively.
|
||||
Diffusers provides many pipelines for basic tasks like generating images, videos, audio, and inpainting. On top of these, there are specialized pipelines for adapters and features like upscaling, super-resolution, and more. Different pipeline classes can even use the same checkpoint because they share the same pretrained model! With so many different pipelines, it can be overwhelming to know which pipeline class to use.
|
||||
|
||||
The `AutoPipeline` class is designed to simplify the variety of pipelines in 🤗 Diffusers. It is a generic, *task-first* pipeline that lets you focus on the task. The `AutoPipeline` automatically detects the correct pipeline class to use, which makes it easier to load a checkpoint for a task without knowing the specific pipeline class name.
|
||||
The [AutoPipeline](../api/pipelines/auto_pipeline) class is designed to simplify the variety of pipelines in Diffusers. It is a generic *task-first* pipeline that lets you focus on a task ([`AutoPipelineForText2Image`], [`AutoPipelineForImage2Image`], and [`AutoPipelineForInpainting`]) without needing to know the specific pipeline class. The [AutoPipeline](../api/pipelines/auto_pipeline) automatically detects the correct pipeline class to use.
|
||||
|
||||
<Tip>
|
||||
For example, let's use the [dreamlike-art/dreamlike-photoreal-2.0](https://hf.co/dreamlike-art/dreamlike-photoreal-2.0) checkpoint.
|
||||
|
||||
Take a look at the [AutoPipeline](../api/pipelines/auto_pipeline) reference to see which tasks are supported. Currently, it supports text-to-image, image-to-image, and inpainting.
|
||||
Under the hood, [AutoPipeline](../api/pipelines/auto_pipeline):
|
||||
|
||||
</Tip>
|
||||
1. Detects a `"stable-diffusion"` class from the [model_index.json](https://hf.co/dreamlike-art/dreamlike-photoreal-2.0/blob/main/model_index.json) file.
|
||||
2. Depending on the task you're interested in, it loads the [`StableDiffusionPipeline`], [`StableDiffusionImg2ImgPipeline`], or [`StableDiffusionInpaintPipeline`]. Any parameter (`strength`, `num_inference_steps`, etc.) you would pass to these specific pipelines can also be passed to the [AutoPipeline](../api/pipelines/auto_pipeline).
|
||||
|
||||
This tutorial shows you how to use an `AutoPipeline` to automatically infer the pipeline class to load for a specific task, given the pretrained weights.
|
||||
|
||||
## Choose an AutoPipeline for your task
|
||||
|
||||
Start by picking a checkpoint. For example, if you're interested in text-to-image with the [runwayml/stable-diffusion-v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5) checkpoint, use [`AutoPipelineForText2Image`]:
|
||||
<hfoptions id="autopipeline">
|
||||
<hfoption id="text-to-image">
|
||||
|
||||
```py
|
||||
from diffusers import AutoPipelineForText2Image
|
||||
import torch
|
||||
|
||||
pipeline = AutoPipelineForText2Image.from_pretrained(
|
||||
"runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True
|
||||
pipe_txt2img = AutoPipelineForText2Image.from_pretrained(
|
||||
"dreamlike-art/dreamlike-photoreal-2.0", torch_dtype=torch.float16, use_safetensors=True
|
||||
).to("cuda")
|
||||
prompt = "peasant and dragon combat, wood cutting style, viking era, bevel with rune"
|
||||
|
||||
image = pipeline(prompt, num_inference_steps=25).images[0]
|
||||
prompt = "cinematic photo of Godzilla eating sushi with a cat in a izakaya, 35mm photograph, film, professional, 4k, highly detailed"
|
||||
generator = torch.Generator(device="cpu").manual_seed(37)
|
||||
image = pipe_txt2img(prompt, generator=generator).images[0]
|
||||
image
|
||||
```
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/autopipeline-text2img.png" alt="generated image of peasant fighting dragon in wood cutting style"/>
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/autopipeline-text2img.png"/>
|
||||
</div>
|
||||
|
||||
Under the hood, [`AutoPipelineForText2Image`]:
|
||||
|
||||
1. automatically detects a `"stable-diffusion"` class from the [`model_index.json`](https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/model_index.json) file
|
||||
2. loads the corresponding text-to-image [`StableDiffusionPipeline`] based on the `"stable-diffusion"` class name
|
||||
|
||||
Likewise, for image-to-image, [`AutoPipelineForImage2Image`] detects a `"stable-diffusion"` checkpoint from the `model_index.json` file and it'll load the corresponding [`StableDiffusionImg2ImgPipeline`] behind the scenes. You can also pass any additional arguments specific to the pipeline class such as `strength`, which determines the amount of noise or variation added to an input image:
|
||||
</hfoption>
|
||||
<hfoption id="image-to-image">
|
||||
|
||||
```py
|
||||
from diffusers import AutoPipelineForImage2Image
|
||||
from diffusers.utils import load_image
|
||||
import torch
|
||||
import requests
|
||||
from PIL import Image
|
||||
from io import BytesIO
|
||||
|
||||
pipeline = AutoPipelineForImage2Image.from_pretrained(
|
||||
"runwayml/stable-diffusion-v1-5",
|
||||
torch_dtype=torch.float16,
|
||||
use_safetensors=True,
|
||||
pipe_img2img = AutoPipelineForImage2Image.from_pretrained(
|
||||
"dreamlike-art/dreamlike-photoreal-2.0", torch_dtype=torch.float16, use_safetensors=True
|
||||
).to("cuda")
|
||||
prompt = "a portrait of a dog wearing a pearl earring"
|
||||
|
||||
url = "https://upload.wikimedia.org/wikipedia/commons/thumb/0/0f/1665_Girl_with_a_Pearl_Earring.jpg/800px-1665_Girl_with_a_Pearl_Earring.jpg"
|
||||
init_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/autopipeline-text2img.png")
|
||||
|
||||
response = requests.get(url)
|
||||
image = Image.open(BytesIO(response.content)).convert("RGB")
|
||||
image.thumbnail((768, 768))
|
||||
|
||||
image = pipeline(prompt, image, num_inference_steps=200, strength=0.75, guidance_scale=10.5).images[0]
|
||||
prompt = "cinematic photo of Godzilla eating burgers with a cat in a fast food restaurant, 35mm photograph, film, professional, 4k, highly detailed"
|
||||
generator = torch.Generator(device="cpu").manual_seed(53)
|
||||
image = pipe_img2img(prompt, image=init_image, generator=generator).images[0]
|
||||
image
|
||||
```
|
||||
|
||||
Notice how the [dreamlike-art/dreamlike-photoreal-2.0](https://hf.co/dreamlike-art/dreamlike-photoreal-2.0) checkpoint is used for both text-to-image and image-to-image tasks? To save memory and avoid loading the checkpoint twice, use the [`~DiffusionPipeline.from_pipe`] method.
|
||||
|
||||
```py
|
||||
pipe_img2img = AutoPipelineForImage2Image.from_pipe(pipe_txt2img).to("cuda")
|
||||
image = pipeline(prompt, image=init_image, generator=generator).images[0]
|
||||
image
|
||||
```
|
||||
|
||||
You can learn more about the [`~DiffusionPipeline.from_pipe`] method in the [Reuse a pipeline](../using-diffusers/loading#reuse-a-pipeline) guide.
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/autopipeline-img2img.png" alt="generated image of a vermeer portrait of a dog wearing a pearl earring"/>
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/autopipeline-img2img.png"/>
|
||||
</div>
|
||||
|
||||
And if you want to do inpainting, then [`AutoPipelineForInpainting`] loads the underlying [`StableDiffusionInpaintPipeline`] class in the same way:
|
||||
</hfoption>
|
||||
<hfoption id="inpainting">
|
||||
|
||||
```py
|
||||
from diffusers import AutoPipelineForInpainting
|
||||
@@ -91,22 +90,27 @@ pipeline = AutoPipelineForInpainting.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16, use_safetensors=True
|
||||
).to("cuda")
|
||||
|
||||
img_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png"
|
||||
mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png"
|
||||
init_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/autopipeline-img2img.png")
|
||||
mask_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/autopipeline-mask.png")
|
||||
|
||||
init_image = load_image(img_url).convert("RGB")
|
||||
mask_image = load_image(mask_url).convert("RGB")
|
||||
|
||||
prompt = "A majestic tiger sitting on a bench"
|
||||
image = pipeline(prompt, image=init_image, mask_image=mask_image, num_inference_steps=50, strength=0.80).images[0]
|
||||
prompt = "cinematic photo of a owl, 35mm photograph, film, professional, 4k, highly detailed"
|
||||
generator = torch.Generator(device="cpu").manual_seed(38)
|
||||
image = pipeline(prompt, image=init_image, mask_image=mask_image, generator=generator, strength=0.4).images[0]
|
||||
image
|
||||
```
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/autopipeline-inpaint.png" alt="generated image of a tiger sitting on a bench"/>
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/autopipeline-inpaint.png"/>
|
||||
</div>
|
||||
|
||||
If you try to load an unsupported checkpoint, it'll throw an error:
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
## Unsupported checkpoints
|
||||
|
||||
The [AutoPipeline](../api/pipelines/auto_pipeline) supports [Stable Diffusion](../api/pipelines/stable_diffusion/overview), [Stable Diffusion XL](../api/pipelines/stable_diffusion/stable_diffusion_xl), [ControlNet](../api/pipelines/controlnet), [Kandinsky 2.1](../api/pipelines/kandinsky.md), [Kandinsky 2.2](../api/pipelines/kandinsky_v22), and [DeepFloyd IF](../api/pipelines/deepfloyd_if) checkpoints.
|
||||
|
||||
If you try to load an unsupported checkpoint, you'll get an error.
|
||||
|
||||
```py
|
||||
from diffusers import AutoPipelineForImage2Image
|
||||
@@ -117,54 +121,3 @@ pipeline = AutoPipelineForImage2Image.from_pretrained(
|
||||
)
|
||||
"ValueError: AutoPipeline can't find a pipeline linked to ShapEImg2ImgPipeline for None"
|
||||
```
|
||||
|
||||
## Use multiple pipelines
|
||||
|
||||
For some workflows or if you're loading many pipelines, it is more memory-efficient to reuse the same components from a checkpoint instead of reloading them which would unnecessarily consume additional memory. For example, if you're using a checkpoint for text-to-image and you want to use it again for image-to-image, use the [`~AutoPipelineForImage2Image.from_pipe`] method. This method creates a new pipeline from the components of a previously loaded pipeline at no additional memory cost.
|
||||
|
||||
The [`~AutoPipelineForImage2Image.from_pipe`] method detects the original pipeline class and maps it to the new pipeline class corresponding to the task you want to do. For example, if you load a `"stable-diffusion"` class pipeline for text-to-image:
|
||||
|
||||
```py
|
||||
from diffusers import AutoPipelineForText2Image, AutoPipelineForImage2Image
|
||||
import torch
|
||||
|
||||
pipeline_text2img = AutoPipelineForText2Image.from_pretrained(
|
||||
"runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True
|
||||
)
|
||||
print(type(pipeline_text2img))
|
||||
"<class 'diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline'>"
|
||||
```
|
||||
|
||||
Then [`~AutoPipelineForImage2Image.from_pipe`] maps the original `"stable-diffusion"` pipeline class to [`StableDiffusionImg2ImgPipeline`]:
|
||||
|
||||
```py
|
||||
pipeline_img2img = AutoPipelineForImage2Image.from_pipe(pipeline_text2img)
|
||||
print(type(pipeline_img2img))
|
||||
"<class 'diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline'>"
|
||||
```
|
||||
|
||||
If you passed an optional argument - like disabling the safety checker - to the original pipeline, this argument is also passed on to the new pipeline:
|
||||
|
||||
```py
|
||||
from diffusers import AutoPipelineForText2Image, AutoPipelineForImage2Image
|
||||
import torch
|
||||
|
||||
pipeline_text2img = AutoPipelineForText2Image.from_pretrained(
|
||||
"runwayml/stable-diffusion-v1-5",
|
||||
torch_dtype=torch.float16,
|
||||
use_safetensors=True,
|
||||
requires_safety_checker=False,
|
||||
).to("cuda")
|
||||
|
||||
pipeline_img2img = AutoPipelineForImage2Image.from_pipe(pipeline_text2img)
|
||||
print(pipeline_img2img.config.requires_safety_checker)
|
||||
"False"
|
||||
```
|
||||
|
||||
You can overwrite any of the arguments and even configuration from the original pipeline if you want to change the behavior of the new pipeline. For example, to turn the safety checker back on and add the `strength` argument:
|
||||
|
||||
```py
|
||||
pipeline_img2img = AutoPipelineForImage2Image.from_pipe(pipeline_text2img, requires_safety_checker=True, strength=0.3)
|
||||
print(pipeline_img2img.config.requires_safety_checker)
|
||||
"True"
|
||||
```
|
||||
|
||||
@@ -14,19 +14,17 @@ specific language governing permissions and limitations under the License.
|
||||
|
||||
# Load LoRAs for inference
|
||||
|
||||
There are many adapters (with LoRAs being the most common type) trained in different styles to achieve different effects. You can even combine multiple adapters to create new and unique images. With the 🤗 [PEFT](https://huggingface.co/docs/peft/index) integration in 🤗 Diffusers, it is really easy to load and manage adapters for inference. In this guide, you'll learn how to use different adapters with [Stable Diffusion XL (SDXL)](../api/pipelines/stable_diffusion/stable_diffusion_xl) for inference.
|
||||
There are many adapter types (with [LoRAs](https://huggingface.co/docs/peft/conceptual_guides/adapter#low-rank-adaptation-lora) being the most popular) trained in different styles to achieve different effects. You can even combine multiple adapters to create new and unique images.
|
||||
|
||||
Throughout this guide, you'll use LoRA as the main adapter technique, so we'll use the terms LoRA and adapter interchangeably. You should have some familiarity with LoRA, and if you don't, we welcome you to check out the [LoRA guide](https://huggingface.co/docs/peft/conceptual_guides/lora).
|
||||
In this tutorial, you'll learn how to easily load and manage adapters for inference with the 🤗 [PEFT](https://huggingface.co/docs/peft/index) integration in 🤗 Diffusers. You'll use LoRA as the main adapter technique, so you'll see the terms LoRA and adapter used interchangeably.
|
||||
|
||||
Let's first install all the required libraries.
|
||||
|
||||
```bash
|
||||
!pip install -q transformers accelerate
|
||||
!pip install peft
|
||||
!pip install diffusers
|
||||
!pip install -q transformers accelerate peft diffusers
|
||||
```
|
||||
|
||||
Now, let's load a pipeline with a SDXL checkpoint:
|
||||
Now, load a pipeline with a [Stable Diffusion XL (SDXL)](../api/pipelines/stable_diffusion/stable_diffusion_xl) checkpoint:
|
||||
|
||||
```python
|
||||
from diffusers import DiffusionPipeline
|
||||
@@ -36,21 +34,18 @@ pipe_id = "stabilityai/stable-diffusion-xl-base-1.0"
|
||||
pipe = DiffusionPipeline.from_pretrained(pipe_id, torch_dtype=torch.float16).to("cuda")
|
||||
```
|
||||
|
||||
|
||||
Next, load a LoRA checkpoint with the [`~diffusers.loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] method.
|
||||
|
||||
With the 🤗 PEFT integration, you can assign a specific `adapter_name` to the checkpoint, which let's you easily switch between different LoRA checkpoints. Let's call this adapter `"toy"`.
|
||||
Next, load a [CiroN2022/toy-face](https://huggingface.co/CiroN2022/toy-face) adapter with the [`~diffusers.loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] method. With the 🤗 PEFT integration, you can assign a specific `adapter_name` to the checkpoint, which let's you easily switch between different LoRA checkpoints. Let's call this adapter `"toy"`.
|
||||
|
||||
```python
|
||||
pipe.load_lora_weights("CiroN2022/toy-face", weight_name="toy_face_sdxl.safetensors", adapter_name="toy")
|
||||
```
|
||||
|
||||
And then perform inference:
|
||||
Make sure to include the token `toy_face` in the prompt and then you can perform inference:
|
||||
|
||||
```python
|
||||
prompt = "toy_face of a hacker with a hoodie"
|
||||
|
||||
lora_scale= 0.9
|
||||
lora_scale = 0.9
|
||||
image = pipe(
|
||||
prompt, num_inference_steps=30, cross_attention_kwargs={"scale": lora_scale}, generator=torch.manual_seed(0)
|
||||
).images[0]
|
||||
@@ -59,17 +54,16 @@ image
|
||||
|
||||

|
||||
|
||||
With the `adapter_name` parameter, it is really easy to use another adapter for inference! Load the [nerijs/pixel-art-xl](https://huggingface.co/nerijs/pixel-art-xl) adapter that has been fine-tuned to generate pixel art images and call it `"pixel"`.
|
||||
|
||||
With the `adapter_name` parameter, it is really easy to use another adapter for inference! Load the [nerijs/pixel-art-xl](https://huggingface.co/nerijs/pixel-art-xl) adapter that has been fine-tuned to generate pixel art images, and let's call it `"pixel"`.
|
||||
|
||||
The pipeline automatically sets the first loaded adapter (`"toy"`) as the active adapter. But you can activate the `"pixel"` adapter with the [`~diffusers.loaders.UNet2DConditionLoadersMixin.set_adapters`] method as shown below:
|
||||
The pipeline automatically sets the first loaded adapter (`"toy"`) as the active adapter, but you can activate the `"pixel"` adapter with the [`~diffusers.loaders.UNet2DConditionLoadersMixin.set_adapters`] method:
|
||||
|
||||
```python
|
||||
pipe.load_lora_weights("nerijs/pixel-art-xl", weight_name="pixel-art-xl.safetensors", adapter_name="pixel")
|
||||
pipe.set_adapters("pixel")
|
||||
```
|
||||
|
||||
Let's now generate an image with the second adapter and check the result:
|
||||
Make sure you include the token `pixel art` in your prompt to generate a pixel art image:
|
||||
|
||||
```python
|
||||
prompt = "a hacker with a hoodie, pixel art"
|
||||
@@ -81,29 +75,25 @@ image
|
||||
|
||||

|
||||
|
||||
## Combine multiple adapters
|
||||
## Merge adapters
|
||||
|
||||
You can also perform multi-adapter inference where you combine different adapter checkpoints for inference.
|
||||
You can also merge different adapter checkpoints for inference to blend their styles together.
|
||||
|
||||
Once again, use the [`~diffusers.loaders.UNet2DConditionLoadersMixin.set_adapters`] method to activate two LoRA checkpoints and specify the weight for how the checkpoints should be combined.
|
||||
Once again, use the [`~diffusers.loaders.UNet2DConditionLoadersMixin.set_adapters`] method to activate the `pixel` and `toy` adapters and specify the weights for how they should be merged.
|
||||
|
||||
```python
|
||||
pipe.set_adapters(["pixel", "toy"], adapter_weights=[0.5, 1.0])
|
||||
```
|
||||
|
||||
Now that we have set these two adapters, let's generate an image from the combined adapters!
|
||||
|
||||
<Tip>
|
||||
|
||||
LoRA checkpoints in the diffusion community are almost always obtained with [DreamBooth](https://huggingface.co/docs/diffusers/main/en/training/dreambooth). DreamBooth training often relies on "trigger" words in the input text prompts in order for the generation results to look as expected. When you combine multiple LoRA checkpoints, it's important to ensure the trigger words for the corresponding LoRA checkpoints are present in the input text prompts.
|
||||
|
||||
</Tip>
|
||||
|
||||
The trigger words for [CiroN2022/toy-face](https://hf.co/CiroN2022/toy-face) and [nerijs/pixel-art-xl](https://hf.co/nerijs/pixel-art-xl) are found in their repositories.
|
||||
|
||||
Remember to use the trigger words for [CiroN2022/toy-face](https://hf.co/CiroN2022/toy-face) and [nerijs/pixel-art-xl](https://hf.co/nerijs/pixel-art-xl) (these are found in their repositories) in the prompt to generate an image.
|
||||
|
||||
```python
|
||||
# Notice how the prompt is constructed.
|
||||
prompt = "toy_face of a hacker with a hoodie, pixel art"
|
||||
image = pipe(
|
||||
prompt, num_inference_steps=30, cross_attention_kwargs={"scale": 1.0}, generator=torch.manual_seed(0)
|
||||
@@ -113,43 +103,95 @@ image
|
||||
|
||||

|
||||
|
||||
Impressive! As you can see, the model was able to generate an image that mixes the characteristics of both adapters.
|
||||
Impressive! As you can see, the model generated an image that mixed the characteristics of both adapters.
|
||||
|
||||
If you want to go back to using only one adapter, use the [`~diffusers.loaders.UNet2DConditionLoadersMixin.set_adapters`] method to activate the `"toy"` adapter:
|
||||
> [!TIP]
|
||||
> Through its PEFT integration, Diffusers also offers more efficient merging methods which you can learn about in the [Merge LoRAs](../using-diffusers/merge_loras) guide!
|
||||
|
||||
To return to only using one adapter, use the [`~diffusers.loaders.UNet2DConditionLoadersMixin.set_adapters`] method to activate the `"toy"` adapter:
|
||||
|
||||
```python
|
||||
# First, set the adapter.
|
||||
pipe.set_adapters("toy")
|
||||
|
||||
# Then, run inference.
|
||||
prompt = "toy_face of a hacker with a hoodie"
|
||||
lora_scale= 0.9
|
||||
lora_scale = 0.9
|
||||
image = pipe(
|
||||
prompt, num_inference_steps=30, cross_attention_kwargs={"scale": lora_scale}, generator=torch.manual_seed(0)
|
||||
).images[0]
|
||||
image
|
||||
```
|
||||
|
||||

|
||||
|
||||
|
||||
If you want to switch to only the base model, disable all LoRAs with the [`~diffusers.loaders.UNet2DConditionLoadersMixin.disable_lora`] method.
|
||||
|
||||
Or to disable all adapters entirely, use the [`~diffusers.loaders.UNet2DConditionLoadersMixin.disable_lora`] method to return the base model.
|
||||
|
||||
```python
|
||||
pipe.disable_lora()
|
||||
|
||||
prompt = "toy_face of a hacker with a hoodie"
|
||||
lora_scale= 0.9
|
||||
image = pipe(prompt, num_inference_steps=30, generator=torch.manual_seed(0)).images[0]
|
||||
image
|
||||
```
|
||||
|
||||

|
||||
|
||||
## Monitoring active adapters
|
||||
### Customize adapters strength
|
||||
For even more customization, you can control how strongly the adapter affects each part of the pipeline. For this, pass a dictionary with the control strengths (called "scales") to [`~diffusers.loaders.UNet2DConditionLoadersMixin.set_adapters`].
|
||||
|
||||
You have attached multiple adapters in this tutorial, and if you're feeling a bit lost on what adapters have been attached to the pipeline's components, you can easily check the list of active adapters using the [`~diffusers.loaders.LoraLoaderMixin.get_active_adapters`] method:
|
||||
For example, here's how you can turn on the adapter for the `down` parts, but turn it off for the `mid` and `up` parts:
|
||||
```python
|
||||
pipe.enable_lora() # enable lora again, after we disabled it above
|
||||
prompt = "toy_face of a hacker with a hoodie, pixel art"
|
||||
adapter_weight_scales = { "unet": { "down": 1, "mid": 0, "up": 0} }
|
||||
pipe.set_adapters("pixel", adapter_weight_scales)
|
||||
image = pipe(prompt, num_inference_steps=30, generator=torch.manual_seed(0)).images[0]
|
||||
image
|
||||
```
|
||||
|
||||

|
||||
|
||||
Let's see how turning off the `down` part and turning on the `mid` and `up` part respectively changes the image.
|
||||
```python
|
||||
adapter_weight_scales = { "unet": { "down": 0, "mid": 1, "up": 0} }
|
||||
pipe.set_adapters("pixel", adapter_weight_scales)
|
||||
image = pipe(prompt, num_inference_steps=30, generator=torch.manual_seed(0)).images[0]
|
||||
image
|
||||
```
|
||||
|
||||

|
||||
|
||||
```python
|
||||
adapter_weight_scales = { "unet": { "down": 0, "mid": 0, "up": 1} }
|
||||
pipe.set_adapters("pixel", adapter_weight_scales)
|
||||
image = pipe(prompt, num_inference_steps=30, generator=torch.manual_seed(0)).images[0]
|
||||
image
|
||||
```
|
||||
|
||||

|
||||
|
||||
Looks cool!
|
||||
|
||||
This is a really powerful feature. You can use it to control the adapter strengths down to per-transformer level. And you can even use it for multiple adapters.
|
||||
```python
|
||||
adapter_weight_scales_toy = 0.5
|
||||
adapter_weight_scales_pixel = {
|
||||
"unet": {
|
||||
"down": 0.9, # all transformers in the down-part will use scale 0.9
|
||||
# "mid" # because, in this example, "mid" is not given, all transformers in the mid part will use the default scale 1.0
|
||||
"up": {
|
||||
"block_0": 0.6, # all 3 transformers in the 0th block in the up-part will use scale 0.6
|
||||
"block_1": [0.4, 0.8, 1.0], # the 3 transformers in the 1st block in the up-part will use scales 0.4, 0.8 and 1.0 respectively
|
||||
}
|
||||
}
|
||||
}
|
||||
pipe.set_adapters(["toy", "pixel"], [adapter_weight_scales_toy, adapter_weight_scales_pixel])
|
||||
image = pipe(prompt, num_inference_steps=30, generator=torch.manual_seed(0)).images[0]
|
||||
image
|
||||
```
|
||||
|
||||

|
||||
|
||||
## Manage active adapters
|
||||
|
||||
You have attached multiple adapters in this tutorial, and if you're feeling a bit lost on what adapters have been attached to the pipeline's components, use the [`~diffusers.loaders.LoraLoaderMixin.get_active_adapters`] method to check the list of active adapters:
|
||||
|
||||
```py
|
||||
active_adapters = pipe.get_active_adapters()
|
||||
@@ -164,55 +206,3 @@ list_adapters_component_wise = pipe.get_list_adapters()
|
||||
list_adapters_component_wise
|
||||
{"text_encoder": ["toy", "pixel"], "unet": ["toy", "pixel"], "text_encoder_2": ["toy", "pixel"]}
|
||||
```
|
||||
|
||||
## Fusing adapters into the model
|
||||
|
||||
You can use PEFT to easily fuse/unfuse multiple adapters directly into the model weights (both UNet and text encoder) using the [`~diffusers.loaders.LoraLoaderMixin.fuse_lora`] method, which can lead to a speed-up in inference and lower VRAM usage.
|
||||
|
||||
```py
|
||||
pipe.load_lora_weights("nerijs/pixel-art-xl", weight_name="pixel-art-xl.safetensors", adapter_name="pixel")
|
||||
pipe.load_lora_weights("CiroN2022/toy-face", weight_name="toy_face_sdxl.safetensors", adapter_name="toy")
|
||||
|
||||
pipe.set_adapters(["pixel", "toy"], adapter_weights=[0.5, 1.0])
|
||||
# Fuses the LoRAs into the Unet
|
||||
pipe.fuse_lora()
|
||||
|
||||
prompt = "toy_face of a hacker with a hoodie, pixel art"
|
||||
image = pipe(prompt, num_inference_steps=30, generator=torch.manual_seed(0)).images[0]
|
||||
|
||||
# Gets the Unet back to the original state
|
||||
pipe.unfuse_lora()
|
||||
```
|
||||
|
||||
You can also fuse some adapters using `adapter_names` for faster generation:
|
||||
|
||||
```py
|
||||
pipe.load_lora_weights("nerijs/pixel-art-xl", weight_name="pixel-art-xl.safetensors", adapter_name="pixel")
|
||||
pipe.load_lora_weights("CiroN2022/toy-face", weight_name="toy_face_sdxl.safetensors", adapter_name="toy")
|
||||
|
||||
pipe.set_adapters(["pixel"], adapter_weights=[0.5, 1.0])
|
||||
# Fuses the LoRAs into the Unet
|
||||
pipe.fuse_lora(adapter_names=["pixel"])
|
||||
|
||||
prompt = "a hacker with a hoodie, pixel art"
|
||||
image = pipe(prompt, num_inference_steps=30, generator=torch.manual_seed(0)).images[0]
|
||||
|
||||
# Gets the Unet back to the original state
|
||||
pipe.unfuse_lora()
|
||||
|
||||
# Fuse all adapters
|
||||
pipe.fuse_lora(adapter_names=["pixel", "toy"])
|
||||
|
||||
prompt = "toy_face of a hacker with a hoodie, pixel art"
|
||||
image = pipe(prompt, num_inference_steps=30, generator=torch.manual_seed(0)).images[0]
|
||||
```
|
||||
|
||||
## Saving a pipeline after fusing the adapters
|
||||
|
||||
To properly save a pipeline after it's been loaded with the adapters, it should be serialized like so:
|
||||
|
||||
```python
|
||||
pipe.fuse_lora(lora_scale=1.0)
|
||||
pipe.unload_lora_weights()
|
||||
pipe.save_pretrained("path-to-pipeline")
|
||||
```
|
||||
|
||||
@@ -12,13 +12,18 @@ specific language governing permissions and limitations under the License.
|
||||
|
||||
# Pipeline callbacks
|
||||
|
||||
The denoising loop of a pipeline can be modified with custom defined functions using the `callback_on_step_end` parameter. This can be really useful for *dynamically* adjusting certain pipeline attributes, or modifying tensor variables. The flexibility of callbacks opens up some interesting use-cases such as changing the prompt embeddings at each timestep, assigning different weights to the prompt embeddings, and editing the guidance scale.
|
||||
The denoising loop of a pipeline can be modified with custom defined functions using the `callback_on_step_end` parameter. The callback function is executed at the end of each step, and modifies the pipeline attributes and variables for the next step. This is really useful for *dynamically* adjusting certain pipeline attributes or modifying tensor variables. This versatility allows for interesting use-cases such as changing the prompt embeddings at each timestep, assigning different weights to the prompt embeddings, and editing the guidance scale. With callbacks, you can implement new features without modifying the underlying code!
|
||||
|
||||
This guide will show you how to use the `callback_on_step_end` parameter to disable classifier-free guidance (CFG) after 40% of the inference steps to save compute with minimal cost to performance.
|
||||
> [!TIP]
|
||||
> 🤗 Diffusers currently only supports `callback_on_step_end`, but feel free to open a [feature request](https://github.com/huggingface/diffusers/issues/new/choose) if you have a cool use-case and require a callback function with a different execution point!
|
||||
|
||||
The callback function should have the following arguments:
|
||||
This guide will demonstrate how callbacks work by a few features you can implement with them.
|
||||
|
||||
* `pipe` (or the pipeline instance) provides access to useful properties such as `num_timesteps` and `guidance_scale`. You can modify these properties by updating the underlying attributes. For this example, you'll disable CFG by setting `pipe._guidance_scale=0.0`.
|
||||
## Dynamic classifier-free guidance
|
||||
|
||||
Dynamic classifier-free guidance (CFG) is a feature that allows you to disable CFG after a certain number of inference steps which can help you save compute with minimal cost to performance. The callback function for this should have the following arguments:
|
||||
|
||||
* `pipeline` (or the pipeline instance) provides access to important properties such as `num_timesteps` and `guidance_scale`. You can modify these properties by updating the underlying attributes. For this example, you'll disable CFG by setting `pipeline._guidance_scale=0.0`.
|
||||
* `step_index` and `timestep` tell you where you are in the denoising loop. Use `step_index` to turn off CFG after reaching 40% of `num_timesteps`.
|
||||
* `callback_kwargs` is a dict that contains tensor variables you can modify during the denoising loop. It only includes variables specified in the `callback_on_step_end_tensor_inputs` argument, which is passed to the pipeline's `__call__` method. Different pipelines may use different sets of variables, so please check a pipeline's `_callback_tensor_inputs` attribute for the list of variables you can modify. Some common variables include `latents` and `prompt_embeds`. For this function, change the batch size of `prompt_embeds` after setting `guidance_scale=0.0` in order for it to work properly.
|
||||
|
||||
@@ -27,13 +32,13 @@ Your callback function should look something like this:
|
||||
```python
|
||||
def callback_dynamic_cfg(pipe, step_index, timestep, callback_kwargs):
|
||||
# adjust the batch_size of prompt_embeds according to guidance_scale
|
||||
if step_index == int(pipe.num_timesteps * 0.4):
|
||||
if step_index == int(pipeline.num_timesteps * 0.4):
|
||||
prompt_embeds = callback_kwargs["prompt_embeds"]
|
||||
prompt_embeds = prompt_embeds.chunk(2)[-1]
|
||||
|
||||
# update guidance_scale and prompt_embeds
|
||||
pipe._guidance_scale = 0.0
|
||||
callback_kwargs["prompt_embeds"] = prompt_embeds
|
||||
# update guidance_scale and prompt_embeds
|
||||
pipeline._guidance_scale = 0.0
|
||||
callback_kwargs["prompt_embeds"] = prompt_embeds
|
||||
return callback_kwargs
|
||||
```
|
||||
|
||||
@@ -43,58 +48,134 @@ Now, you can pass the callback function to the `callback_on_step_end` parameter
|
||||
import torch
|
||||
from diffusers import StableDiffusionPipeline
|
||||
|
||||
pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16)
|
||||
pipe = pipe.to("cuda")
|
||||
pipeline = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16)
|
||||
pipeline = pipeline.to("cuda")
|
||||
|
||||
prompt = "a photo of an astronaut riding a horse on mars"
|
||||
|
||||
generator = torch.Generator(device="cuda").manual_seed(1)
|
||||
out = pipe(prompt, generator=generator, callback_on_step_end=callback_dynamic_cfg, callback_on_step_end_tensor_inputs=['prompt_embeds'])
|
||||
out = pipeline(
|
||||
prompt,
|
||||
generator=generator,
|
||||
callback_on_step_end=callback_dynamic_cfg,
|
||||
callback_on_step_end_tensor_inputs=['prompt_embeds']
|
||||
)
|
||||
|
||||
out.images[0].save("out_custom_cfg.png")
|
||||
```
|
||||
|
||||
The callback function is executed at the end of each denoising step, and modifies the pipeline attributes and tensor variables for the next denoising step.
|
||||
|
||||
With callbacks, you can implement features such as dynamic CFG without having to modify the underlying code at all!
|
||||
|
||||
<Tip>
|
||||
|
||||
🤗 Diffusers currently only supports `callback_on_step_end`, but feel free to open a [feature request](https://github.com/huggingface/diffusers/issues/new/choose) if you have a cool use-case and require a callback function with a different execution point!
|
||||
|
||||
</Tip>
|
||||
|
||||
## Interrupt the diffusion process
|
||||
|
||||
Interrupting the diffusion process is particularly useful when building UIs that work with Diffusers because it allows users to stop the generation process if they're unhappy with the intermediate results. You can incorporate this into your pipeline with a callback.
|
||||
> [!TIP]
|
||||
> The interruption callback is supported for text-to-image, image-to-image, and inpainting for the [StableDiffusionPipeline](../api/pipelines/stable_diffusion/overview) and [StableDiffusionXLPipeline](../api/pipelines/stable_diffusion/stable_diffusion_xl).
|
||||
|
||||
<Tip>
|
||||
Stopping the diffusion process early is useful when building UIs that work with Diffusers because it allows users to stop the generation process if they're unhappy with the intermediate results. You can incorporate this into your pipeline with a callback.
|
||||
|
||||
The interruption callback is supported for text-to-image, image-to-image, and inpainting for the [StableDiffusionPipeline](../api/pipelines/stable_diffusion/overview) and [StableDiffusionXLPipeline](../api/pipelines/stable_diffusion/stable_diffusion_xl).
|
||||
|
||||
</Tip>
|
||||
|
||||
This callback function should take the following arguments: `pipe`, `i`, `t`, and `callback_kwargs` (this must be returned). Set the pipeline's `_interrupt` attribute to `True` to stop the diffusion process after a certain number of steps. You are also free to implement your own custom stopping logic inside the callback.
|
||||
This callback function should take the following arguments: `pipeline`, `i`, `t`, and `callback_kwargs` (this must be returned). Set the pipeline's `_interrupt` attribute to `True` to stop the diffusion process after a certain number of steps. You are also free to implement your own custom stopping logic inside the callback.
|
||||
|
||||
In this example, the diffusion process is stopped after 10 steps even though `num_inference_steps` is set to 50.
|
||||
|
||||
```python
|
||||
from diffusers import StableDiffusionPipeline
|
||||
|
||||
pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
|
||||
pipe.enable_model_cpu_offload()
|
||||
pipeline = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
|
||||
pipeline.enable_model_cpu_offload()
|
||||
num_inference_steps = 50
|
||||
|
||||
def interrupt_callback(pipe, i, t, callback_kwargs):
|
||||
def interrupt_callback(pipeline, i, t, callback_kwargs):
|
||||
stop_idx = 10
|
||||
if i == stop_idx:
|
||||
pipe._interrupt = True
|
||||
pipeline._interrupt = True
|
||||
|
||||
return callback_kwargs
|
||||
|
||||
pipe(
|
||||
pipeline(
|
||||
"A photo of a cat",
|
||||
num_inference_steps=num_inference_steps,
|
||||
callback_on_step_end=interrupt_callback,
|
||||
)
|
||||
```
|
||||
|
||||
## Display image after each generation step
|
||||
|
||||
> [!TIP]
|
||||
> This tip was contributed by [asomoza](https://github.com/asomoza).
|
||||
|
||||
Display an image after each generation step by accessing and converting the latents after each step into an image. The latent space is compressed to 128x128, so the images are also 128x128 which is useful for a quick preview.
|
||||
|
||||
1. Use the function below to convert the SDXL latents (4 channels) to RGB tensors (3 channels) as explained in the [Explaining the SDXL latent space](https://huggingface.co/blog/TimothyAlexisVass/explaining-the-sdxl-latent-space) blog post.
|
||||
|
||||
```py
|
||||
def latents_to_rgb(latents):
|
||||
weights = (
|
||||
(60, -60, 25, -70),
|
||||
(60, -5, 15, -50),
|
||||
(60, 10, -5, -35)
|
||||
)
|
||||
|
||||
weights_tensor = torch.t(torch.tensor(weights, dtype=latents.dtype).to(latents.device))
|
||||
biases_tensor = torch.tensor((150, 140, 130), dtype=latents.dtype).to(latents.device)
|
||||
rgb_tensor = torch.einsum("...lxy,lr -> ...rxy", latents, weights_tensor) + biases_tensor.unsqueeze(-1).unsqueeze(-1)
|
||||
image_array = rgb_tensor.clamp(0, 255)[0].byte().cpu().numpy()
|
||||
image_array = image_array.transpose(1, 2, 0)
|
||||
|
||||
return Image.fromarray(image_array)
|
||||
```
|
||||
|
||||
2. Create a function to decode and save the latents into an image.
|
||||
|
||||
```py
|
||||
def decode_tensors(pipe, step, timestep, callback_kwargs):
|
||||
latents = callback_kwargs["latents"]
|
||||
|
||||
image = latents_to_rgb(latents)
|
||||
image.save(f"{step}.png")
|
||||
|
||||
return callback_kwargs
|
||||
```
|
||||
|
||||
3. Pass the `decode_tensors` function to the `callback_on_step_end` parameter to decode the tensors after each step. You also need to specify what you want to modify in the `callback_on_step_end_tensor_inputs` parameter, which in this case are the latents.
|
||||
|
||||
```py
|
||||
from diffusers import AutoPipelineForText2Image
|
||||
import torch
|
||||
from PIL import Image
|
||||
|
||||
pipeline = AutoPipelineForText2Image.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0",
|
||||
torch_dtype=torch.float16,
|
||||
variant="fp16",
|
||||
use_safetensors=True
|
||||
).to("cuda")
|
||||
|
||||
image = pipeline(
|
||||
prompt="A croissant shaped like a cute bear.",
|
||||
negative_prompt="Deformed, ugly, bad anatomy",
|
||||
callback_on_step_end=decode_tensors,
|
||||
callback_on_step_end_tensor_inputs=["latents"],
|
||||
).images[0]
|
||||
```
|
||||
|
||||
<div class="flex gap-4 justify-center">
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/tips_step_0.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">step 0</figcaption>
|
||||
</div>
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/tips_step_19.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">step 19
|
||||
</figcaption>
|
||||
</div>
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/tips_step_29.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">step 29</figcaption>
|
||||
</div>
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/tips_step_39.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">step 39</figcaption>
|
||||
</div>
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/tips_step_49.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">step 49</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
@@ -1,184 +0,0 @@
|
||||
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Contribute a community pipeline
|
||||
|
||||
<Tip>
|
||||
|
||||
💡 Take a look at GitHub Issue [#841](https://github.com/huggingface/diffusers/issues/841) for more context about why we're adding community pipelines to help everyone easily share their work without being slowed down.
|
||||
|
||||
</Tip>
|
||||
|
||||
Community pipelines allow you to add any additional features you'd like on top of the [`DiffusionPipeline`]. The main benefit of building on top of the `DiffusionPipeline` is anyone can load and use your pipeline by only adding one more argument, making it super easy for the community to access.
|
||||
|
||||
This guide will show you how to create a community pipeline and explain how they work. To keep things simple, you'll create a "one-step" pipeline where the `UNet` does a single forward pass and calls the scheduler once.
|
||||
|
||||
## Initialize the pipeline
|
||||
|
||||
You should start by creating a `one_step_unet.py` file for your community pipeline. In this file, create a pipeline class that inherits from the [`DiffusionPipeline`] to be able to load model weights and the scheduler configuration from the Hub. The one-step pipeline needs a `UNet` and a scheduler, so you'll need to add these as arguments to the `__init__` function:
|
||||
|
||||
```python
|
||||
from diffusers import DiffusionPipeline
|
||||
import torch
|
||||
|
||||
class UnetSchedulerOneForwardPipeline(DiffusionPipeline):
|
||||
def __init__(self, unet, scheduler):
|
||||
super().__init__()
|
||||
```
|
||||
|
||||
To ensure your pipeline and its components (`unet` and `scheduler`) can be saved with [`~DiffusionPipeline.save_pretrained`], add them to the `register_modules` function:
|
||||
|
||||
```diff
|
||||
from diffusers import DiffusionPipeline
|
||||
import torch
|
||||
|
||||
class UnetSchedulerOneForwardPipeline(DiffusionPipeline):
|
||||
def __init__(self, unet, scheduler):
|
||||
super().__init__()
|
||||
|
||||
+ self.register_modules(unet=unet, scheduler=scheduler)
|
||||
```
|
||||
|
||||
Cool, the `__init__` step is done and you can move to the forward pass now! 🔥
|
||||
|
||||
## Define the forward pass
|
||||
|
||||
In the forward pass, which we recommend defining as `__call__`, you have complete creative freedom to add whatever feature you'd like. For our amazing one-step pipeline, create a random image and only call the `unet` and `scheduler` once by setting `timestep=1`:
|
||||
|
||||
```diff
|
||||
from diffusers import DiffusionPipeline
|
||||
import torch
|
||||
|
||||
class UnetSchedulerOneForwardPipeline(DiffusionPipeline):
|
||||
def __init__(self, unet, scheduler):
|
||||
super().__init__()
|
||||
|
||||
self.register_modules(unet=unet, scheduler=scheduler)
|
||||
|
||||
+ def __call__(self):
|
||||
+ image = torch.randn(
|
||||
+ (1, self.unet.config.in_channels, self.unet.config.sample_size, self.unet.config.sample_size),
|
||||
+ )
|
||||
+ timestep = 1
|
||||
|
||||
+ model_output = self.unet(image, timestep).sample
|
||||
+ scheduler_output = self.scheduler.step(model_output, timestep, image).prev_sample
|
||||
|
||||
+ return scheduler_output
|
||||
```
|
||||
|
||||
That's it! 🚀 You can now run this pipeline by passing a `unet` and `scheduler` to it:
|
||||
|
||||
```python
|
||||
from diffusers import DDPMScheduler, UNet2DModel
|
||||
|
||||
scheduler = DDPMScheduler()
|
||||
unet = UNet2DModel()
|
||||
|
||||
pipeline = UnetSchedulerOneForwardPipeline(unet=unet, scheduler=scheduler)
|
||||
|
||||
output = pipeline()
|
||||
```
|
||||
|
||||
But what's even better is you can load pre-existing weights into the pipeline if the pipeline structure is identical. For example, you can load the [`google/ddpm-cifar10-32`](https://huggingface.co/google/ddpm-cifar10-32) weights into the one-step pipeline:
|
||||
|
||||
```python
|
||||
pipeline = UnetSchedulerOneForwardPipeline.from_pretrained("google/ddpm-cifar10-32", use_safetensors=True)
|
||||
|
||||
output = pipeline()
|
||||
```
|
||||
|
||||
## Share your pipeline
|
||||
|
||||
Open a Pull Request on the 🧨 Diffusers [repository](https://github.com/huggingface/diffusers) to add your awesome pipeline in `one_step_unet.py` to the [examples/community](https://github.com/huggingface/diffusers/tree/main/examples/community) subfolder.
|
||||
|
||||
Once it is merged, anyone with `diffusers >= 0.4.0` installed can use this pipeline magically 🪄 by specifying it in the `custom_pipeline` argument:
|
||||
|
||||
```python
|
||||
from diffusers import DiffusionPipeline
|
||||
|
||||
pipe = DiffusionPipeline.from_pretrained(
|
||||
"google/ddpm-cifar10-32", custom_pipeline="one_step_unet", use_safetensors=True
|
||||
)
|
||||
pipe()
|
||||
```
|
||||
|
||||
Another way to share your community pipeline is to upload the `one_step_unet.py` file directly to your preferred [model repository](https://huggingface.co/docs/hub/models-uploading) on the Hub. Instead of specifying the `one_step_unet.py` file, pass the model repository id to the `custom_pipeline` argument:
|
||||
|
||||
```python
|
||||
from diffusers import DiffusionPipeline
|
||||
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
"google/ddpm-cifar10-32", custom_pipeline="stevhliu/one_step_unet", use_safetensors=True
|
||||
)
|
||||
```
|
||||
|
||||
Take a look at the following table to compare the two sharing workflows to help you decide the best option for you:
|
||||
|
||||
| | GitHub community pipeline | HF Hub community pipeline |
|
||||
|----------------|------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------|
|
||||
| usage | same | same |
|
||||
| review process | open a Pull Request on GitHub and undergo a review process from the Diffusers team before merging; may be slower | upload directly to a Hub repository without any review; this is the fastest workflow |
|
||||
| visibility | included in the official Diffusers repository and documentation | included on your HF Hub profile and relies on your own usage/promotion to gain visibility |
|
||||
|
||||
<Tip>
|
||||
|
||||
💡 You can use whatever package you want in your community pipeline file - as long as the user has it installed, everything will work fine. Make sure you have one and only one pipeline class that inherits from `DiffusionPipeline` because this is automatically detected.
|
||||
|
||||
</Tip>
|
||||
|
||||
## How do community pipelines work?
|
||||
|
||||
A community pipeline is a class that inherits from [`DiffusionPipeline`] which means:
|
||||
|
||||
- It can be loaded with the [`custom_pipeline`] argument.
|
||||
- The model weights and scheduler configuration are loaded from [`pretrained_model_name_or_path`].
|
||||
- The code that implements a feature in the community pipeline is defined in a `pipeline.py` file.
|
||||
|
||||
Sometimes you can't load all the pipeline components weights from an official repository. In this case, the other components should be passed directly to the pipeline:
|
||||
|
||||
```python
|
||||
from diffusers import DiffusionPipeline
|
||||
from transformers import CLIPImageProcessor, CLIPModel
|
||||
|
||||
model_id = "CompVis/stable-diffusion-v1-4"
|
||||
clip_model_id = "laion/CLIP-ViT-B-32-laion2B-s34B-b79K"
|
||||
|
||||
feature_extractor = CLIPImageProcessor.from_pretrained(clip_model_id)
|
||||
clip_model = CLIPModel.from_pretrained(clip_model_id, torch_dtype=torch.float16)
|
||||
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
model_id,
|
||||
custom_pipeline="clip_guided_stable_diffusion",
|
||||
clip_model=clip_model,
|
||||
feature_extractor=feature_extractor,
|
||||
scheduler=scheduler,
|
||||
torch_dtype=torch.float16,
|
||||
use_safetensors=True,
|
||||
)
|
||||
```
|
||||
|
||||
The magic behind community pipelines is contained in the following code. It allows the community pipeline to be loaded from GitHub or the Hub, and it'll be available to all 🧨 Diffusers packages.
|
||||
|
||||
```python
|
||||
# 2. Load the pipeline class, if using custom module then load it from the Hub
|
||||
# if we load from explicit class, let's use it
|
||||
if custom_pipeline is not None:
|
||||
pipeline_class = get_class_from_dynamic_module(
|
||||
custom_pipeline, module_file=CUSTOM_PIPELINE_FILE_NAME, cache_dir=custom_pipeline
|
||||
)
|
||||
elif cls != DiffusionPipeline:
|
||||
pipeline_class = cls
|
||||
else:
|
||||
diffusers_module = importlib.import_module(cls.__module__.split(".")[0])
|
||||
pipeline_class = getattr(diffusers_module, config_dict["_class_name"])
|
||||
```
|
||||
@@ -1,58 +0,0 @@
|
||||
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Control image brightness
|
||||
|
||||
The Stable Diffusion pipeline is mediocre at generating images that are either very bright or dark as explained in the [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) paper. The solutions proposed in the paper are currently implemented in the [`DDIMScheduler`] which you can use to improve the lighting in your images.
|
||||
|
||||
<Tip>
|
||||
|
||||
💡 Take a look at the paper linked above for more details about the proposed solutions!
|
||||
|
||||
</Tip>
|
||||
|
||||
One of the solutions is to train a model with *v prediction* and *v loss*. Add the following flag to the [`train_text_to_image.py`](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image.py) or [`train_text_to_image_lora.py`](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image_lora.py) scripts to enable `v_prediction`:
|
||||
|
||||
```bash
|
||||
--prediction_type="v_prediction"
|
||||
```
|
||||
|
||||
For example, let's use the [`ptx0/pseudo-journey-v2`](https://huggingface.co/ptx0/pseudo-journey-v2) checkpoint which has been finetuned with `v_prediction`.
|
||||
|
||||
Next, configure the following parameters in the [`DDIMScheduler`]:
|
||||
|
||||
1. `rescale_betas_zero_snr=True`, rescales the noise schedule to zero terminal signal-to-noise ratio (SNR)
|
||||
2. `timestep_spacing="trailing"`, starts sampling from the last timestep
|
||||
|
||||
```py
|
||||
from diffusers import DiffusionPipeline, DDIMScheduler
|
||||
|
||||
pipeline = DiffusionPipeline.from_pretrained("ptx0/pseudo-journey-v2", use_safetensors=True)
|
||||
|
||||
# switch the scheduler in the pipeline to use the DDIMScheduler
|
||||
pipeline.scheduler = DDIMScheduler.from_config(
|
||||
pipeline.scheduler.config, rescale_betas_zero_snr=True, timestep_spacing="trailing"
|
||||
)
|
||||
pipeline.to("cuda")
|
||||
```
|
||||
|
||||
Finally, in your call to the pipeline, set `guidance_rescale` to prevent overexposure:
|
||||
|
||||
```py
|
||||
prompt = "A lion in galaxies, spirals, nebulae, stars, smoke, iridescent, intricate detail, octane render, 8k"
|
||||
image = pipeline(prompt, guidance_rescale=0.7).images[0]
|
||||
image
|
||||
```
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/zero_snr.png"/>
|
||||
</div>
|
||||
@@ -429,6 +429,27 @@ image = pipe(
|
||||
make_image_grid([original_image, canny_image, image], rows=1, cols=3)
|
||||
```
|
||||
|
||||
<Tip>
|
||||
|
||||
You can use a refiner model with `StableDiffusionXLControlNetPipeline` to improve image quality, just like you can with a regular `StableDiffusionXLPipeline`.
|
||||
See the [Refine image quality](./sdxl#refine-image-quality) section to learn how to use the refiner model.
|
||||
Make sure to use `StableDiffusionXLControlNetPipeline` and pass `image` and `controlnet_conditioning_scale`.
|
||||
|
||||
```py
|
||||
base = StableDiffusionXLControlNetPipeline(...)
|
||||
image = base(
|
||||
prompt=prompt,
|
||||
controlnet_conditioning_scale=0.5,
|
||||
image=canny_image,
|
||||
num_inference_steps=40,
|
||||
denoising_end=0.8,
|
||||
output_type="latent",
|
||||
).images
|
||||
# rest exactly as with StableDiffusionXLPipeline
|
||||
```
|
||||
|
||||
</Tip>
|
||||
|
||||
## MultiControlNet
|
||||
|
||||
<Tip>
|
||||
|
||||
@@ -1,119 +0,0 @@
|
||||
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Community pipelines
|
||||
|
||||
[[open-in-colab]]
|
||||
|
||||
<Tip>
|
||||
|
||||
For more context about the design choices behind community pipelines, please have a look at [this issue](https://github.com/huggingface/diffusers/issues/841).
|
||||
|
||||
</Tip>
|
||||
|
||||
Community pipelines allow you to get creative and build your own unique pipelines to share with the community. You can find all community pipelines in the [diffusers/examples/community](https://github.com/huggingface/diffusers/tree/main/examples/community) folder along with inference and training examples for how to use them. This guide showcases some of the community pipelines and hopefully it'll inspire you to create your own (feel free to open a PR with your own pipeline and we will merge it!).
|
||||
|
||||
To load a community pipeline, use the `custom_pipeline` argument in [`DiffusionPipeline`] to specify one of the files in [diffusers/examples/community](https://github.com/huggingface/diffusers/tree/main/examples/community):
|
||||
|
||||
```py
|
||||
from diffusers import DiffusionPipeline
|
||||
|
||||
pipe = DiffusionPipeline.from_pretrained(
|
||||
"CompVis/stable-diffusion-v1-4", custom_pipeline="filename_in_the_community_folder", use_safetensors=True
|
||||
)
|
||||
```
|
||||
|
||||
If a community pipeline doesn't work as expected, please open a GitHub issue and mention the author.
|
||||
|
||||
You can learn more about community pipelines in the how to [load community pipelines](custom_pipeline_overview) and how to [contribute a community pipeline](contribute_pipeline) guides.
|
||||
|
||||
## Multilingual Stable Diffusion
|
||||
|
||||
The multilingual Stable Diffusion pipeline uses a pretrained [XLM-RoBERTa](https://huggingface.co/papluca/xlm-roberta-base-language-detection) to identify a language and the [mBART-large-50](https://huggingface.co/facebook/mbart-large-50-many-to-one-mmt) model to handle the translation. This allows you to generate images from text in 20 languages.
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import DiffusionPipeline
|
||||
from diffusers.utils import make_image_grid
|
||||
from transformers import (
|
||||
pipeline,
|
||||
MBart50TokenizerFast,
|
||||
MBartForConditionalGeneration,
|
||||
)
|
||||
|
||||
device = "cuda" if torch.cuda.is_available() else "cpu"
|
||||
device_dict = {"cuda": 0, "cpu": -1}
|
||||
|
||||
# add language detection pipeline
|
||||
language_detection_model_ckpt = "papluca/xlm-roberta-base-language-detection"
|
||||
language_detection_pipeline = pipeline("text-classification",
|
||||
model=language_detection_model_ckpt,
|
||||
device=device_dict[device])
|
||||
|
||||
# add model for language translation
|
||||
translation_tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-one-mmt")
|
||||
translation_model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-one-mmt").to(device)
|
||||
|
||||
diffuser_pipeline = DiffusionPipeline.from_pretrained(
|
||||
"CompVis/stable-diffusion-v1-4",
|
||||
custom_pipeline="multilingual_stable_diffusion",
|
||||
detection_pipeline=language_detection_pipeline,
|
||||
translation_model=translation_model,
|
||||
translation_tokenizer=translation_tokenizer,
|
||||
torch_dtype=torch.float16,
|
||||
)
|
||||
|
||||
diffuser_pipeline.enable_attention_slicing()
|
||||
diffuser_pipeline = diffuser_pipeline.to(device)
|
||||
|
||||
prompt = ["a photograph of an astronaut riding a horse",
|
||||
"Una casa en la playa",
|
||||
"Ein Hund, der Orange isst",
|
||||
"Un restaurant parisien"]
|
||||
|
||||
images = diffuser_pipeline(prompt).images
|
||||
make_image_grid(images, rows=2, cols=2)
|
||||
```
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img src="https://user-images.githubusercontent.com/4313860/198328706-295824a4-9856-4ce5-8e66-278ceb42fd29.png"/>
|
||||
</div>
|
||||
|
||||
## MagicMix
|
||||
|
||||
[MagicMix](https://huggingface.co/papers/2210.16056) is a pipeline that can mix an image and text prompt to generate a new image that preserves the image structure. The `mix_factor` determines how much influence the prompt has on the layout generation, `kmin` controls the number of steps during the content generation process, and `kmax` determines how much information is kept in the layout of the original image.
|
||||
|
||||
```py
|
||||
from diffusers import DiffusionPipeline, DDIMScheduler
|
||||
from diffusers.utils import load_image, make_image_grid
|
||||
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
"CompVis/stable-diffusion-v1-4",
|
||||
custom_pipeline="magic_mix",
|
||||
scheduler=DDIMScheduler.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="scheduler"),
|
||||
).to('cuda')
|
||||
|
||||
img = load_image("https://user-images.githubusercontent.com/59410571/209578593-141467c7-d831-4792-8b9a-b17dc5e47816.jpg")
|
||||
mix_img = pipeline(img, prompt="bed", kmin=0.3, kmax=0.5, mix_factor=0.5)
|
||||
make_image_grid([img, mix_img], rows=1, cols=2)
|
||||
```
|
||||
|
||||
<div class="flex gap-4">
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://user-images.githubusercontent.com/59410571/209578593-141467c7-d831-4792-8b9a-b17dc5e47816.jpg" />
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">original image</figcaption>
|
||||
</div>
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://user-images.githubusercontent.com/59410571/209578602-70f323fa-05b7-4dd6-b055-e40683e37914.jpg" />
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">image and text prompt mix</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
@@ -16,17 +16,27 @@ specific language governing permissions and limitations under the License.
|
||||
|
||||
## Community pipelines
|
||||
|
||||
Community pipelines are any [`DiffusionPipeline`] class that are different from the original implementation as specified in their paper (for example, the [`StableDiffusionControlNetPipeline`] corresponds to the [Text-to-Image Generation with ControlNet Conditioning](https://arxiv.org/abs/2302.05543) paper). They provide additional functionality or extend the original implementation of a pipeline.
|
||||
> [!TIP] Take a look at GitHub Issue [#841](https://github.com/huggingface/diffusers/issues/841) for more context about why we're adding community pipelines to help everyone easily share their work without being slowed down.
|
||||
|
||||
There are many cool community pipelines like [Speech to Image](https://github.com/huggingface/diffusers/tree/main/examples/community#speech-to-image) or [Composable Stable Diffusion](https://github.com/huggingface/diffusers/tree/main/examples/community#composable-stable-diffusion), and you can find all the official community pipelines [here](https://github.com/huggingface/diffusers/tree/main/examples/community).
|
||||
Community pipelines are any [`DiffusionPipeline`] class that are different from the original paper implementation (for example, the [`StableDiffusionControlNetPipeline`] corresponds to the [Text-to-Image Generation with ControlNet Conditioning](https://arxiv.org/abs/2302.05543) paper). They provide additional functionality or extend the original implementation of a pipeline.
|
||||
|
||||
To load any community pipeline on the Hub, pass the repository id of the community pipeline to the `custom_pipeline` argument and the model repository where you'd like to load the pipeline weights and components from. For example, the example below loads a dummy pipeline from [`hf-internal-testing/diffusers-dummy-pipeline`](https://huggingface.co/hf-internal-testing/diffusers-dummy-pipeline/blob/main/pipeline.py) and the pipeline weights and components from [`google/ddpm-cifar10-32`](https://huggingface.co/google/ddpm-cifar10-32):
|
||||
There are many cool community pipelines like [Marigold Depth Estimation](https://github.com/huggingface/diffusers/tree/main/examples/community#marigold-depth-estimation) or [InstantID](https://github.com/huggingface/diffusers/tree/main/examples/community#instantid-pipeline), and you can find all the official community pipelines [here](https://github.com/huggingface/diffusers/tree/main/examples/community).
|
||||
|
||||
<Tip warning={true}>
|
||||
There are two types of community pipelines, those stored on the Hugging Face Hub and those stored on Diffusers GitHub repository. Hub pipelines are completely customizable (scheduler, models, pipeline code, etc.) while Diffusers GitHub pipelines are only limited to custom pipeline code.
|
||||
|
||||
🔒 By loading a community pipeline from the Hugging Face Hub, you are trusting that the code you are loading is safe. Make sure to inspect the code online before loading and running it automatically!
|
||||
| | GitHub community pipeline | HF Hub community pipeline |
|
||||
|----------------|------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------|
|
||||
| usage | same | same |
|
||||
| review process | open a Pull Request on GitHub and undergo a review process from the Diffusers team before merging; may be slower | upload directly to a Hub repository without any review; this is the fastest workflow |
|
||||
| visibility | included in the official Diffusers repository and documentation | included on your HF Hub profile and relies on your own usage/promotion to gain visibility |
|
||||
|
||||
</Tip>
|
||||
<hfoptions id="community">
|
||||
<hfoption id="Hub pipelines">
|
||||
|
||||
To load a Hugging Face Hub community pipeline, pass the repository id of the community pipeline to the `custom_pipeline` argument and the model repository where you'd like to load the pipeline weights and components from. For example, the example below loads a dummy pipeline from [hf-internal-testing/diffusers-dummy-pipeline](https://huggingface.co/hf-internal-testing/diffusers-dummy-pipeline/blob/main/pipeline.py) and the pipeline weights and components from [google/ddpm-cifar10-32](https://huggingface.co/google/ddpm-cifar10-32):
|
||||
|
||||
> [!WARNING]
|
||||
> By loading a community pipeline from the Hugging Face Hub, you are trusting that the code you are loading is safe. Make sure to inspect the code online before loading and running it automatically!
|
||||
|
||||
```py
|
||||
from diffusers import DiffusionPipeline
|
||||
@@ -36,7 +46,10 @@ pipeline = DiffusionPipeline.from_pretrained(
|
||||
)
|
||||
```
|
||||
|
||||
Loading an official community pipeline is similar, but you can mix loading weights from an official repository id and pass pipeline components directly. The example below loads the community [CLIP Guided Stable Diffusion](https://github.com/huggingface/diffusers/tree/main/examples/community#clip-guided-stable-diffusion) pipeline, and you can pass the CLIP model components directly to it:
|
||||
</hfoption>
|
||||
<hfoption id="GitHub pipelines">
|
||||
|
||||
To load a GitHub community pipeline, pass the repository id of the community pipeline to the `custom_pipeline` argument and the model repository where you you'd like to load the pipeline weights and components from. You can also load model components directly. The example below loads the community [CLIP Guided Stable Diffusion](https://github.com/huggingface/diffusers/tree/main/examples/community#clip-guided-stable-diffusion) pipeline and the CLIP model components.
|
||||
|
||||
```py
|
||||
from diffusers import DiffusionPipeline
|
||||
@@ -56,9 +69,12 @@ pipeline = DiffusionPipeline.from_pretrained(
|
||||
)
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
### Load from a local file
|
||||
|
||||
Community pipelines can also be loaded from a local file if you pass a file path instead. The path to the passed directory must contain a `pipeline.py` file that contains the pipeline class in order to successfully load it.
|
||||
Community pipelines can also be loaded from a local file if you pass a file path instead. The path to the passed directory must contain a pipeline.py file that contains the pipeline class.
|
||||
|
||||
```py
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
@@ -77,7 +93,7 @@ By default, community pipelines are loaded from the latest stable version of Dif
|
||||
<hfoptions id="version">
|
||||
<hfoption id="main">
|
||||
|
||||
For example, to load from the `main` branch:
|
||||
For example, to load from the main branch:
|
||||
|
||||
```py
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
@@ -93,7 +109,7 @@ pipeline = DiffusionPipeline.from_pretrained(
|
||||
</hfoption>
|
||||
<hfoption id="older version">
|
||||
|
||||
For example, to load from a previous version of Diffusers like `v0.25.0`:
|
||||
For example, to load from a previous version of Diffusers like v0.25.0:
|
||||
|
||||
```py
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
@@ -109,8 +125,140 @@ pipeline = DiffusionPipeline.from_pretrained(
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
### Load with from_pipe
|
||||
|
||||
For more information about community pipelines, take a look at the [Community pipelines](custom_pipeline_examples) guide for how to use them and if you're interested in adding a community pipeline check out the [How to contribute a community pipeline](contribute_pipeline) guide!
|
||||
Community pipelines can also be loaded with the [`~DiffusionPipeline.from_pipe`] method which allows you to load and reuse multiple pipelines without any additional memory overhead (learn more in the [Reuse a pipeline](./loading#reuse-a-pipeline) guide). The memory requirement is determined by the largest single pipeline loaded.
|
||||
|
||||
For example, let's load a community pipeline that supports [long prompts with weighting](https://github.com/huggingface/diffusers/tree/main/examples/community#long-prompt-weighting-stable-diffusion) from a Stable Diffusion pipeline.
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import DiffusionPipeline
|
||||
|
||||
pipe_sd = DiffusionPipeline.from_pretrained("emilianJR/CyberRealistic_V3", torch_dtype=torch.float16)
|
||||
pipe_sd.to("cuda")
|
||||
# load long prompt weighting pipeline
|
||||
pipe_lpw = DiffusionPipeline.from_pipe(
|
||||
pipe_sd,
|
||||
custom_pipeline="lpw_stable_diffusion",
|
||||
).to("cuda")
|
||||
|
||||
prompt = "cat, hiding in the leaves, ((rain)), zazie rainyday, beautiful eyes, macro shot, colorful details, natural lighting, amazing composition, subsurface scattering, amazing textures, filmic, soft light, ultra-detailed eyes, intricate details, detailed texture, light source contrast, dramatic shadows, cinematic light, depth of field, film grain, noise, dark background, hyperrealistic dslr film still, dim volumetric cinematic lighting"
|
||||
neg_prompt = "(deformed iris, deformed pupils, semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime, mutated hands and fingers:1.4), (deformed, distorted, disfigured:1.3), poorly drawn, bad anatomy, wrong anatomy, extra limb, missing limb, floating limbs, disconnected limbs, mutation, mutated, ugly, disgusting, amputation"
|
||||
generator = torch.Generator(device="cpu").manual_seed(20)
|
||||
out_lpw = pipe_lpw(
|
||||
prompt,
|
||||
negative_prompt=neg_prompt,
|
||||
width=512,
|
||||
height=512,
|
||||
max_embeddings_multiples=3,
|
||||
num_inference_steps=50,
|
||||
generator=generator,
|
||||
).images[0]
|
||||
out_lpw
|
||||
```
|
||||
|
||||
<div class="flex gap-4">
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/from_pipe_lpw.png" />
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">Stable Diffusion with long prompt weighting</figcaption>
|
||||
</div>
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/from_pipe_non_lpw.png" />
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">Stable Diffusion</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
## Example community pipelines
|
||||
|
||||
Community pipelines are a really fun and creative way to extend the capabilities of the original pipeline with new and unique features. You can find all community pipelines in the [diffusers/examples/community](https://github.com/huggingface/diffusers/tree/main/examples/community) folder with inference and training examples for how to use them.
|
||||
|
||||
This section showcases a couple of the community pipelines and hopefully it'll inspire you to create your own (feel free to open a PR for your community pipeline and ping us for a review)!
|
||||
|
||||
> [!TIP]
|
||||
> The [`~DiffusionPipeline.from_pipe`] method is particularly useful for loading community pipelines because many of them don't have pretrained weights and add a feature on top of an existing pipeline like Stable Diffusion or Stable Diffusion XL. You can learn more about the [`~DiffusionPipeline.from_pipe`] method in the [Load with from_pipe](custom_pipeline_overview#load-with-from_pipe) section.
|
||||
|
||||
<hfoptions id="community">
|
||||
<hfoption id="Marigold">
|
||||
|
||||
[Marigold](https://marigoldmonodepth.github.io/) is a depth estimation diffusion pipeline that uses the rich existing and inherent visual knowledge in diffusion models. It takes an input image and denoises and decodes it into a depth map. Marigold performs well even on images it hasn't seen before.
|
||||
|
||||
```py
|
||||
import torch
|
||||
from PIL import Image
|
||||
from diffusers import DiffusionPipeline
|
||||
from diffusers.utils import load_image
|
||||
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
"prs-eth/marigold-lcm-v1-0",
|
||||
custom_pipeline="marigold_depth_estimation",
|
||||
torch_dtype=torch.float16,
|
||||
variant="fp16",
|
||||
)
|
||||
|
||||
pipeline.to("cuda")
|
||||
image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/community-marigold.png")
|
||||
output = pipeline(
|
||||
image,
|
||||
denoising_steps=4,
|
||||
ensemble_size=5,
|
||||
processing_res=768,
|
||||
match_input_res=True,
|
||||
batch_size=0,
|
||||
seed=33,
|
||||
color_map="Spectral",
|
||||
show_progress_bar=True,
|
||||
)
|
||||
depth_colored: Image.Image = output.depth_colored
|
||||
depth_colored.save("./depth_colored.png")
|
||||
```
|
||||
|
||||
<div class="flex flex-row gap-4">
|
||||
<div class="flex-1">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/community-marigold.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">original image</figcaption>
|
||||
</div>
|
||||
<div class="flex-1">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/marigold-depth.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">colorized depth image</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="HD-Painter">
|
||||
|
||||
[HD-Painter](https://hf.co/papers/2312.14091) is a high-resolution inpainting pipeline. It introduces a *Prompt-Aware Introverted Attention (PAIntA)* layer to better align a prompt with the area to be inpainted, and *Reweighting Attention Score Guidance (RASG)* to keep the latents more prompt-aligned and within their trained domain to generate realistc images.
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import DiffusionPipeline, DDIMScheduler
|
||||
from diffusers.utils import load_image
|
||||
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
"Lykon/dreamshaper-8-inpainting",
|
||||
custom_pipeline="hd_painter"
|
||||
)
|
||||
pipeline.scheduler = DDIMScheduler.from_config(pipeline.scheduler.config)
|
||||
init_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/hd-painter.jpg")
|
||||
mask_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/hd-painter-mask.png")
|
||||
prompt = "football"
|
||||
image = pipeline(prompt, init_image, mask_image, use_rasg=True, use_painta=True, generator=torch.manual_seed(0)).images[0]
|
||||
image
|
||||
```
|
||||
|
||||
<div class="flex flex-row gap-4">
|
||||
<div class="flex-1">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/hd-painter.jpg"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">original image</figcaption>
|
||||
</div>
|
||||
<div class="flex-1">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/hd-painter-output.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">generated image</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
## Community components
|
||||
|
||||
@@ -118,7 +266,7 @@ Community components allow users to build pipelines that may have customized com
|
||||
|
||||
This section shows how users should use community components to build a community pipeline.
|
||||
|
||||
You'll use the [showlab/show-1-base](https://huggingface.co/showlab/show-1-base) pipeline checkpoint as an example. So, let's start loading the components:
|
||||
You'll use the [showlab/show-1-base](https://huggingface.co/showlab/show-1-base) pipeline checkpoint as an example.
|
||||
|
||||
1. Import and load the text encoder from Transformers:
|
||||
|
||||
@@ -152,17 +300,17 @@ In steps 4 and 5, the custom [UNet](https://github.com/showlab/Show-1/blob/main/
|
||||
|
||||
</Tip>
|
||||
|
||||
4. Now you'll load a [custom UNet](https://github.com/showlab/Show-1/blob/main/showone/models/unet_3d_condition.py), which in this example, has already been implemented in the `showone_unet_3d_condition.py` [script](https://huggingface.co/sayakpaul/show-1-base-with-code/blob/main/unet/showone_unet_3d_condition.py) for your convenience. You'll notice the `UNet3DConditionModel` class name is changed to `ShowOneUNet3DConditionModel` because [`UNet3DConditionModel`] already exists in Diffusers. Any components needed for the `ShowOneUNet3DConditionModel` class should be placed in the `showone_unet_3d_condition.py` script.
|
||||
4. Now you'll load a [custom UNet](https://github.com/showlab/Show-1/blob/main/showone/models/unet_3d_condition.py), which in this example, has already been implemented in [showone_unet_3d_condition.py](https://huggingface.co/sayakpaul/show-1-base-with-code/blob/main/unet/showone_unet_3d_condition.py) for your convenience. You'll notice the [`UNet3DConditionModel`] class name is changed to `ShowOneUNet3DConditionModel` because [`UNet3DConditionModel`] already exists in Diffusers. Any components needed for the `ShowOneUNet3DConditionModel` class should be placed in showone_unet_3d_condition.py.
|
||||
|
||||
Once this is done, you can initialize the UNet:
|
||||
Once this is done, you can initialize the UNet:
|
||||
|
||||
```python
|
||||
from showone_unet_3d_condition import ShowOneUNet3DConditionModel
|
||||
```python
|
||||
from showone_unet_3d_condition import ShowOneUNet3DConditionModel
|
||||
|
||||
unet = ShowOneUNet3DConditionModel.from_pretrained(pipe_id, subfolder="unet")
|
||||
```
|
||||
unet = ShowOneUNet3DConditionModel.from_pretrained(pipe_id, subfolder="unet")
|
||||
```
|
||||
|
||||
5. Finally, you'll load the custom pipeline code. For this example, it has already been created for you in the `pipeline_t2v_base_pixel.py` [script](https://huggingface.co/sayakpaul/show-1-base-with-code/blob/main/pipeline_t2v_base_pixel.py). This script contains a custom `TextToVideoIFPipeline` class for generating videos from text. Just like the custom UNet, any code needed for the custom pipeline to work should go in the `pipeline_t2v_base_pixel.py` script.
|
||||
5. Finally, you'll load the custom pipeline code. For this example, it has already been created for you in [pipeline_t2v_base_pixel.py](https://huggingface.co/sayakpaul/show-1-base-with-code/blob/main/pipeline_t2v_base_pixel.py). This script contains a custom `TextToVideoIFPipeline` class for generating videos from text. Just like the custom UNet, any code needed for the custom pipeline to work should go in pipeline_t2v_base_pixel.py.
|
||||
|
||||
Once everything is in place, you can initialize the `TextToVideoIFPipeline` with the `ShowOneUNet3DConditionModel`:
|
||||
|
||||
@@ -187,13 +335,16 @@ Push the pipeline to the Hub to share with the community!
|
||||
pipeline.push_to_hub("custom-t2v-pipeline")
|
||||
```
|
||||
|
||||
After the pipeline is successfully pushed, you need a couple of changes:
|
||||
After the pipeline is successfully pushed, you need to make a few changes:
|
||||
|
||||
1. Change the `_class_name` attribute in [`model_index.json`](https://huggingface.co/sayakpaul/show-1-base-with-code/blob/main/model_index.json#L2) to `"pipeline_t2v_base_pixel"` and `"TextToVideoIFPipeline"`.
|
||||
2. Upload `showone_unet_3d_condition.py` to the `unet` [directory](https://huggingface.co/sayakpaul/show-1-base-with-code/blob/main/unet/showone_unet_3d_condition.py).
|
||||
3. Upload `pipeline_t2v_base_pixel.py` to the pipeline base [directory](https://huggingface.co/sayakpaul/show-1-base-with-code/blob/main/unet/showone_unet_3d_condition.py).
|
||||
1. Change the `_class_name` attribute in [model_index.json](https://huggingface.co/sayakpaul/show-1-base-with-code/blob/main/model_index.json#L2) to `"pipeline_t2v_base_pixel"` and `"TextToVideoIFPipeline"`.
|
||||
2. Upload `showone_unet_3d_condition.py` to the [unet](https://huggingface.co/sayakpaul/show-1-base-with-code/blob/main/unet/showone_unet_3d_condition.py) subfolder.
|
||||
3. Upload `pipeline_t2v_base_pixel.py` to the pipeline [repository](https://huggingface.co/sayakpaul/show-1-base-with-code/tree/main).
|
||||
|
||||
To run inference, simply add the `trust_remote_code` argument while initializing the pipeline to handle all the "magic" behind the scenes.
|
||||
To run inference, add the `trust_remote_code` argument while initializing the pipeline to handle all the "magic" behind the scenes.
|
||||
|
||||
> [!WARNING]
|
||||
> As an additional precaution with `trust_remote_code=True`, we strongly encourage you to pass a commit hash to the `revision` parameter in [`~DiffusionPipeline.from_pretrained`] to make sure the code hasn't been updated with some malicious new lines of code (unless you fully trust the model owners).
|
||||
|
||||
```python
|
||||
from diffusers import DiffusionPipeline
|
||||
@@ -221,10 +372,9 @@ video_frames = pipeline(
|
||||
).frames
|
||||
```
|
||||
|
||||
As an additional reference example, you can refer to the repository structure of [stabilityai/japanese-stable-diffusion-xl](https://huggingface.co/stabilityai/japanese-stable-diffusion-xl/), that makes use of the `trust_remote_code` feature:
|
||||
As an additional reference, take a look at the repository structure of [stabilityai/japanese-stable-diffusion-xl](https://huggingface.co/stabilityai/japanese-stable-diffusion-xl/) which also uses the `trust_remote_code` feature.
|
||||
|
||||
```python
|
||||
|
||||
from diffusers import DiffusionPipeline
|
||||
import torch
|
||||
|
||||
@@ -232,12 +382,4 @@ pipeline = DiffusionPipeline.from_pretrained(
|
||||
"stabilityai/japanese-stable-diffusion-xl", trust_remote_code=True
|
||||
)
|
||||
pipeline.to("cuda")
|
||||
|
||||
# if using torch < 2.0
|
||||
# pipeline.enable_xformers_memory_efficient_attention()
|
||||
|
||||
prompt = "柴犬、カラフルアート"
|
||||
|
||||
image = pipeline(prompt=prompt).images[0]
|
||||
|
||||
```
|
||||
```
|
||||
|
||||
@@ -1,133 +0,0 @@
|
||||
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Distilled Stable Diffusion inference
|
||||
|
||||
[[open-in-colab]]
|
||||
|
||||
Stable Diffusion inference can be a computationally intensive process because it must iteratively denoise the latents to generate an image. To reduce the computational burden, you can use a *distilled* version of the Stable Diffusion model from [Nota AI](https://huggingface.co/nota-ai). The distilled version of their Stable Diffusion model eliminates some of the residual and attention blocks from the UNet, reducing the model size by 51% and improving latency on CPU/GPU by 43%.
|
||||
|
||||
<Tip>
|
||||
|
||||
Read this [blog post](https://huggingface.co/blog/sd_distillation) to learn more about how knowledge distillation training works to produce a faster, smaller, and cheaper generative model.
|
||||
|
||||
</Tip>
|
||||
|
||||
Let's load the distilled Stable Diffusion model and compare it against the original Stable Diffusion model:
|
||||
|
||||
```py
|
||||
from diffusers import StableDiffusionPipeline
|
||||
import torch
|
||||
|
||||
distilled = StableDiffusionPipeline.from_pretrained(
|
||||
"nota-ai/bk-sdm-small", torch_dtype=torch.float16, use_safetensors=True,
|
||||
).to("cuda")
|
||||
|
||||
original = StableDiffusionPipeline.from_pretrained(
|
||||
"CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16, use_safetensors=True,
|
||||
).to("cuda")
|
||||
```
|
||||
|
||||
Given a prompt, get the inference time for the original model:
|
||||
|
||||
```py
|
||||
import time
|
||||
|
||||
seed = 2023
|
||||
generator = torch.manual_seed(seed)
|
||||
|
||||
NUM_ITERS_TO_RUN = 3
|
||||
NUM_INFERENCE_STEPS = 25
|
||||
NUM_IMAGES_PER_PROMPT = 4
|
||||
|
||||
prompt = "a golden vase with different flowers"
|
||||
|
||||
start = time.time_ns()
|
||||
for _ in range(NUM_ITERS_TO_RUN):
|
||||
images = original(
|
||||
prompt,
|
||||
num_inference_steps=NUM_INFERENCE_STEPS,
|
||||
generator=generator,
|
||||
num_images_per_prompt=NUM_IMAGES_PER_PROMPT
|
||||
).images
|
||||
end = time.time_ns()
|
||||
original_sd = f"{(end - start) / 1e6:.1f}"
|
||||
|
||||
print(f"Execution time -- {original_sd} ms\n")
|
||||
"Execution time -- 45781.5 ms"
|
||||
```
|
||||
|
||||
Time the distilled model inference:
|
||||
|
||||
```py
|
||||
start = time.time_ns()
|
||||
for _ in range(NUM_ITERS_TO_RUN):
|
||||
images = distilled(
|
||||
prompt,
|
||||
num_inference_steps=NUM_INFERENCE_STEPS,
|
||||
generator=generator,
|
||||
num_images_per_prompt=NUM_IMAGES_PER_PROMPT
|
||||
).images
|
||||
end = time.time_ns()
|
||||
|
||||
distilled_sd = f"{(end - start) / 1e6:.1f}"
|
||||
print(f"Execution time -- {distilled_sd} ms\n")
|
||||
"Execution time -- 29884.2 ms"
|
||||
```
|
||||
|
||||
<div class="flex gap-4">
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/original_sd.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">original Stable Diffusion (45781.5 ms)</figcaption>
|
||||
</div>
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/distilled_sd.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">distilled Stable Diffusion (29884.2 ms)</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
## Tiny AutoEncoder
|
||||
|
||||
To speed inference up even more, use a tiny distilled version of the [Stable Diffusion VAE](https://huggingface.co/sayakpaul/taesdxl-diffusers) to denoise the latents into images. Replace the VAE in the distilled Stable Diffusion model with the tiny VAE:
|
||||
|
||||
```py
|
||||
from diffusers import AutoencoderTiny
|
||||
|
||||
distilled.vae = AutoencoderTiny.from_pretrained(
|
||||
"sayakpaul/taesd-diffusers", torch_dtype=torch.float16, use_safetensors=True,
|
||||
).to("cuda")
|
||||
```
|
||||
|
||||
Time the distilled model and distilled VAE inference:
|
||||
|
||||
```py
|
||||
start = time.time_ns()
|
||||
for _ in range(NUM_ITERS_TO_RUN):
|
||||
images = distilled(
|
||||
prompt,
|
||||
num_inference_steps=NUM_INFERENCE_STEPS,
|
||||
generator=generator,
|
||||
num_images_per_prompt=NUM_IMAGES_PER_PROMPT
|
||||
).images
|
||||
end = time.time_ns()
|
||||
|
||||
distilled_tiny_sd = f"{(end - start) / 1e6:.1f}"
|
||||
print(f"Execution time -- {distilled_tiny_sd} ms\n")
|
||||
"Execution time -- 27165.7 ms"
|
||||
```
|
||||
|
||||
<div class="flex justify-center">
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/distilled_sd_vae.png" />
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">distilled Stable Diffusion + Tiny AutoEncoder (27165.7 ms)</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
@@ -1,135 +0,0 @@
|
||||
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Improve generation quality with FreeU
|
||||
|
||||
[[open-in-colab]]
|
||||
|
||||
The UNet is responsible for denoising during the reverse diffusion process, and there are two distinct features in its architecture:
|
||||
|
||||
1. Backbone features primarily contribute to the denoising process
|
||||
2. Skip features mainly introduce high-frequency features into the decoder module and can make the network overlook the semantics in the backbone features
|
||||
|
||||
However, the skip connection can sometimes introduce unnatural image details. [FreeU](https://hf.co/papers/2309.11497) is a technique for improving image quality by rebalancing the contributions from the UNet’s skip connections and backbone feature maps.
|
||||
|
||||
FreeU is applied during inference and it does not require any additional training. The technique works for different tasks such as text-to-image, image-to-image, and text-to-video.
|
||||
|
||||
In this guide, you will apply FreeU to the [`StableDiffusionPipeline`], [`StableDiffusionXLPipeline`], and [`TextToVideoSDPipeline`]. You need to install Diffusers from source to run the examples below.
|
||||
|
||||
## StableDiffusionPipeline
|
||||
|
||||
Load the pipeline:
|
||||
|
||||
```py
|
||||
from diffusers import DiffusionPipeline
|
||||
import torch
|
||||
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
"runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, safety_checker=None
|
||||
).to("cuda")
|
||||
```
|
||||
|
||||
Then enable the FreeU mechanism with the FreeU-specific hyperparameters. These values are scaling factors for the backbone and skip features.
|
||||
|
||||
```py
|
||||
pipeline.enable_freeu(s1=0.9, s2=0.2, b1=1.2, b2=1.4)
|
||||
```
|
||||
|
||||
The values above are from the official FreeU [code repository](https://github.com/ChenyangSi/FreeU) where you can also find [reference hyperparameters](https://github.com/ChenyangSi/FreeU#range-for-more-parameters) for different models.
|
||||
|
||||
<Tip>
|
||||
|
||||
Disable the FreeU mechanism by calling `disable_freeu()` on a pipeline.
|
||||
|
||||
</Tip>
|
||||
|
||||
And then run inference:
|
||||
|
||||
```py
|
||||
prompt = "A squirrel eating a burger"
|
||||
seed = 2023
|
||||
image = pipeline(prompt, generator=torch.manual_seed(seed)).images[0]
|
||||
image
|
||||
```
|
||||
|
||||
The figure below compares non-FreeU and FreeU results respectively for the same hyperparameters used above (`prompt` and `seed`):
|
||||
|
||||

|
||||
|
||||
|
||||
Let's see how Stable Diffusion 2 results are impacted:
|
||||
|
||||
```py
|
||||
from diffusers import DiffusionPipeline
|
||||
import torch
|
||||
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
"stabilityai/stable-diffusion-2-1", torch_dtype=torch.float16, safety_checker=None
|
||||
).to("cuda")
|
||||
|
||||
prompt = "A squirrel eating a burger"
|
||||
seed = 2023
|
||||
|
||||
pipeline.enable_freeu(s1=0.9, s2=0.2, b1=1.1, b2=1.2)
|
||||
image = pipeline(prompt, generator=torch.manual_seed(seed)).images[0]
|
||||
image
|
||||
```
|
||||
|
||||

|
||||
|
||||
## Stable Diffusion XL
|
||||
|
||||
Finally, let's take a look at how FreeU affects Stable Diffusion XL results:
|
||||
|
||||
```py
|
||||
from diffusers import DiffusionPipeline
|
||||
import torch
|
||||
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16,
|
||||
).to("cuda")
|
||||
|
||||
prompt = "A squirrel eating a burger"
|
||||
seed = 2023
|
||||
|
||||
# Comes from
|
||||
# https://wandb.ai/nasirk24/UNET-FreeU-SDXL/reports/FreeU-SDXL-Optimal-Parameters--Vmlldzo1NDg4NTUw
|
||||
pipeline.enable_freeu(s1=0.6, s2=0.4, b1=1.1, b2=1.2)
|
||||
image = pipeline(prompt, generator=torch.manual_seed(seed)).images[0]
|
||||
image
|
||||
```
|
||||
|
||||

|
||||
|
||||
## Text-to-video generation
|
||||
|
||||
FreeU can also be used to improve video quality:
|
||||
|
||||
```python
|
||||
from diffusers import DiffusionPipeline
|
||||
from diffusers.utils import export_to_video
|
||||
import torch
|
||||
|
||||
model_id = "cerspense/zeroscope_v2_576w"
|
||||
pipe = DiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16).to("cuda")
|
||||
|
||||
prompt = "an astronaut riding a horse on mars"
|
||||
seed = 2023
|
||||
|
||||
# The values come from
|
||||
# https://github.com/lyn-rgb/FreeU_Diffusers#video-pipelines
|
||||
pipe.enable_freeu(b1=1.2, b2=1.4, s1=0.9, s2=0.2)
|
||||
video_frames = pipe(prompt, height=320, width=576, num_frames=30, generator=torch.manual_seed(seed)).frames
|
||||
export_to_video(video_frames, "astronaut_rides_horse.mp4")
|
||||
```
|
||||
|
||||
Thanks to [kadirnar](https://github.com/kadirnar/) for helping to integrate the feature, and to [justindujardin](https://github.com/justindujardin) for the helpful discussions.
|
||||
190
docs/source/en/using-diffusers/image_quality.md
Normal file
190
docs/source/en/using-diffusers/image_quality.md
Normal file
@@ -0,0 +1,190 @@
|
||||
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Controlling image quality
|
||||
|
||||
The components of a diffusion model, like the UNet and scheduler, can be optimized to improve the quality of generated images leading to better image lighting and details. These techniques are especially useful if you don't have the resources to simply use a larger model for inference. You can enable these techniques during inference without any additional training.
|
||||
|
||||
This guide will show you how to turn these techniques on in your pipeline and how to configure them to improve the quality of your generated images.
|
||||
|
||||
## Lighting
|
||||
|
||||
The Stable Diffusion models aren't very good at generating images that are very bright or dark because the scheduler doesn't start sampling from the last timestep and it doesn't enforce a zero signal-to-noise ratio (SNR). The [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://hf.co/papers/2305.08891) paper fixes these issues which are now available in some Diffusers schedulers.
|
||||
|
||||
> [!TIP]
|
||||
> For inference, you need a model that has been trained with *v_prediction*. To train your own model with *v_prediction*, add the following flag to the [train_text_to_image.py](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image.py) or [train_text_to_image_lora.py](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image_lora.py) scripts.
|
||||
>
|
||||
> ```bash
|
||||
> --prediction_type="v_prediction"
|
||||
> ```
|
||||
|
||||
For example, load the [ptx0/pseudo-journey-v2](https://hf.co/ptx0/pseudo-journey-v2) checkpoint which was trained with `v_prediction` and the [`DDIMScheduler`]. Now you should configure the following parameters in the [`DDIMScheduler`].
|
||||
|
||||
* `rescale_betas_zero_snr=True` to rescale the noise schedule to zero SNR
|
||||
* `timestep_spacing="trailing"` to start sampling from the last timestep
|
||||
|
||||
Set `guidance_rescale` in the pipeline to prevent over-exposure. A lower value increases brightness but some of the details may appear washed out.
|
||||
|
||||
```py
|
||||
from diffusers import DiffusionPipeline, DDIMScheduler
|
||||
|
||||
pipeline = DiffusionPipeline.from_pretrained("ptx0/pseudo-journey-v2", use_safetensors=True)
|
||||
|
||||
pipeline.scheduler = DDIMScheduler.from_config(
|
||||
pipeline.scheduler.config, rescale_betas_zero_snr=True, timestep_spacing="trailing"
|
||||
)
|
||||
pipeline.to("cuda")
|
||||
prompt = "cinematic photo of a snowy mountain at night with the northern lights aurora borealis overhead, 35mm photograph, film, professional, 4k, highly detailed"
|
||||
generator = torch.Generator(device="cpu").manual_seed(23)
|
||||
image = pipeline(prompt, guidance_rescale=0.7, generator=generator).images[0]
|
||||
image
|
||||
```
|
||||
|
||||
<div class="flex gap-4">
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/no-zero-snr.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">default Stable Diffusion v2-1 image</figcaption>
|
||||
</div>
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/zero-snr.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">image with zero SNR and trailing timestep spacing enabled</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
## Details
|
||||
|
||||
[FreeU](https://hf.co/papers/2309.11497) improves image details by rebalancing the UNet's backbone and skip connection weights. The skip connections can cause the model to overlook some of the backbone semantics which may lead to unnatural image details in the generated image. This technique does not require any additional training and can be applied on the fly during inference for tasks like image-to-image and text-to-video.
|
||||
|
||||
Use the [`~pipelines.StableDiffusionMixin.enable_freeu`] method on your pipeline and configure the scaling factors for the backbone (`b1` and `b2`) and skip connections (`s1` and `s2`). The number after each scaling factor corresponds to the stage in the UNet where the factor is applied. Take a look at the [FreeU](https://github.com/ChenyangSi/FreeU#parameters) repository for reference hyperparameters for different models.
|
||||
|
||||
<hfoptions id="freeu">
|
||||
<hfoption id="Stable Diffusion v1-5">
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import DiffusionPipeline
|
||||
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
"runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, safety_checker=None
|
||||
).to("cuda")
|
||||
pipeline.enable_freeu(s1=0.9, s2=0.2, b1=1.5, b2=1.6)
|
||||
generator = torch.Generator(device="cpu").manual_seed(33)
|
||||
prompt = ""
|
||||
image = pipeline(prompt, generator=generator).images[0]
|
||||
image
|
||||
```
|
||||
|
||||
<div class="flex gap-4">
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sdv15-no-freeu.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">FreeU disabled</figcaption>
|
||||
</div>
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sdv15-freeu.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">FreeU enabled</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="Stable Diffusion v2-1">
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import DiffusionPipeline
|
||||
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
"stabilityai/stable-diffusion-2-1", torch_dtype=torch.float16, safety_checker=None
|
||||
).to("cuda")
|
||||
pipeline.enable_freeu(s1=0.9, s2=0.2, b1=1.4, b2=1.6)
|
||||
generator = torch.Generator(device="cpu").manual_seed(80)
|
||||
prompt = "A squirrel eating a burger"
|
||||
image = pipeline(prompt, generator=generator).images[0]
|
||||
image
|
||||
```
|
||||
|
||||
<div class="flex gap-4">
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sdv21-no-freeu.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">FreeU disabled</figcaption>
|
||||
</div>
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sdv21-freeu.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">FreeU enabled</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="Stable Diffusion XL">
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import DiffusionPipeline
|
||||
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16,
|
||||
).to("cuda")
|
||||
pipeline.enable_freeu(s1=0.9, s2=0.2, b1=1.3, b2=1.4)
|
||||
generator = torch.Generator(device="cpu").manual_seed(13)
|
||||
prompt = "A squirrel eating a burger"
|
||||
image = pipeline(prompt, generator=generator).images[0]
|
||||
image
|
||||
```
|
||||
|
||||
<div class="flex gap-4">
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sdxl-no-freeu.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">FreeU disabled</figcaption>
|
||||
</div>
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sdxl-freeu.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">FreeU enabled</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="Zeroscope">
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import DiffusionPipeline
|
||||
from diffusers.utils import export_to_video
|
||||
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
"damo-vilab/text-to-video-ms-1.7b", torch_dtype=torch.float16
|
||||
).to("cuda")
|
||||
# values come from https://github.com/lyn-rgb/FreeU_Diffusers#video-pipelines
|
||||
pipeline.enable_freeu(b1=1.2, b2=1.4, s1=0.9, s2=0.2)
|
||||
prompt = "Confident teddy bear surfer rides the wave in the tropics"
|
||||
generator = torch.Generator(device="cpu").manual_seed(47)
|
||||
video_frames = pipeline(prompt, generator=generator).frames[0]
|
||||
export_to_video(video_frames, "teddy_bear.mp4", fps=10)
|
||||
```
|
||||
|
||||
<div class="flex gap-4">
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/video-no-freeu.gif"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">FreeU disabled</figcaption>
|
||||
</div>
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/video-freeu.gif"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">FreeU enabled</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
Call the [`pipelines.StableDiffusionMixin.disable_freeu`] method to disable FreeU.
|
||||
|
||||
```py
|
||||
pipeline.disable_freeu()
|
||||
```
|
||||
@@ -10,29 +10,30 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
[[open-in-colab]]
|
||||
|
||||
# Latent Consistency Model
|
||||
|
||||
Latent Consistency Models (LCM) enable quality image generation in typically 2-4 steps making it possible to use diffusion models in almost real-time settings.
|
||||
[[open-in-colab]]
|
||||
|
||||
From the [official website](https://latent-consistency-models.github.io/):
|
||||
[Latent Consistency Models (LCMs)](https://hf.co/papers/2310.04378) enable fast high-quality image generation by directly predicting the reverse diffusion process in the latent rather than pixel space. In other words, LCMs try to predict the noiseless image from the noisy image in contrast to typical diffusion models that iteratively remove noise from the noisy image. By avoiding the iterative sampling process, LCMs are able to generate high-quality images in 2-4 steps instead of 20-30 steps.
|
||||
|
||||
> LCMs can be distilled from any pre-trained Stable Diffusion (SD) in only 4,000 training steps (~32 A100 GPU Hours) for generating high quality 768 x 768 resolution images in 2~4 steps or even one step, significantly accelerating text-to-image generation. We employ LCM to distill the Dreamshaper-V7 version of SD in just 4,000 training iterations.
|
||||
LCMs are distilled from pretrained models which requires ~32 hours of A100 compute. To speed this up, [LCM-LoRAs](https://hf.co/papers/2311.05556) train a [LoRA adapter](https://huggingface.co/docs/peft/conceptual_guides/adapter#low-rank-adaptation-lora) which have much fewer parameters to train compared to the full model. The LCM-LoRA can be plugged into a diffusion model once it has been trained.
|
||||
|
||||
For a more technical overview of LCMs, refer to [the paper](https://huggingface.co/papers/2310.04378).
|
||||
This guide will show you how to use LCMs and LCM-LoRAs for fast inference on tasks and how to use them with other adapters like ControlNet or T2I-Adapter.
|
||||
|
||||
LCM distilled models are available for [stable-diffusion-v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5), [stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0), and the [SSD-1B](https://huggingface.co/segmind/SSD-1B) model. All the checkpoints can be found in this [collection](https://huggingface.co/collections/latent-consistency/latent-consistency-models-weights-654ce61a95edd6dffccef6a8).
|
||||
|
||||
This guide shows how to perform inference with LCMs for
|
||||
- text-to-image
|
||||
- image-to-image
|
||||
- combined with style LoRAs
|
||||
- ControlNet/T2I-Adapter
|
||||
> [!TIP]
|
||||
> LCMs and LCM-LoRAs are available for Stable Diffusion v1.5, Stable Diffusion XL, and the SSD-1B model. You can find their checkpoints on the [Latent Consistency](https://hf.co/collections/latent-consistency/latent-consistency-models-weights-654ce61a95edd6dffccef6a8) Collections.
|
||||
|
||||
## Text-to-image
|
||||
|
||||
You'll use the [`StableDiffusionXLPipeline`] pipeline with the [`LCMScheduler`] and then load the LCM-LoRA. Together with the LCM-LoRA and the scheduler, the pipeline enables a fast inference workflow, overcoming the slow iterative nature of diffusion models.
|
||||
<hfoptions id="lcm-text2img">
|
||||
<hfoption id="LCM">
|
||||
|
||||
To use LCMs, you need to load the LCM checkpoint for your supported model into [`UNet2DConditionModel`] and replace the scheduler with the [`LCMScheduler`]. Then you can use the pipeline as usual, and pass a text prompt to generate an image in just 4 steps.
|
||||
|
||||
A couple of notes to keep in mind when using LCMs are:
|
||||
|
||||
* Typically, batch size is doubled inside the pipeline for classifier-free guidance. But LCM applies guidance with guidance embeddings and doesn't need to double the batch size, which leads to faster inference. The downside is that negative prompts don't work with LCM because they don't have any effect on the denoising process.
|
||||
* The ideal range for `guidance_scale` is [3., 13.] because that is what the UNet was trained with. However, disabling `guidance_scale` with a value of 1.0 is also effective in most cases.
|
||||
|
||||
```python
|
||||
from diffusers import StableDiffusionXLPipeline, UNet2DConditionModel, LCMScheduler
|
||||
@@ -49,31 +50,69 @@ pipe = StableDiffusionXLPipeline.from_pretrained(
|
||||
pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)
|
||||
|
||||
prompt = "Self-portrait oil painting, a beautiful cyborg with golden hair, 8k"
|
||||
|
||||
generator = torch.manual_seed(0)
|
||||
image = pipe(
|
||||
prompt=prompt, num_inference_steps=4, generator=generator, guidance_scale=8.0
|
||||
).images[0]
|
||||
image
|
||||
```
|
||||
|
||||

|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm/lcm_full_sdxl_t2i.png"/>
|
||||
</div>
|
||||
|
||||
Notice that we use only 4 steps for generation which is way less than what's typically used for standard SDXL.
|
||||
</hfoption>
|
||||
<hfoption id="LCM-LoRA">
|
||||
|
||||
Some details to keep in mind:
|
||||
To use LCM-LoRAs, you need to replace the scheduler with the [`LCMScheduler`] and load the LCM-LoRA weights with the [`~loaders.LoraLoaderMixin.load_lora_weights`] method. Then you can use the pipeline as usual, and pass a text prompt to generate an image in just 4 steps.
|
||||
|
||||
* To perform classifier-free guidance, batch size is usually doubled inside the pipeline. LCM, however, applies guidance using guidance embeddings, so the batch size does not have to be doubled in this case. This leads to a faster inference time, with the drawback that negative prompts don't have any effect on the denoising process.
|
||||
* The UNet was trained using the [3., 13.] guidance scale range. So, that is the ideal range for `guidance_scale`. However, disabling `guidance_scale` using a value of 1.0 is also effective in most cases.
|
||||
A couple of notes to keep in mind when using LCM-LoRAs are:
|
||||
|
||||
* Typically, batch size is doubled inside the pipeline for classifier-free guidance. But LCM applies guidance with guidance embeddings and doesn't need to double the batch size, which leads to faster inference. The downside is that negative prompts don't work with LCM because they don't have any effect on the denoising process.
|
||||
* You could use guidance with LCM-LoRAs, but it is very sensitive to high `guidance_scale` values and can lead to artifacts in the generated image. The best values we've found are between [1.0, 2.0].
|
||||
* Replace [stabilityai/stable-diffusion-xl-base-1.0](https://hf.co/stabilityai/stable-diffusion-xl-base-1.0) with any finetuned model. For example, try using the [animagine-xl](https://huggingface.co/Linaqruf/animagine-xl) checkpoint to generate anime images with SDXL.
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import DiffusionPipeline, LCMScheduler
|
||||
|
||||
pipe = DiffusionPipeline.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0",
|
||||
variant="fp16",
|
||||
torch_dtype=torch.float16
|
||||
).to("cuda")
|
||||
pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)
|
||||
pipe.load_lora_weights("latent-consistency/lcm-lora-sdxl")
|
||||
|
||||
prompt = "Self-portrait oil painting, a beautiful cyborg with golden hair, 8k"
|
||||
generator = torch.manual_seed(42)
|
||||
image = pipe(
|
||||
prompt=prompt, num_inference_steps=4, generator=generator, guidance_scale=1.0
|
||||
).images[0]
|
||||
image
|
||||
```
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm/lcm_sdxl_t2i.png"/>
|
||||
</div>
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
## Image-to-image
|
||||
|
||||
LCMs can be applied to image-to-image tasks too. For this example, we'll use the [LCM_Dreamshaper_v7](https://huggingface.co/SimianLuo/LCM_Dreamshaper_v7) model, but the same steps can be applied to other LCM models as well.
|
||||
<hfoptions id="lcm-img2img">
|
||||
<hfoption id="LCM">
|
||||
|
||||
To use LCMs for image-to-image, you need to load the LCM checkpoint for your supported model into [`UNet2DConditionModel`] and replace the scheduler with the [`LCMScheduler`]. Then you can use the pipeline as usual, and pass a text prompt and initial image to generate an image in just 4 steps.
|
||||
|
||||
> [!TIP]
|
||||
> Experiment with different values for `num_inference_steps`, `strength`, and `guidance_scale` to get the best results.
|
||||
|
||||
```python
|
||||
import torch
|
||||
from diffusers import AutoPipelineForImage2Image, UNet2DConditionModel, LCMScheduler
|
||||
from diffusers.utils import make_image_grid, load_image
|
||||
from diffusers.utils import load_image
|
||||
|
||||
unet = UNet2DConditionModel.from_pretrained(
|
||||
"SimianLuo/LCM_Dreamshaper_v7",
|
||||
@@ -89,12 +128,8 @@ pipe = AutoPipelineForImage2Image.from_pretrained(
|
||||
).to("cuda")
|
||||
pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)
|
||||
|
||||
# prepare image
|
||||
url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/img2img-init.png"
|
||||
init_image = load_image(url)
|
||||
init_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/img2img-init.png")
|
||||
prompt = "Astronauts in a jungle, cold color palette, muted colors, detailed, 8k"
|
||||
|
||||
# pass prompt and image to pipeline
|
||||
generator = torch.manual_seed(0)
|
||||
image = pipe(
|
||||
prompt,
|
||||
@@ -104,22 +139,130 @@ image = pipe(
|
||||
strength=0.5,
|
||||
generator=generator
|
||||
).images[0]
|
||||
make_image_grid([init_image, image], rows=1, cols=2)
|
||||
image
|
||||
```
|
||||
|
||||

|
||||
<div class="flex gap-4">
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/img2img-init.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">initial image</figcaption>
|
||||
</div>
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm-img2img.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">generated image</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="LCM-LoRA">
|
||||
|
||||
<Tip>
|
||||
To use LCM-LoRAs for image-to-image, you need to replace the scheduler with the [`LCMScheduler`] and load the LCM-LoRA weights with the [`~loaders.LoraLoaderMixin.load_lora_weights`] method. Then you can use the pipeline as usual, and pass a text prompt and initial image to generate an image in just 4 steps.
|
||||
|
||||
You can get different results based on your prompt and the image you provide. To get the best results, we recommend trying different values for `num_inference_steps`, `strength`, and `guidance_scale` parameters and choose the best one.
|
||||
> [!TIP]
|
||||
> Experiment with different values for `num_inference_steps`, `strength`, and `guidance_scale` to get the best results.
|
||||
|
||||
</Tip>
|
||||
```py
|
||||
import torch
|
||||
from diffusers import AutoPipelineForImage2Image, LCMScheduler
|
||||
from diffusers.utils import make_image_grid, load_image
|
||||
|
||||
pipe = AutoPipelineForImage2Image.from_pretrained(
|
||||
"Lykon/dreamshaper-7",
|
||||
torch_dtype=torch.float16,
|
||||
variant="fp16",
|
||||
).to("cuda")
|
||||
|
||||
## Combine with style LoRAs
|
||||
pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)
|
||||
|
||||
LCMs can be used with other styled LoRAs to generate styled-images in very few steps (4-8). In the following example, we'll use the [papercut LoRA](TheLastBen/Papercut_SDXL).
|
||||
pipe.load_lora_weights("latent-consistency/lcm-lora-sdv1-5")
|
||||
|
||||
init_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/img2img-init.png")
|
||||
prompt = "Astronauts in a jungle, cold color palette, muted colors, detailed, 8k"
|
||||
|
||||
generator = torch.manual_seed(0)
|
||||
image = pipe(
|
||||
prompt,
|
||||
image=init_image,
|
||||
num_inference_steps=4,
|
||||
guidance_scale=1,
|
||||
strength=0.6,
|
||||
generator=generator
|
||||
).images[0]
|
||||
image
|
||||
```
|
||||
|
||||
<div class="flex gap-4">
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/img2img-init.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">initial image</figcaption>
|
||||
</div>
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm-lora-img2img.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">generated image</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
## Inpainting
|
||||
|
||||
To use LCM-LoRAs for inpainting, you need to replace the scheduler with the [`LCMScheduler`] and load the LCM-LoRA weights with the [`~loaders.LoraLoaderMixin.load_lora_weights`] method. Then you can use the pipeline as usual, and pass a text prompt, initial image, and mask image to generate an image in just 4 steps.
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import AutoPipelineForInpainting, LCMScheduler
|
||||
from diffusers.utils import load_image, make_image_grid
|
||||
|
||||
pipe = AutoPipelineForInpainting.from_pretrained(
|
||||
"runwayml/stable-diffusion-inpainting",
|
||||
torch_dtype=torch.float16,
|
||||
variant="fp16",
|
||||
).to("cuda")
|
||||
|
||||
pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)
|
||||
|
||||
pipe.load_lora_weights("latent-consistency/lcm-lora-sdv1-5")
|
||||
|
||||
init_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png")
|
||||
mask_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint_mask.png")
|
||||
|
||||
prompt = "concept art digital painting of an elven castle, inspired by lord of the rings, highly detailed, 8k"
|
||||
generator = torch.manual_seed(0)
|
||||
image = pipe(
|
||||
prompt=prompt,
|
||||
image=init_image,
|
||||
mask_image=mask_image,
|
||||
generator=generator,
|
||||
num_inference_steps=4,
|
||||
guidance_scale=4,
|
||||
).images[0]
|
||||
image
|
||||
```
|
||||
|
||||
<div class="flex gap-4">
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">initial image</figcaption>
|
||||
</div>
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm-lora-inpaint.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">generated image</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
## Adapters
|
||||
|
||||
LCMs are compatible with adapters like LoRA, ControlNet, T2I-Adapter, and AnimateDiff. You can bring the speed of LCMs to these adapters to generate images in a certain style or condition the model on another input like a canny image.
|
||||
|
||||
### LoRA
|
||||
|
||||
[LoRA](../using-diffusers/loading_adapters#lora) adapters can be rapidly finetuned to learn a new style from just a few images and plugged into a pretrained model to generate images in that style.
|
||||
|
||||
<hfoptions id="lcm-lora">
|
||||
<hfoption id="LCM">
|
||||
|
||||
Load the LCM checkpoint for your supported model into [`UNet2DConditionModel`] and replace the scheduler with the [`LCMScheduler`]. Then you can use the [`~loaders.LoraLoaderMixin.load_lora_weights`] method to load the LoRA weights into the LCM and generate a styled image in a few steps.
|
||||
|
||||
```python
|
||||
from diffusers import StableDiffusionXLPipeline, UNet2DConditionModel, LCMScheduler
|
||||
@@ -134,11 +277,9 @@ pipe = StableDiffusionXLPipeline.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0", unet=unet, torch_dtype=torch.float16, variant="fp16",
|
||||
).to("cuda")
|
||||
pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)
|
||||
|
||||
pipe.load_lora_weights("TheLastBen/Papercut_SDXL", weight_name="papercut.safetensors", adapter_name="papercut")
|
||||
|
||||
prompt = "papercut, a cute fox"
|
||||
|
||||
generator = torch.manual_seed(0)
|
||||
image = pipe(
|
||||
prompt=prompt, num_inference_steps=4, generator=generator, guidance_scale=8.0
|
||||
@@ -146,15 +287,58 @@ image = pipe(
|
||||
image
|
||||
```
|
||||
|
||||

|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm/lcm_full_sdx_lora_mix.png"/>
|
||||
</div>
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="LCM-LoRA">
|
||||
|
||||
## ControlNet/T2I-Adapter
|
||||
Replace the scheduler with the [`LCMScheduler`]. Then you can use the [`~loaders.LoraLoaderMixin.load_lora_weights`] method to load the LCM-LoRA weights and the style LoRA you want to use. Combine both LoRA adapters with the [`~loaders.UNet2DConditionLoadersMixin.set_adapters`] method and generate a styled image in a few steps.
|
||||
|
||||
Let's look at how we can perform inference with ControlNet/T2I-Adapter and a LCM.
|
||||
```py
|
||||
import torch
|
||||
from diffusers import DiffusionPipeline, LCMScheduler
|
||||
|
||||
pipe = DiffusionPipeline.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0",
|
||||
variant="fp16",
|
||||
torch_dtype=torch.float16
|
||||
).to("cuda")
|
||||
|
||||
pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)
|
||||
|
||||
pipe.load_lora_weights("latent-consistency/lcm-lora-sdxl", adapter_name="lcm")
|
||||
pipe.load_lora_weights("TheLastBen/Papercut_SDXL", weight_name="papercut.safetensors", adapter_name="papercut")
|
||||
|
||||
pipe.set_adapters(["lcm", "papercut"], adapter_weights=[1.0, 0.8])
|
||||
|
||||
prompt = "papercut, a cute fox"
|
||||
generator = torch.manual_seed(0)
|
||||
image = pipe(prompt, num_inference_steps=4, guidance_scale=1, generator=generator).images[0]
|
||||
image
|
||||
```
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm/lcm_sdx_lora_mix.png"/>
|
||||
</div>
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
### ControlNet
|
||||
For this example, we'll use the [LCM_Dreamshaper_v7](https://huggingface.co/SimianLuo/LCM_Dreamshaper_v7) model with canny ControlNet, but the same steps can be applied to other LCM models as well.
|
||||
|
||||
[ControlNet](./controlnet) are adapters that can be trained on a variety of inputs like canny edge, pose estimation, or depth. The ControlNet can be inserted into the pipeline to provide additional conditioning and control to the model for more accurate generation.
|
||||
|
||||
You can find additional ControlNet models trained on other inputs in [lllyasviel's](https://hf.co/lllyasviel) repository.
|
||||
|
||||
<hfoptions id="lcm-controlnet">
|
||||
<hfoption id="LCM">
|
||||
|
||||
Load a ControlNet model trained on canny images and pass it to the [`ControlNetModel`]. Then you can load a LCM model into [`StableDiffusionControlNetPipeline`] and replace the scheduler with the [`LCMScheduler`]. Now pass the canny image to the pipeline and generate an image.
|
||||
|
||||
> [!TIP]
|
||||
> Experiment with different values for `num_inference_steps`, `controlnet_conditioning_scale`, `cross_attention_kwargs`, and `guidance_scale` to get the best results.
|
||||
|
||||
```python
|
||||
import torch
|
||||
@@ -186,8 +370,6 @@ pipe = StableDiffusionControlNetPipeline.from_pretrained(
|
||||
torch_dtype=torch.float16,
|
||||
safety_checker=None,
|
||||
).to("cuda")
|
||||
|
||||
# set scheduler
|
||||
pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)
|
||||
|
||||
generator = torch.manual_seed(0)
|
||||
@@ -200,16 +382,84 @@ image = pipe(
|
||||
make_image_grid([canny_image, image], rows=1, cols=2)
|
||||
```
|
||||
|
||||

|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm/lcm_full_sdv1-5_controlnet.png"/>
|
||||
</div>
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="LCM-LoRA">
|
||||
|
||||
<Tip>
|
||||
The inference parameters in this example might not work for all examples, so we recommend trying different values for the `num_inference_steps`, `guidance_scale`, `controlnet_conditioning_scale`, and `cross_attention_kwargs` parameters and choosing the best one.
|
||||
</Tip>
|
||||
Load a ControlNet model trained on canny images and pass it to the [`ControlNetModel`]. Then you can load a Stable Diffusion v1.5 model into [`StableDiffusionControlNetPipeline`] and replace the scheduler with the [`LCMScheduler`]. Use the [`~loaders.LoraLoaderMixin.load_lora_weights`] method to load the LCM-LoRA weights, and pass the canny image to the pipeline and generate an image.
|
||||
|
||||
> [!TIP]
|
||||
> Experiment with different values for `num_inference_steps`, `controlnet_conditioning_scale`, `cross_attention_kwargs`, and `guidance_scale` to get the best results.
|
||||
|
||||
```py
|
||||
import torch
|
||||
import cv2
|
||||
import numpy as np
|
||||
from PIL import Image
|
||||
|
||||
from diffusers import StableDiffusionControlNetPipeline, ControlNetModel, LCMScheduler
|
||||
from diffusers.utils import load_image
|
||||
|
||||
image = load_image(
|
||||
"https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png"
|
||||
).resize((512, 512))
|
||||
|
||||
image = np.array(image)
|
||||
|
||||
low_threshold = 100
|
||||
high_threshold = 200
|
||||
|
||||
image = cv2.Canny(image, low_threshold, high_threshold)
|
||||
image = image[:, :, None]
|
||||
image = np.concatenate([image, image, image], axis=2)
|
||||
canny_image = Image.fromarray(image)
|
||||
|
||||
controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16)
|
||||
pipe = StableDiffusionControlNetPipeline.from_pretrained(
|
||||
"runwayml/stable-diffusion-v1-5",
|
||||
controlnet=controlnet,
|
||||
torch_dtype=torch.float16,
|
||||
safety_checker=None,
|
||||
variant="fp16"
|
||||
).to("cuda")
|
||||
|
||||
pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)
|
||||
|
||||
pipe.load_lora_weights("latent-consistency/lcm-lora-sdv1-5")
|
||||
|
||||
generator = torch.manual_seed(0)
|
||||
image = pipe(
|
||||
"the mona lisa",
|
||||
image=canny_image,
|
||||
num_inference_steps=4,
|
||||
guidance_scale=1.5,
|
||||
controlnet_conditioning_scale=0.8,
|
||||
cross_attention_kwargs={"scale": 1},
|
||||
generator=generator,
|
||||
).images[0]
|
||||
image
|
||||
```
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm/lcm_sdv1-5_controlnet.png"/>
|
||||
</div>
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
### T2I-Adapter
|
||||
|
||||
This example shows how to use the `lcm-sdxl` with the [Canny T2I-Adapter](TencentARC/t2i-adapter-canny-sdxl-1.0).
|
||||
[T2I-Adapter](./t2i_adapter) is an even more lightweight adapter than ControlNet, that provides an additional input to condition a pretrained model with. It is faster than ControlNet but the results may be slightly worse.
|
||||
|
||||
You can find additional T2I-Adapter checkpoints trained on other inputs in [TencentArc's](https://hf.co/TencentARC) repository.
|
||||
|
||||
<hfoptions id="lcm-t2i">
|
||||
<hfoption id="LCM">
|
||||
|
||||
Load a T2IAdapter trained on canny images and pass it to the [`StableDiffusionXLAdapterPipeline`]. Then load a LCM checkpoint into [`UNet2DConditionModel`] and replace the scheduler with the [`LCMScheduler`]. Now pass the canny image to the pipeline and generate an image.
|
||||
|
||||
```python
|
||||
import torch
|
||||
@@ -220,10 +470,9 @@ from PIL import Image
|
||||
from diffusers import StableDiffusionXLAdapterPipeline, UNet2DConditionModel, T2IAdapter, LCMScheduler
|
||||
from diffusers.utils import load_image, make_image_grid
|
||||
|
||||
# Prepare image
|
||||
# Detect the canny map in low resolution to avoid high-frequency details
|
||||
# detect the canny map in low resolution to avoid high-frequency details
|
||||
image = load_image(
|
||||
"https://huggingface.co/Adapter/t2iadapter/resolve/main/figs_SDXLV1.0/org_canny.jpg"
|
||||
"https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png"
|
||||
).resize((384, 384))
|
||||
|
||||
image = np.array(image)
|
||||
@@ -236,7 +485,6 @@ image = image[:, :, None]
|
||||
image = np.concatenate([image, image, image], axis=2)
|
||||
canny_image = Image.fromarray(image).resize((1024, 1216))
|
||||
|
||||
# load adapter
|
||||
adapter = T2IAdapter.from_pretrained("TencentARC/t2i-adapter-canny-sdxl-1.0", torch_dtype=torch.float16, varient="fp16").to("cuda")
|
||||
|
||||
unet = UNet2DConditionModel.from_pretrained(
|
||||
@@ -254,7 +502,7 @@ pipe = StableDiffusionXLAdapterPipeline.from_pretrained(
|
||||
|
||||
pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)
|
||||
|
||||
prompt = "Mystical fairy in real, magic, 4k picture, high quality"
|
||||
prompt = "the mona lisa, 4k picture, high quality"
|
||||
negative_prompt = "extra digit, fewer digits, cropped, worst quality, low quality, glitch, deformed, mutated, ugly, disfigured"
|
||||
|
||||
generator = torch.manual_seed(0)
|
||||
@@ -268,7 +516,116 @@ image = pipe(
|
||||
adapter_conditioning_factor=1,
|
||||
generator=generator,
|
||||
).images[0]
|
||||
grid = make_image_grid([canny_image, image], rows=1, cols=2)
|
||||
```
|
||||
|
||||

|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm-t2i.png"/>
|
||||
</div>
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="LCM-LoRA">
|
||||
|
||||
Load a T2IAdapter trained on canny images and pass it to the [`StableDiffusionXLAdapterPipeline`]. Replace the scheduler with the [`LCMScheduler`], and use the [`~loaders.LoraLoaderMixin.load_lora_weights`] method to load the LCM-LoRA weights. Pass the canny image to the pipeline and generate an image.
|
||||
|
||||
```py
|
||||
import torch
|
||||
import cv2
|
||||
import numpy as np
|
||||
from PIL import Image
|
||||
|
||||
from diffusers import StableDiffusionXLAdapterPipeline, UNet2DConditionModel, T2IAdapter, LCMScheduler
|
||||
from diffusers.utils import load_image, make_image_grid
|
||||
|
||||
# detect the canny map in low resolution to avoid high-frequency details
|
||||
image = load_image(
|
||||
"https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png"
|
||||
).resize((384, 384))
|
||||
|
||||
image = np.array(image)
|
||||
|
||||
low_threshold = 100
|
||||
high_threshold = 200
|
||||
|
||||
image = cv2.Canny(image, low_threshold, high_threshold)
|
||||
image = image[:, :, None]
|
||||
image = np.concatenate([image, image, image], axis=2)
|
||||
canny_image = Image.fromarray(image).resize((1024, 1024))
|
||||
|
||||
adapter = T2IAdapter.from_pretrained("TencentARC/t2i-adapter-canny-sdxl-1.0", torch_dtype=torch.float16, varient="fp16").to("cuda")
|
||||
|
||||
pipe = StableDiffusionXLAdapterPipeline.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0",
|
||||
adapter=adapter,
|
||||
torch_dtype=torch.float16,
|
||||
variant="fp16",
|
||||
).to("cuda")
|
||||
|
||||
pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)
|
||||
|
||||
pipe.load_lora_weights("latent-consistency/lcm-lora-sdxl")
|
||||
|
||||
prompt = "the mona lisa, 4k picture, high quality"
|
||||
negative_prompt = "extra digit, fewer digits, cropped, worst quality, low quality, glitch, deformed, mutated, ugly, disfigured"
|
||||
|
||||
generator = torch.manual_seed(0)
|
||||
image = pipe(
|
||||
prompt=prompt,
|
||||
negative_prompt=negative_prompt,
|
||||
image=canny_image,
|
||||
num_inference_steps=4,
|
||||
guidance_scale=1.5,
|
||||
adapter_conditioning_scale=0.8,
|
||||
adapter_conditioning_factor=1,
|
||||
generator=generator,
|
||||
).images[0]
|
||||
```
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm-lora-t2i.png"/>
|
||||
</div>
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
### AnimateDiff
|
||||
|
||||
[AnimateDiff](../api/pipelines/animatediff) is an adapter that adds motion to an image. It can be used with most Stable Diffusion models, effectively turning them into "video generation" models. Generating good results with a video model usually requires generating multiple frames (16-24), which can be very slow with a regular Stable Diffusion model. LCM-LoRA can speed up this process by only taking 4-8 steps for each frame.
|
||||
|
||||
Load a [`AnimateDiffPipeline`] and pass a [`MotionAdapter`] to it. Then replace the scheduler with the [`LCMScheduler`], and combine both LoRA adapters with the [`~loaders.UNet2DConditionLoadersMixin.set_adapters`] method. Now you can pass a prompt to the pipeline and generate an animated image.
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import MotionAdapter, AnimateDiffPipeline, DDIMScheduler, LCMScheduler
|
||||
from diffusers.utils import export_to_gif
|
||||
|
||||
adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5")
|
||||
pipe = AnimateDiffPipeline.from_pretrained(
|
||||
"frankjoshua/toonyou_beta6",
|
||||
motion_adapter=adapter,
|
||||
).to("cuda")
|
||||
|
||||
# set scheduler
|
||||
pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)
|
||||
|
||||
# load LCM-LoRA
|
||||
pipe.load_lora_weights("latent-consistency/lcm-lora-sdv1-5", adapter_name="lcm")
|
||||
pipe.load_lora_weights("guoyww/animatediff-motion-lora-zoom-in", weight_name="diffusion_pytorch_model.safetensors", adapter_name="motion-lora")
|
||||
|
||||
pipe.set_adapters(["lcm", "motion-lora"], adapter_weights=[0.55, 1.2])
|
||||
|
||||
prompt = "best quality, masterpiece, 1girl, looking at viewer, blurry background, upper body, contemporary, dress"
|
||||
generator = torch.manual_seed(0)
|
||||
frames = pipe(
|
||||
prompt=prompt,
|
||||
num_inference_steps=5,
|
||||
guidance_scale=1.25,
|
||||
cross_attention_kwargs={"scale": 1},
|
||||
num_frames=24,
|
||||
generator=generator
|
||||
).frames[0]
|
||||
export_to_gif(frames, "animation.gif")
|
||||
```
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm-lora-animatediff.gif"/>
|
||||
</div>
|
||||
|
||||
@@ -1,422 +0,0 @@
|
||||
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
[[open-in-colab]]
|
||||
|
||||
# Performing inference with LCM-LoRA
|
||||
|
||||
Latent Consistency Models (LCM) enable quality image generation in typically 2-4 steps making it possible to use diffusion models in almost real-time settings.
|
||||
|
||||
From the [official website](https://latent-consistency-models.github.io/):
|
||||
|
||||
> LCMs can be distilled from any pre-trained Stable Diffusion (SD) in only 4,000 training steps (~32 A100 GPU Hours) for generating high quality 768 x 768 resolution images in 2~4 steps or even one step, significantly accelerating text-to-image generation. We employ LCM to distill the Dreamshaper-V7 version of SD in just 4,000 training iterations.
|
||||
|
||||
For a more technical overview of LCMs, refer to [the paper](https://huggingface.co/papers/2310.04378).
|
||||
|
||||
However, each model needs to be distilled separately for latent consistency distillation. The core idea with LCM-LoRA is to train just a few adapter layers, the adapter being LoRA in this case.
|
||||
This way, we don't have to train the full model and keep the number of trainable parameters manageable. The resulting LoRAs can then be applied to any fine-tuned version of the model without distilling them separately.
|
||||
Additionally, the LoRAs can be applied to image-to-image, ControlNet/T2I-Adapter, inpainting, AnimateDiff etc.
|
||||
The LCM-LoRA can also be combined with other LoRAs to generate styled images in very few steps (4-8).
|
||||
|
||||
LCM-LoRAs are available for [stable-diffusion-v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5), [stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0), and the [SSD-1B](https://huggingface.co/segmind/SSD-1B) model. All the checkpoints can be found in this [collection](https://huggingface.co/collections/latent-consistency/latent-consistency-models-loras-654cdd24e111e16f0865fba6).
|
||||
|
||||
For more details about LCM-LoRA, refer to [the technical report](https://huggingface.co/papers/2311.05556).
|
||||
|
||||
This guide shows how to perform inference with LCM-LoRAs for
|
||||
- text-to-image
|
||||
- image-to-image
|
||||
- combined with styled LoRAs
|
||||
- ControlNet/T2I-Adapter
|
||||
- inpainting
|
||||
- AnimateDiff
|
||||
|
||||
Before going through this guide, we'll take a look at the general workflow for performing inference with LCM-LoRAs.
|
||||
LCM-LoRAs are similar to other Stable Diffusion LoRAs so they can be used with any [`DiffusionPipeline`] that supports LoRAs.
|
||||
|
||||
- Load the task specific pipeline and model.
|
||||
- Set the scheduler to [`LCMScheduler`].
|
||||
- Load the LCM-LoRA weights for the model.
|
||||
- Reduce the `guidance_scale` between `[1.0, 2.0]` and set the `num_inference_steps` between [4, 8].
|
||||
- Perform inference with the pipeline with the usual parameters.
|
||||
|
||||
Let's look at how we can perform inference with LCM-LoRAs for different tasks.
|
||||
|
||||
First, make sure you have [peft](https://github.com/huggingface/peft) installed, for better LoRA support.
|
||||
|
||||
```bash
|
||||
pip install -U peft
|
||||
```
|
||||
|
||||
## Text-to-image
|
||||
|
||||
You'll use the [`StableDiffusionXLPipeline`] with the scheduler: [`LCMScheduler`] and then load the LCM-LoRA. Together with the LCM-LoRA and the scheduler, the pipeline enables a fast inference workflow overcoming the slow iterative nature of diffusion models.
|
||||
|
||||
```python
|
||||
import torch
|
||||
from diffusers import DiffusionPipeline, LCMScheduler
|
||||
|
||||
pipe = DiffusionPipeline.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0",
|
||||
variant="fp16",
|
||||
torch_dtype=torch.float16
|
||||
).to("cuda")
|
||||
|
||||
# set scheduler
|
||||
pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)
|
||||
|
||||
# load LCM-LoRA
|
||||
pipe.load_lora_weights("latent-consistency/lcm-lora-sdxl")
|
||||
|
||||
prompt = "Self-portrait oil painting, a beautiful cyborg with golden hair, 8k"
|
||||
|
||||
generator = torch.manual_seed(42)
|
||||
image = pipe(
|
||||
prompt=prompt, num_inference_steps=4, generator=generator, guidance_scale=1.0
|
||||
).images[0]
|
||||
```
|
||||
|
||||

|
||||
|
||||
Notice that we use only 4 steps for generation which is way less than what's typically used for standard SDXL.
|
||||
|
||||
<Tip>
|
||||
|
||||
You may have noticed that we set `guidance_scale=1.0`, which disables classifer-free-guidance. This is because the LCM-LoRA is trained with guidance, so the batch size does not have to be doubled in this case. This leads to a faster inference time, with the drawback that negative prompts don't have any effect on the denoising process.
|
||||
|
||||
You can also use guidance with LCM-LoRA, but due to the nature of training the model is very sensitve to the `guidance_scale` values, high values can lead to artifacts in the generated images. In our experiments, we found that the best values are in the range of [1.0, 2.0].
|
||||
|
||||
</Tip>
|
||||
|
||||
### Inference with a fine-tuned model
|
||||
|
||||
As mentioned above, the LCM-LoRA can be applied to any fine-tuned version of the model without having to distill them separately. Let's look at how we can perform inference with a fine-tuned model. In this example, we'll use the [animagine-xl](https://huggingface.co/Linaqruf/animagine-xl) model, which is a fine-tuned version of the SDXL model for generating anime.
|
||||
|
||||
```python
|
||||
from diffusers import DiffusionPipeline, LCMScheduler
|
||||
|
||||
pipe = DiffusionPipeline.from_pretrained(
|
||||
"Linaqruf/animagine-xl",
|
||||
variant="fp16",
|
||||
torch_dtype=torch.float16
|
||||
).to("cuda")
|
||||
|
||||
# set scheduler
|
||||
pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)
|
||||
|
||||
# load LCM-LoRA
|
||||
pipe.load_lora_weights("latent-consistency/lcm-lora-sdxl")
|
||||
|
||||
prompt = "face focus, cute, masterpiece, best quality, 1girl, green hair, sweater, looking at viewer, upper body, beanie, outdoors, night, turtleneck"
|
||||
|
||||
generator = torch.manual_seed(0)
|
||||
image = pipe(
|
||||
prompt=prompt, num_inference_steps=4, generator=generator, guidance_scale=1.0
|
||||
).images[0]
|
||||
```
|
||||
|
||||

|
||||
|
||||
|
||||
## Image-to-image
|
||||
|
||||
LCM-LoRA can be applied to image-to-image tasks too. Let's look at how we can perform image-to-image generation with LCMs. For this example we'll use the [dreamshaper-7](https://huggingface.co/Lykon/dreamshaper-7) model and the LCM-LoRA for `stable-diffusion-v1-5 `.
|
||||
|
||||
```python
|
||||
import torch
|
||||
from diffusers import AutoPipelineForImage2Image, LCMScheduler
|
||||
from diffusers.utils import make_image_grid, load_image
|
||||
|
||||
pipe = AutoPipelineForImage2Image.from_pretrained(
|
||||
"Lykon/dreamshaper-7",
|
||||
torch_dtype=torch.float16,
|
||||
variant="fp16",
|
||||
).to("cuda")
|
||||
|
||||
# set scheduler
|
||||
pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)
|
||||
|
||||
# load LCM-LoRA
|
||||
pipe.load_lora_weights("latent-consistency/lcm-lora-sdv1-5")
|
||||
|
||||
# prepare image
|
||||
url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/img2img-init.png"
|
||||
init_image = load_image(url)
|
||||
prompt = "Astronauts in a jungle, cold color palette, muted colors, detailed, 8k"
|
||||
|
||||
# pass prompt and image to pipeline
|
||||
generator = torch.manual_seed(0)
|
||||
image = pipe(
|
||||
prompt,
|
||||
image=init_image,
|
||||
num_inference_steps=4,
|
||||
guidance_scale=1,
|
||||
strength=0.6,
|
||||
generator=generator
|
||||
).images[0]
|
||||
make_image_grid([init_image, image], rows=1, cols=2)
|
||||
```
|
||||
|
||||

|
||||
|
||||
|
||||
<Tip>
|
||||
|
||||
You can get different results based on your prompt and the image you provide. To get the best results, we recommend trying different values for `num_inference_steps`, `strength`, and `guidance_scale` parameters and choose the best one.
|
||||
|
||||
</Tip>
|
||||
|
||||
|
||||
## Combine with styled LoRAs
|
||||
|
||||
LCM-LoRA can be combined with other LoRAs to generate styled-images in very few steps (4-8). In the following example, we'll use the LCM-LoRA with the [papercut LoRA](TheLastBen/Papercut_SDXL).
|
||||
To learn more about how to combine LoRAs, refer to [this guide](https://huggingface.co/docs/diffusers/tutorials/using_peft_for_inference#combine-multiple-adapters).
|
||||
|
||||
```python
|
||||
import torch
|
||||
from diffusers import DiffusionPipeline, LCMScheduler
|
||||
|
||||
pipe = DiffusionPipeline.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0",
|
||||
variant="fp16",
|
||||
torch_dtype=torch.float16
|
||||
).to("cuda")
|
||||
|
||||
# set scheduler
|
||||
pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)
|
||||
|
||||
# load LoRAs
|
||||
pipe.load_lora_weights("latent-consistency/lcm-lora-sdxl", adapter_name="lcm")
|
||||
pipe.load_lora_weights("TheLastBen/Papercut_SDXL", weight_name="papercut.safetensors", adapter_name="papercut")
|
||||
|
||||
# Combine LoRAs
|
||||
pipe.set_adapters(["lcm", "papercut"], adapter_weights=[1.0, 0.8])
|
||||
|
||||
prompt = "papercut, a cute fox"
|
||||
generator = torch.manual_seed(0)
|
||||
image = pipe(prompt, num_inference_steps=4, guidance_scale=1, generator=generator).images[0]
|
||||
image
|
||||
```
|
||||
|
||||

|
||||
|
||||
|
||||
## ControlNet/T2I-Adapter
|
||||
|
||||
Let's look at how we can perform inference with ControlNet/T2I-Adapter and LCM-LoRA.
|
||||
|
||||
### ControlNet
|
||||
For this example, we'll use the SD-v1-5 model and the LCM-LoRA for SD-v1-5 with canny ControlNet.
|
||||
|
||||
```python
|
||||
import torch
|
||||
import cv2
|
||||
import numpy as np
|
||||
from PIL import Image
|
||||
|
||||
from diffusers import StableDiffusionControlNetPipeline, ControlNetModel, LCMScheduler
|
||||
from diffusers.utils import load_image
|
||||
|
||||
image = load_image(
|
||||
"https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png"
|
||||
).resize((512, 512))
|
||||
|
||||
image = np.array(image)
|
||||
|
||||
low_threshold = 100
|
||||
high_threshold = 200
|
||||
|
||||
image = cv2.Canny(image, low_threshold, high_threshold)
|
||||
image = image[:, :, None]
|
||||
image = np.concatenate([image, image, image], axis=2)
|
||||
canny_image = Image.fromarray(image)
|
||||
|
||||
controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16)
|
||||
pipe = StableDiffusionControlNetPipeline.from_pretrained(
|
||||
"runwayml/stable-diffusion-v1-5",
|
||||
controlnet=controlnet,
|
||||
torch_dtype=torch.float16,
|
||||
safety_checker=None,
|
||||
variant="fp16"
|
||||
).to("cuda")
|
||||
|
||||
# set scheduler
|
||||
pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)
|
||||
|
||||
# load LCM-LoRA
|
||||
pipe.load_lora_weights("latent-consistency/lcm-lora-sdv1-5")
|
||||
|
||||
generator = torch.manual_seed(0)
|
||||
image = pipe(
|
||||
"the mona lisa",
|
||||
image=canny_image,
|
||||
num_inference_steps=4,
|
||||
guidance_scale=1.5,
|
||||
controlnet_conditioning_scale=0.8,
|
||||
cross_attention_kwargs={"scale": 1},
|
||||
generator=generator,
|
||||
).images[0]
|
||||
make_image_grid([canny_image, image], rows=1, cols=2)
|
||||
```
|
||||
|
||||

|
||||
|
||||
|
||||
<Tip>
|
||||
The inference parameters in this example might not work for all examples, so we recommend you to try different values for `num_inference_steps`, `guidance_scale`, `controlnet_conditioning_scale` and `cross_attention_kwargs` parameters and choose the best one.
|
||||
</Tip>
|
||||
|
||||
### T2I-Adapter
|
||||
|
||||
This example shows how to use the LCM-LoRA with the [Canny T2I-Adapter](TencentARC/t2i-adapter-canny-sdxl-1.0) and SDXL.
|
||||
|
||||
```python
|
||||
import torch
|
||||
import cv2
|
||||
import numpy as np
|
||||
from PIL import Image
|
||||
|
||||
from diffusers import StableDiffusionXLAdapterPipeline, T2IAdapter, LCMScheduler
|
||||
from diffusers.utils import load_image, make_image_grid
|
||||
|
||||
# Prepare image
|
||||
# Detect the canny map in low resolution to avoid high-frequency details
|
||||
image = load_image(
|
||||
"https://huggingface.co/Adapter/t2iadapter/resolve/main/figs_SDXLV1.0/org_canny.jpg"
|
||||
).resize((384, 384))
|
||||
|
||||
image = np.array(image)
|
||||
|
||||
low_threshold = 100
|
||||
high_threshold = 200
|
||||
|
||||
image = cv2.Canny(image, low_threshold, high_threshold)
|
||||
image = image[:, :, None]
|
||||
image = np.concatenate([image, image, image], axis=2)
|
||||
canny_image = Image.fromarray(image).resize((1024, 1024))
|
||||
|
||||
# load adapter
|
||||
adapter = T2IAdapter.from_pretrained("TencentARC/t2i-adapter-canny-sdxl-1.0", torch_dtype=torch.float16, varient="fp16").to("cuda")
|
||||
|
||||
pipe = StableDiffusionXLAdapterPipeline.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0",
|
||||
adapter=adapter,
|
||||
torch_dtype=torch.float16,
|
||||
variant="fp16",
|
||||
).to("cuda")
|
||||
|
||||
# set scheduler
|
||||
pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)
|
||||
|
||||
# load LCM-LoRA
|
||||
pipe.load_lora_weights("latent-consistency/lcm-lora-sdxl")
|
||||
|
||||
prompt = "Mystical fairy in real, magic, 4k picture, high quality"
|
||||
negative_prompt = "extra digit, fewer digits, cropped, worst quality, low quality, glitch, deformed, mutated, ugly, disfigured"
|
||||
|
||||
generator = torch.manual_seed(0)
|
||||
image = pipe(
|
||||
prompt=prompt,
|
||||
negative_prompt=negative_prompt,
|
||||
image=canny_image,
|
||||
num_inference_steps=4,
|
||||
guidance_scale=1.5,
|
||||
adapter_conditioning_scale=0.8,
|
||||
adapter_conditioning_factor=1,
|
||||
generator=generator,
|
||||
).images[0]
|
||||
make_image_grid([canny_image, image], rows=1, cols=2)
|
||||
```
|
||||
|
||||

|
||||
|
||||
|
||||
## Inpainting
|
||||
|
||||
LCM-LoRA can be used for inpainting as well.
|
||||
|
||||
```python
|
||||
import torch
|
||||
from diffusers import AutoPipelineForInpainting, LCMScheduler
|
||||
from diffusers.utils import load_image, make_image_grid
|
||||
|
||||
pipe = AutoPipelineForInpainting.from_pretrained(
|
||||
"runwayml/stable-diffusion-inpainting",
|
||||
torch_dtype=torch.float16,
|
||||
variant="fp16",
|
||||
).to("cuda")
|
||||
|
||||
# set scheduler
|
||||
pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)
|
||||
|
||||
# load LCM-LoRA
|
||||
pipe.load_lora_weights("latent-consistency/lcm-lora-sdv1-5")
|
||||
|
||||
# load base and mask image
|
||||
init_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png")
|
||||
mask_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint_mask.png")
|
||||
|
||||
# generator = torch.Generator("cuda").manual_seed(92)
|
||||
prompt = "concept art digital painting of an elven castle, inspired by lord of the rings, highly detailed, 8k"
|
||||
generator = torch.manual_seed(0)
|
||||
image = pipe(
|
||||
prompt=prompt,
|
||||
image=init_image,
|
||||
mask_image=mask_image,
|
||||
generator=generator,
|
||||
num_inference_steps=4,
|
||||
guidance_scale=4,
|
||||
).images[0]
|
||||
make_image_grid([init_image, mask_image, image], rows=1, cols=3)
|
||||
```
|
||||
|
||||

|
||||
|
||||
|
||||
## AnimateDiff
|
||||
|
||||
[`AnimateDiff`] allows you to animate images using Stable Diffusion models. To get good results, we need to generate multiple frames (16-24), and doing this with standard SD models can be very slow.
|
||||
LCM-LoRA can be used to speed up the process significantly, as you just need to do 4-8 steps for each frame. Let's look at how we can perform animation with LCM-LoRA and AnimateDiff.
|
||||
|
||||
```python
|
||||
import torch
|
||||
from diffusers import MotionAdapter, AnimateDiffPipeline, DDIMScheduler, LCMScheduler
|
||||
from diffusers.utils import export_to_gif
|
||||
|
||||
adapter = MotionAdapter.from_pretrained("diffusers/animatediff-motion-adapter-v1-5")
|
||||
pipe = AnimateDiffPipeline.from_pretrained(
|
||||
"frankjoshua/toonyou_beta6",
|
||||
motion_adapter=adapter,
|
||||
).to("cuda")
|
||||
|
||||
# set scheduler
|
||||
pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)
|
||||
|
||||
# load LCM-LoRA
|
||||
pipe.load_lora_weights("latent-consistency/lcm-lora-sdv1-5", adapter_name="lcm")
|
||||
pipe.load_lora_weights("guoyww/animatediff-motion-lora-zoom-in", weight_name="diffusion_pytorch_model.safetensors", adapter_name="motion-lora")
|
||||
|
||||
pipe.set_adapters(["lcm", "motion-lora"], adapter_weights=[0.55, 1.2])
|
||||
|
||||
prompt = "best quality, masterpiece, 1girl, looking at viewer, blurry background, upper body, contemporary, dress"
|
||||
generator = torch.manual_seed(0)
|
||||
frames = pipe(
|
||||
prompt=prompt,
|
||||
num_inference_steps=5,
|
||||
guidance_scale=1.25,
|
||||
cross_attention_kwargs={"scale": 1},
|
||||
num_frames=24,
|
||||
generator=generator
|
||||
).frames[0]
|
||||
export_to_gif(frames, "animation.gif")
|
||||
```
|
||||
|
||||

|
||||
438
docs/source/en/using-diffusers/inference_with_tcd_lora.md
Normal file
438
docs/source/en/using-diffusers/inference_with_tcd_lora.md
Normal file
@@ -0,0 +1,438 @@
|
||||
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
[[open-in-colab]]
|
||||
|
||||
# Trajectory Consistency Distillation-LoRA
|
||||
|
||||
Trajectory Consistency Distillation (TCD) enables a model to generate higher quality and more detailed images with fewer steps. Moreover, owing to the effective error mitigation during the distillation process, TCD demonstrates superior performance even under conditions of large inference steps.
|
||||
|
||||
The major advantages of TCD are:
|
||||
|
||||
- Better than Teacher: TCD demonstrates superior generative quality at both small and large inference steps and exceeds the performance of [DPM-Solver++(2S)](../../api/schedulers/multistep_dpm_solver) with Stable Diffusion XL (SDXL). There is no additional discriminator or LPIPS supervision included during TCD training.
|
||||
|
||||
- Flexible Inference Steps: The inference steps for TCD sampling can be freely adjusted without adversely affecting the image quality.
|
||||
|
||||
- Freely change detail level: During inference, the level of detail in the image can be adjusted with a single hyperparameter, *gamma*.
|
||||
|
||||
> [!TIP]
|
||||
> For more technical details of TCD, please refer to the [paper](https://arxiv.org/abs/2402.19159) or official [project page](https://mhh0318.github.io/tcd/)).
|
||||
|
||||
For large models like SDXL, TCD is trained with [LoRA](https://huggingface.co/docs/peft/conceptual_guides/adapter#low-rank-adaptation-lora) to reduce memory usage. This is also useful because you can reuse LoRAs between different finetuned models, as long as they share the same base model, without further training.
|
||||
|
||||
|
||||
|
||||
This guide will show you how to perform inference with TCD-LoRAs for a variety of tasks like text-to-image and inpainting, as well as how you can easily combine TCD-LoRAs with other adapters. Choose one of the supported base model and it's corresponding TCD-LoRA checkpoint from the table below to get started.
|
||||
|
||||
| Base model | TCD-LoRA checkpoint |
|
||||
|-------------------------------------------------------------------------------------------------|----------------------------------------------------------------|
|
||||
| [stable-diffusion-v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5) | [TCD-SD15](https://huggingface.co/h1t/TCD-SD15-LoRA) |
|
||||
| [stable-diffusion-2-1-base](https://huggingface.co/stabilityai/stable-diffusion-2-1-base) | [TCD-SD21-base](https://huggingface.co/h1t/TCD-SD21-base-LoRA) |
|
||||
| [stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) | [TCD-SDXL](https://huggingface.co/h1t/TCD-SDXL-LoRA) |
|
||||
|
||||
|
||||
Make sure you have [PEFT](https://github.com/huggingface/peft) installed for better LoRA support.
|
||||
|
||||
```bash
|
||||
pip install -U peft
|
||||
```
|
||||
|
||||
## General tasks
|
||||
|
||||
In this guide, let's use the [`StableDiffusionXLPipeline`] and the [`TCDScheduler`]. Use the [`~StableDiffusionPipeline.load_lora_weights`] method to load the SDXL-compatible TCD-LoRA weights.
|
||||
|
||||
A few tips to keep in mind for TCD-LoRA inference are to:
|
||||
|
||||
- Keep the `num_inference_steps` between 4 and 50
|
||||
- Set `eta` (used to control stochasticity at each step) between 0 and 1. You should use a higher `eta` when increasing the number of inference steps, but the downside is that a larger `eta` in [`TCDScheduler`] leads to blurrier images. A value of 0.3 is recommended to produce good results.
|
||||
|
||||
<hfoptions id="tasks">
|
||||
<hfoption id="text-to-image">
|
||||
|
||||
```python
|
||||
import torch
|
||||
from diffusers import StableDiffusionXLPipeline, TCDScheduler
|
||||
|
||||
device = "cuda"
|
||||
base_model_id = "stabilityai/stable-diffusion-xl-base-1.0"
|
||||
tcd_lora_id = "h1t/TCD-SDXL-LoRA"
|
||||
|
||||
pipe = StableDiffusionXLPipeline.from_pretrained(base_model_id, torch_dtype=torch.float16, variant="fp16").to(device)
|
||||
pipe.scheduler = TCDScheduler.from_config(pipe.scheduler.config)
|
||||
|
||||
pipe.load_lora_weights(tcd_lora_id)
|
||||
pipe.fuse_lora()
|
||||
|
||||
prompt = "Painting of the orange cat Otto von Garfield, Count of Bismarck-Schönhausen, Duke of Lauenburg, Minister-President of Prussia. Depicted wearing a Prussian Pickelhaube and eating his favorite meal - lasagna."
|
||||
|
||||
image = pipe(
|
||||
prompt=prompt,
|
||||
num_inference_steps=4,
|
||||
guidance_scale=0,
|
||||
eta=0.3,
|
||||
generator=torch.Generator(device=device).manual_seed(0),
|
||||
).images[0]
|
||||
```
|
||||
|
||||

|
||||
|
||||
</hfoption>
|
||||
|
||||
<hfoption id="inpainting">
|
||||
|
||||
```python
|
||||
import torch
|
||||
from diffusers import AutoPipelineForInpainting, TCDScheduler
|
||||
from diffusers.utils import load_image, make_image_grid
|
||||
|
||||
device = "cuda"
|
||||
base_model_id = "diffusers/stable-diffusion-xl-1.0-inpainting-0.1"
|
||||
tcd_lora_id = "h1t/TCD-SDXL-LoRA"
|
||||
|
||||
pipe = AutoPipelineForInpainting.from_pretrained(base_model_id, torch_dtype=torch.float16, variant="fp16").to(device)
|
||||
pipe.scheduler = TCDScheduler.from_config(pipe.scheduler.config)
|
||||
|
||||
pipe.load_lora_weights(tcd_lora_id)
|
||||
pipe.fuse_lora()
|
||||
|
||||
img_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png"
|
||||
mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png"
|
||||
|
||||
init_image = load_image(img_url).resize((1024, 1024))
|
||||
mask_image = load_image(mask_url).resize((1024, 1024))
|
||||
|
||||
prompt = "a tiger sitting on a park bench"
|
||||
|
||||
image = pipe(
|
||||
prompt=prompt,
|
||||
image=init_image,
|
||||
mask_image=mask_image,
|
||||
num_inference_steps=8,
|
||||
guidance_scale=0,
|
||||
eta=0.3,
|
||||
strength=0.99, # make sure to use `strength` below 1.0
|
||||
generator=torch.Generator(device=device).manual_seed(0),
|
||||
).images[0]
|
||||
|
||||
grid_image = make_image_grid([init_image, mask_image, image], rows=1, cols=3)
|
||||
```
|
||||
|
||||

|
||||
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
## Community models
|
||||
|
||||
TCD-LoRA also works with many community finetuned models and plugins. For example, load the [animagine-xl-3.0](https://huggingface.co/cagliostrolab/animagine-xl-3.0) checkpoint which is a community finetuned version of SDXL for generating anime images.
|
||||
|
||||
```python
|
||||
import torch
|
||||
from diffusers import StableDiffusionXLPipeline, TCDScheduler
|
||||
|
||||
device = "cuda"
|
||||
base_model_id = "cagliostrolab/animagine-xl-3.0"
|
||||
tcd_lora_id = "h1t/TCD-SDXL-LoRA"
|
||||
|
||||
pipe = StableDiffusionXLPipeline.from_pretrained(base_model_id, torch_dtype=torch.float16, variant="fp16").to(device)
|
||||
pipe.scheduler = TCDScheduler.from_config(pipe.scheduler.config)
|
||||
|
||||
pipe.load_lora_weights(tcd_lora_id)
|
||||
pipe.fuse_lora()
|
||||
|
||||
prompt = "A man, clad in a meticulously tailored military uniform, stands with unwavering resolve. The uniform boasts intricate details, and his eyes gleam with determination. Strands of vibrant, windswept hair peek out from beneath the brim of his cap."
|
||||
|
||||
image = pipe(
|
||||
prompt=prompt,
|
||||
num_inference_steps=8,
|
||||
guidance_scale=0,
|
||||
eta=0.3,
|
||||
generator=torch.Generator(device=device).manual_seed(0),
|
||||
).images[0]
|
||||
```
|
||||
|
||||

|
||||
|
||||
TCD-LoRA also supports other LoRAs trained on different styles. For example, let's load the [TheLastBen/Papercut_SDXL](https://huggingface.co/TheLastBen/Papercut_SDXL) LoRA and fuse it with the TCD-LoRA with the [`~loaders.UNet2DConditionLoadersMixin.set_adapters`] method.
|
||||
|
||||
> [!TIP]
|
||||
> Check out the [Merge LoRAs](merge_loras) guide to learn more about efficient merging methods.
|
||||
|
||||
```python
|
||||
import torch
|
||||
from diffusers import StableDiffusionXLPipeline
|
||||
from scheduling_tcd import TCDScheduler
|
||||
|
||||
device = "cuda"
|
||||
base_model_id = "stabilityai/stable-diffusion-xl-base-1.0"
|
||||
tcd_lora_id = "h1t/TCD-SDXL-LoRA"
|
||||
styled_lora_id = "TheLastBen/Papercut_SDXL"
|
||||
|
||||
pipe = StableDiffusionXLPipeline.from_pretrained(base_model_id, torch_dtype=torch.float16, variant="fp16").to(device)
|
||||
pipe.scheduler = TCDScheduler.from_config(pipe.scheduler.config)
|
||||
|
||||
pipe.load_lora_weights(tcd_lora_id, adapter_name="tcd")
|
||||
pipe.load_lora_weights(styled_lora_id, adapter_name="style")
|
||||
pipe.set_adapters(["tcd", "style"], adapter_weights=[1.0, 1.0])
|
||||
|
||||
prompt = "papercut of a winter mountain, snow"
|
||||
|
||||
image = pipe(
|
||||
prompt=prompt,
|
||||
num_inference_steps=4,
|
||||
guidance_scale=0,
|
||||
eta=0.3,
|
||||
generator=torch.Generator(device=device).manual_seed(0),
|
||||
).images[0]
|
||||
```
|
||||
|
||||

|
||||
|
||||
|
||||
## Adapters
|
||||
|
||||
TCD-LoRA is very versatile, and it can be combined with other adapter types like ControlNets, IP-Adapter, and AnimateDiff.
|
||||
|
||||
<hfoptions id="adapters">
|
||||
<hfoption id="ControlNet">
|
||||
|
||||
### Depth ControlNet
|
||||
|
||||
```python
|
||||
import torch
|
||||
import numpy as np
|
||||
from PIL import Image
|
||||
from transformers import DPTFeatureExtractor, DPTForDepthEstimation
|
||||
from diffusers import ControlNetModel, StableDiffusionXLControlNetPipeline
|
||||
from diffusers.utils import load_image, make_image_grid
|
||||
from scheduling_tcd import TCDScheduler
|
||||
|
||||
device = "cuda"
|
||||
depth_estimator = DPTForDepthEstimation.from_pretrained("Intel/dpt-hybrid-midas").to(device)
|
||||
feature_extractor = DPTFeatureExtractor.from_pretrained("Intel/dpt-hybrid-midas")
|
||||
|
||||
def get_depth_map(image):
|
||||
image = feature_extractor(images=image, return_tensors="pt").pixel_values.to(device)
|
||||
with torch.no_grad(), torch.autocast(device):
|
||||
depth_map = depth_estimator(image).predicted_depth
|
||||
|
||||
depth_map = torch.nn.functional.interpolate(
|
||||
depth_map.unsqueeze(1),
|
||||
size=(1024, 1024),
|
||||
mode="bicubic",
|
||||
align_corners=False,
|
||||
)
|
||||
depth_min = torch.amin(depth_map, dim=[1, 2, 3], keepdim=True)
|
||||
depth_max = torch.amax(depth_map, dim=[1, 2, 3], keepdim=True)
|
||||
depth_map = (depth_map - depth_min) / (depth_max - depth_min)
|
||||
image = torch.cat([depth_map] * 3, dim=1)
|
||||
|
||||
image = image.permute(0, 2, 3, 1).cpu().numpy()[0]
|
||||
image = Image.fromarray((image * 255.0).clip(0, 255).astype(np.uint8))
|
||||
return image
|
||||
|
||||
base_model_id = "stabilityai/stable-diffusion-xl-base-1.0"
|
||||
controlnet_id = "diffusers/controlnet-depth-sdxl-1.0"
|
||||
tcd_lora_id = "h1t/TCD-SDXL-LoRA"
|
||||
|
||||
controlnet = ControlNetModel.from_pretrained(
|
||||
controlnet_id,
|
||||
torch_dtype=torch.float16,
|
||||
variant="fp16",
|
||||
).to(device)
|
||||
pipe = StableDiffusionXLControlNetPipeline.from_pretrained(
|
||||
base_model_id,
|
||||
controlnet=controlnet,
|
||||
torch_dtype=torch.float16,
|
||||
variant="fp16",
|
||||
).to(device)
|
||||
pipe.enable_model_cpu_offload()
|
||||
|
||||
pipe.scheduler = TCDScheduler.from_config(pipe.scheduler.config)
|
||||
|
||||
pipe.load_lora_weights(tcd_lora_id)
|
||||
pipe.fuse_lora()
|
||||
|
||||
prompt = "stormtrooper lecture, photorealistic"
|
||||
|
||||
image = load_image("https://huggingface.co/lllyasviel/sd-controlnet-depth/resolve/main/images/stormtrooper.png")
|
||||
depth_image = get_depth_map(image)
|
||||
|
||||
controlnet_conditioning_scale = 0.5 # recommended for good generalization
|
||||
|
||||
image = pipe(
|
||||
prompt,
|
||||
image=depth_image,
|
||||
num_inference_steps=4,
|
||||
guidance_scale=0,
|
||||
eta=0.3,
|
||||
controlnet_conditioning_scale=controlnet_conditioning_scale,
|
||||
generator=torch.Generator(device=device).manual_seed(0),
|
||||
).images[0]
|
||||
|
||||
grid_image = make_image_grid([depth_image, image], rows=1, cols=2)
|
||||
```
|
||||
|
||||

|
||||
|
||||
### Canny ControlNet
|
||||
```python
|
||||
import torch
|
||||
from diffusers import ControlNetModel, StableDiffusionXLControlNetPipeline
|
||||
from diffusers.utils import load_image, make_image_grid
|
||||
from scheduling_tcd import TCDScheduler
|
||||
|
||||
device = "cuda"
|
||||
base_model_id = "stabilityai/stable-diffusion-xl-base-1.0"
|
||||
controlnet_id = "diffusers/controlnet-canny-sdxl-1.0"
|
||||
tcd_lora_id = "h1t/TCD-SDXL-LoRA"
|
||||
|
||||
controlnet = ControlNetModel.from_pretrained(
|
||||
controlnet_id,
|
||||
torch_dtype=torch.float16,
|
||||
variant="fp16",
|
||||
).to(device)
|
||||
pipe = StableDiffusionXLControlNetPipeline.from_pretrained(
|
||||
base_model_id,
|
||||
controlnet=controlnet,
|
||||
torch_dtype=torch.float16,
|
||||
variant="fp16",
|
||||
).to(device)
|
||||
pipe.enable_model_cpu_offload()
|
||||
|
||||
pipe.scheduler = TCDScheduler.from_config(pipe.scheduler.config)
|
||||
|
||||
pipe.load_lora_weights(tcd_lora_id)
|
||||
pipe.fuse_lora()
|
||||
|
||||
prompt = "ultrarealistic shot of a furry blue bird"
|
||||
|
||||
canny_image = load_image("https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png")
|
||||
|
||||
controlnet_conditioning_scale = 0.5 # recommended for good generalization
|
||||
|
||||
image = pipe(
|
||||
prompt,
|
||||
image=canny_image,
|
||||
num_inference_steps=4,
|
||||
guidance_scale=0,
|
||||
eta=0.3,
|
||||
controlnet_conditioning_scale=controlnet_conditioning_scale,
|
||||
generator=torch.Generator(device=device).manual_seed(0),
|
||||
).images[0]
|
||||
|
||||
grid_image = make_image_grid([canny_image, image], rows=1, cols=2)
|
||||
```
|
||||

|
||||
|
||||
<Tip>
|
||||
The inference parameters in this example might not work for all examples, so we recommend you to try different values for `num_inference_steps`, `guidance_scale`, `controlnet_conditioning_scale` and `cross_attention_kwargs` parameters and choose the best one.
|
||||
</Tip>
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="IP-Adapter">
|
||||
|
||||
This example shows how to use the TCD-LoRA with the [IP-Adapter](https://github.com/tencent-ailab/IP-Adapter/tree/main) and SDXL.
|
||||
|
||||
```python
|
||||
import torch
|
||||
from diffusers import StableDiffusionXLPipeline
|
||||
from diffusers.utils import load_image, make_image_grid
|
||||
|
||||
from ip_adapter import IPAdapterXL
|
||||
from scheduling_tcd import TCDScheduler
|
||||
|
||||
device = "cuda"
|
||||
base_model_path = "stabilityai/stable-diffusion-xl-base-1.0"
|
||||
image_encoder_path = "sdxl_models/image_encoder"
|
||||
ip_ckpt = "sdxl_models/ip-adapter_sdxl.bin"
|
||||
tcd_lora_id = "h1t/TCD-SDXL-LoRA"
|
||||
|
||||
pipe = StableDiffusionXLPipeline.from_pretrained(
|
||||
base_model_path,
|
||||
torch_dtype=torch.float16,
|
||||
variant="fp16"
|
||||
)
|
||||
pipe.scheduler = TCDScheduler.from_config(pipe.scheduler.config)
|
||||
|
||||
pipe.load_lora_weights(tcd_lora_id)
|
||||
pipe.fuse_lora()
|
||||
|
||||
ip_model = IPAdapterXL(pipe, image_encoder_path, ip_ckpt, device)
|
||||
|
||||
ref_image = load_image("https://raw.githubusercontent.com/tencent-ailab/IP-Adapter/main/assets/images/woman.png").resize((512, 512))
|
||||
|
||||
prompt = "best quality, high quality, wearing sunglasses"
|
||||
|
||||
image = ip_model.generate(
|
||||
pil_image=ref_image,
|
||||
prompt=prompt,
|
||||
scale=0.5,
|
||||
num_samples=1,
|
||||
num_inference_steps=4,
|
||||
guidance_scale=0,
|
||||
eta=0.3,
|
||||
seed=0,
|
||||
)[0]
|
||||
|
||||
grid_image = make_image_grid([ref_image, image], rows=1, cols=2)
|
||||
```
|
||||
|
||||

|
||||
|
||||
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="AnimateDiff">
|
||||
|
||||
[`AnimateDiff`] allows animating images using Stable Diffusion models. TCD-LoRA can substantially accelerate the process without degrading image quality. The quality of animation with TCD-LoRA and AnimateDiff has a more lucid outcome.
|
||||
|
||||
```python
|
||||
import torch
|
||||
from diffusers import MotionAdapter, AnimateDiffPipeline, DDIMScheduler
|
||||
from scheduling_tcd import TCDScheduler
|
||||
from diffusers.utils import export_to_gif
|
||||
|
||||
adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5")
|
||||
pipe = AnimateDiffPipeline.from_pretrained(
|
||||
"frankjoshua/toonyou_beta6",
|
||||
motion_adapter=adapter,
|
||||
).to("cuda")
|
||||
|
||||
# set TCDScheduler
|
||||
pipe.scheduler = TCDScheduler.from_config(pipe.scheduler.config)
|
||||
|
||||
# load TCD LoRA
|
||||
pipe.load_lora_weights("h1t/TCD-SD15-LoRA", adapter_name="tcd")
|
||||
pipe.load_lora_weights("guoyww/animatediff-motion-lora-zoom-in", weight_name="diffusion_pytorch_model.safetensors", adapter_name="motion-lora")
|
||||
|
||||
pipe.set_adapters(["tcd", "motion-lora"], adapter_weights=[1.0, 1.2])
|
||||
|
||||
prompt = "best quality, masterpiece, 1girl, looking at viewer, blurry background, upper body, contemporary, dress"
|
||||
generator = torch.manual_seed(0)
|
||||
frames = pipe(
|
||||
prompt=prompt,
|
||||
num_inference_steps=5,
|
||||
guidance_scale=0,
|
||||
cross_attention_kwargs={"scale": 1},
|
||||
num_frames=24,
|
||||
eta=0.3,
|
||||
generator=generator
|
||||
).frames[0]
|
||||
export_to_gif(frames, "animation.gif")
|
||||
```
|
||||
|
||||

|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
744
docs/source/en/using-diffusers/ip_adapter.md
Normal file
744
docs/source/en/using-diffusers/ip_adapter.md
Normal file
@@ -0,0 +1,744 @@
|
||||
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# IP-Adapter
|
||||
|
||||
[IP-Adapter](https://hf.co/papers/2308.06721) is an image prompt adapter that can be plugged into diffusion models to enable image prompting without any changes to the underlying model. Furthermore, this adapter can be reused with other models finetuned from the same base model and it can be combined with other adapters like [ControlNet](../using-diffusers/controlnet). The key idea behind IP-Adapter is the *decoupled cross-attention* mechanism which adds a separate cross-attention layer just for image features instead of using the same cross-attention layer for both text and image features. This allows the model to learn more image-specific features.
|
||||
|
||||
> [!TIP]
|
||||
> Learn how to load an IP-Adapter in the [Load adapters](../using-diffusers/loading_adapters#ip-adapter) guide, and make sure you check out the [IP-Adapter Plus](../using-diffusers/loading_adapters#ip-adapter-plus) section which requires manually loading the image encoder.
|
||||
|
||||
This guide will walk you through using IP-Adapter for various tasks and use cases.
|
||||
|
||||
## General tasks
|
||||
|
||||
Let's take a look at how to use IP-Adapter's image prompting capabilities with the [`StableDiffusionXLPipeline`] for tasks like text-to-image, image-to-image, and inpainting. We also encourage you to try out other pipelines such as Stable Diffusion, LCM-LoRA, ControlNet, T2I-Adapter, or AnimateDiff!
|
||||
|
||||
In all the following examples, you'll see the [`~loaders.IPAdapterMixin.set_ip_adapter_scale`] method. This method controls the amount of text or image conditioning to apply to the model. A value of `1.0` means the model is only conditioned on the image prompt. Lowering this value encourages the model to produce more diverse images, but they may not be as aligned with the image prompt. Typically, a value of `0.5` achieves a good balance between the two prompt types and produces good results.
|
||||
|
||||
> [!TIP]
|
||||
> In the examples below, try adding `low_cpu_mem_usage=True` to the [`~loaders.IPAdapterMixin.load_ip_adapter`] method to speed up the loading time.
|
||||
|
||||
<hfoptions id="tasks">
|
||||
<hfoption id="Text-to-image">
|
||||
|
||||
Crafting the precise text prompt to generate the image you want can be difficult because it may not always capture what you'd like to express. Adding an image alongside the text prompt helps the model better understand what it should generate and can lead to more accurate results.
|
||||
|
||||
Load a Stable Diffusion XL (SDXL) model and insert an IP-Adapter into the model with the [`~loaders.IPAdapterMixin.load_ip_adapter`] method. Use the `subfolder` parameter to load the SDXL model weights.
|
||||
|
||||
```py
|
||||
from diffusers import AutoPipelineForText2Image
|
||||
from diffusers.utils import load_image
|
||||
import torch
|
||||
|
||||
pipeline = AutoPipelineForText2Image.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16).to("cuda")
|
||||
pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="sdxl_models", weight_name="ip-adapter_sdxl.bin")
|
||||
pipeline.set_ip_adapter_scale(0.6)
|
||||
```
|
||||
|
||||
Create a text prompt and load an image prompt before passing them to the pipeline to generate an image.
|
||||
|
||||
```py
|
||||
image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_adapter_diner.png")
|
||||
generator = torch.Generator(device="cpu").manual_seed(0)
|
||||
images = pipeline(
|
||||
prompt="a polar bear sitting in a chair drinking a milkshake",
|
||||
ip_adapter_image=image,
|
||||
negative_prompt="deformed, ugly, wrong proportion, low res, bad anatomy, worst quality, low quality",
|
||||
num_inference_steps=100,
|
||||
generator=generator,
|
||||
).images
|
||||
images[0]
|
||||
```
|
||||
|
||||
<div class="flex flex-row gap-4">
|
||||
<div class="flex-1">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_adapter_diner.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">IP-Adapter image</figcaption>
|
||||
</div>
|
||||
<div class="flex-1">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_adapter_diner_2.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">generated image</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="Image-to-image">
|
||||
|
||||
IP-Adapter can also help with image-to-image by guiding the model to generate an image that resembles the original image and the image prompt.
|
||||
|
||||
Load a Stable Diffusion XL (SDXL) model and insert an IP-Adapter into the model with the [`~loaders.IPAdapterMixin.load_ip_adapter`] method. Use the `subfolder` parameter to load the SDXL model weights.
|
||||
|
||||
```py
|
||||
from diffusers import AutoPipelineForImage2Image
|
||||
from diffusers.utils import load_image
|
||||
import torch
|
||||
|
||||
pipeline = AutoPipelineForImage2Image.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16).to("cuda")
|
||||
pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="sdxl_models", weight_name="ip-adapter_sdxl.bin")
|
||||
pipeline.set_ip_adapter_scale(0.6)
|
||||
```
|
||||
|
||||
Pass the original image and the IP-Adapter image prompt to the pipeline to generate an image. Providing a text prompt to the pipeline is optional, but in this example, a text prompt is used to increase image quality.
|
||||
|
||||
```py
|
||||
image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_adapter_bear_1.png")
|
||||
ip_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_adapter_bear_2.png")
|
||||
|
||||
generator = torch.Generator(device="cpu").manual_seed(4)
|
||||
images = pipeline(
|
||||
prompt="best quality, high quality",
|
||||
image=image,
|
||||
ip_adapter_image=ip_image,
|
||||
generator=generator,
|
||||
strength=0.6,
|
||||
).images
|
||||
images[0]
|
||||
```
|
||||
|
||||
<div class="flex gap-4">
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_adapter_bear_1.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">original image</figcaption>
|
||||
</div>
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_adapter_bear_2.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">IP-Adapter image</figcaption>
|
||||
</div>
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_adapter_bear_3.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">generated image</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="Inpainting">
|
||||
|
||||
IP-Adapter is also useful for inpainting because the image prompt allows you to be much more specific about what you'd like to generate.
|
||||
|
||||
Load a Stable Diffusion XL (SDXL) model and insert an IP-Adapter into the model with the [`~loaders.IPAdapterMixin.load_ip_adapter`] method. Use the `subfolder` parameter to load the SDXL model weights.
|
||||
|
||||
```py
|
||||
from diffusers import AutoPipelineForInpainting
|
||||
from diffusers.utils import load_image
|
||||
import torch
|
||||
|
||||
pipeline = AutoPipelineForInpainting.from_pretrained("diffusers/stable-diffusion-xl-1.0-inpainting-0.1", torch_dtype=torch.float16).to("cuda")
|
||||
pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="sdxl_models", weight_name="ip-adapter_sdxl.bin")
|
||||
pipeline.set_ip_adapter_scale(0.6)
|
||||
```
|
||||
|
||||
Pass a prompt, the original image, mask image, and the IP-Adapter image prompt to the pipeline to generate an image.
|
||||
|
||||
```py
|
||||
mask_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_adapter_mask.png")
|
||||
image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_adapter_bear_1.png")
|
||||
ip_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_adapter_gummy.png")
|
||||
|
||||
generator = torch.Generator(device="cpu").manual_seed(4)
|
||||
images = pipeline(
|
||||
prompt="a cute gummy bear waving",
|
||||
image=image,
|
||||
mask_image=mask_image,
|
||||
ip_adapter_image=ip_image,
|
||||
generator=generator,
|
||||
num_inference_steps=100,
|
||||
).images
|
||||
images[0]
|
||||
```
|
||||
|
||||
<div class="flex gap-4">
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_adapter_bear_1.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">original image</figcaption>
|
||||
</div>
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_adapter_gummy.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">IP-Adapter image</figcaption>
|
||||
</div>
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_adapter_inpaint.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">generated image</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="Video">
|
||||
|
||||
IP-Adapter can also help you generate videos that are more aligned with your text prompt. For example, let's load [AnimateDiff](../api/pipelines/animatediff) with its motion adapter and insert an IP-Adapter into the model with the [`~loaders.IPAdapterMixin.load_ip_adapter`] method.
|
||||
|
||||
> [!WARNING]
|
||||
> If you're planning on offloading the model to the CPU, make sure you run it after you've loaded the IP-Adapter. When you call [`~DiffusionPipeline.enable_model_cpu_offload`] before loading the IP-Adapter, it offloads the image encoder module to the CPU and it'll return an error when you try to run the pipeline.
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import AnimateDiffPipeline, DDIMScheduler, MotionAdapter
|
||||
from diffusers.utils import export_to_gif
|
||||
from diffusers.utils import load_image
|
||||
|
||||
adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2", torch_dtype=torch.float16)
|
||||
pipeline = AnimateDiffPipeline.from_pretrained("emilianJR/epiCRealism", motion_adapter=adapter, torch_dtype=torch.float16)
|
||||
scheduler = DDIMScheduler.from_pretrained(
|
||||
"emilianJR/epiCRealism",
|
||||
subfolder="scheduler",
|
||||
clip_sample=False,
|
||||
timestep_spacing="linspace",
|
||||
beta_schedule="linear",
|
||||
steps_offset=1,
|
||||
)
|
||||
pipeline.scheduler = scheduler
|
||||
pipeline.enable_vae_slicing()
|
||||
|
||||
pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin")
|
||||
pipeline.enable_model_cpu_offload()
|
||||
```
|
||||
|
||||
Pass a prompt and an image prompt to the pipeline to generate a short video.
|
||||
|
||||
```py
|
||||
ip_adapter_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_adapter_inpaint.png")
|
||||
|
||||
output = pipeline(
|
||||
prompt="A cute gummy bear waving",
|
||||
negative_prompt="bad quality, worse quality, low resolution",
|
||||
ip_adapter_image=ip_adapter_image,
|
||||
num_frames=16,
|
||||
guidance_scale=7.5,
|
||||
num_inference_steps=50,
|
||||
generator=torch.Generator(device="cpu").manual_seed(0),
|
||||
)
|
||||
frames = output.frames[0]
|
||||
export_to_gif(frames, "gummy_bear.gif")
|
||||
```
|
||||
|
||||
<div class="flex flex-row gap-4">
|
||||
<div class="flex-1">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_adapter_inpaint.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">IP-Adapter image</figcaption>
|
||||
</div>
|
||||
<div class="flex-1">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/gummy_bear.gif"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">generated video</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
## Configure parameters
|
||||
|
||||
There are a couple of IP-Adapter parameters that are useful to know about and can help you with your image generation tasks. These parameters can make your workflow more efficient or give you more control over image generation.
|
||||
|
||||
### Image embeddings
|
||||
|
||||
IP-Adapter enabled pipelines provide the `ip_adapter_image_embeds` parameter to accept precomputed image embeddings. This is particularly useful in scenarios where you need to run the IP-Adapter pipeline multiple times because you have more than one image. For example, [multi IP-Adapter](#multi-ip-adapter) is a specific use case where you provide multiple styling images to generate a specific image in a specific style. Loading and encoding multiple images each time you use the pipeline would be inefficient. Instead, you can precompute and save the image embeddings to disk (which can save a lot of space if you're using high-quality images) and load them when you need them.
|
||||
|
||||
> [!TIP]
|
||||
> This parameter also gives you the flexibility to load embeddings from other sources. For example, ComfyUI image embeddings for IP-Adapters are compatible with Diffusers and should work ouf-of-the-box!
|
||||
|
||||
Call the [`~StableDiffusionPipeline.prepare_ip_adapter_image_embeds`] method to encode and generate the image embeddings. Then you can save them to disk with `torch.save`.
|
||||
|
||||
> [!TIP]
|
||||
> If you're using IP-Adapter with `ip_adapter_image_embedding` instead of `ip_adapter_image`', you can set `load_ip_adapter(image_encoder_folder=None,...)` because you don't need to load an encoder to generate the image embeddings.
|
||||
|
||||
```py
|
||||
image_embeds = pipeline.prepare_ip_adapter_image_embeds(
|
||||
ip_adapter_image=image,
|
||||
ip_adapter_image_embeds=None,
|
||||
device="cuda",
|
||||
num_images_per_prompt=1,
|
||||
do_classifier_free_guidance=True,
|
||||
)
|
||||
|
||||
torch.save(image_embeds, "image_embeds.ipadpt")
|
||||
```
|
||||
|
||||
Now load the image embeddings by passing them to the `ip_adapter_image_embeds` parameter.
|
||||
|
||||
```py
|
||||
image_embeds = torch.load("image_embeds.ipadpt")
|
||||
images = pipeline(
|
||||
prompt="a polar bear sitting in a chair drinking a milkshake",
|
||||
ip_adapter_image_embeds=image_embeds,
|
||||
negative_prompt="deformed, ugly, wrong proportion, low res, bad anatomy, worst quality, low quality",
|
||||
num_inference_steps=100,
|
||||
generator=generator,
|
||||
).images
|
||||
```
|
||||
|
||||
### IP-Adapter masking
|
||||
|
||||
Binary masks specify which portion of the output image should be assigned to an IP-Adapter. This is useful for composing more than one IP-Adapter image. For each input IP-Adapter image, you must provide a binary mask.
|
||||
|
||||
To start, preprocess the input IP-Adapter images with the [`~image_processor.IPAdapterMaskProcessor.preprocess()`] to generate their masks. For optimal results, provide the output height and width to [`~image_processor.IPAdapterMaskProcessor.preprocess()`]. This ensures masks with different aspect ratios are appropriately stretched. If the input masks already match the aspect ratio of the generated image, you don't have to set the `height` and `width`.
|
||||
|
||||
```py
|
||||
from diffusers.image_processor import IPAdapterMaskProcessor
|
||||
|
||||
mask1 = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_mask_mask1.png")
|
||||
mask2 = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_mask_mask2.png")
|
||||
|
||||
output_height = 1024
|
||||
output_width = 1024
|
||||
|
||||
processor = IPAdapterMaskProcessor()
|
||||
masks = processor.preprocess([mask1, mask2], height=output_height, width=output_width)
|
||||
```
|
||||
|
||||
<div class="flex flex-row gap-4">
|
||||
<div class="flex-1">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/ip_mask_mask1.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">mask one</figcaption>
|
||||
</div>
|
||||
<div class="flex-1">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/ip_mask_mask2.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">mask two</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
When there is more than one input IP-Adapter image, load them as a list and provide the IP-Adapter scale list. Each of the input IP-Adapter images here corresponds to one of the masks generated above.
|
||||
|
||||
```py
|
||||
pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="sdxl_models", weight_name=["ip-adapter-plus-face_sdxl_vit-h.safetensors"])
|
||||
pipeline.set_ip_adapter_scale([[0.7, 0.7]]) # one scale for each image-mask pair
|
||||
|
||||
face_image1 = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_mask_girl1.png")
|
||||
face_image2 = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_mask_girl2.png")
|
||||
|
||||
ip_images = [[face_image1, face_image2]]
|
||||
|
||||
masks = [masks.reshape(1, masks.shape[0], masks.shape[2], masks.shape[3])]
|
||||
```
|
||||
|
||||
<div class="flex flex-row gap-4">
|
||||
<div class="flex-1">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/ip_mask_girl1.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">IP-Adapter image one</figcaption>
|
||||
</div>
|
||||
<div class="flex-1">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/ip_mask_girl2.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">IP-Adapter image two</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
Now pass the preprocessed masks to `cross_attention_kwargs` in the pipeline call.
|
||||
|
||||
```py
|
||||
generator = torch.Generator(device="cpu").manual_seed(0)
|
||||
num_images = 1
|
||||
|
||||
image = pipeline(
|
||||
prompt="2 girls",
|
||||
ip_adapter_image=ip_images,
|
||||
negative_prompt="monochrome, lowres, bad anatomy, worst quality, low quality",
|
||||
num_inference_steps=20,
|
||||
num_images_per_prompt=num_images,
|
||||
generator=generator,
|
||||
cross_attention_kwargs={"ip_adapter_masks": masks}
|
||||
).images[0]
|
||||
image
|
||||
```
|
||||
|
||||
<div class="flex flex-row gap-4">
|
||||
<div class="flex-1">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_adapter_attention_mask_result_seed_0.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">IP-Adapter masking applied</figcaption>
|
||||
</div>
|
||||
<div class="flex-1">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_adapter_no_attention_mask_result_seed_0.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">no IP-Adapter masking applied</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
## Specific use cases
|
||||
|
||||
IP-Adapter's image prompting and compatibility with other adapters and models makes it a versatile tool for a variety of use cases. This section covers some of the more popular applications of IP-Adapter, and we can't wait to see what you come up with!
|
||||
|
||||
### Face model
|
||||
|
||||
Generating accurate faces is challenging because they are complex and nuanced. Diffusers supports two IP-Adapter checkpoints specifically trained to generate faces from the [h94/IP-Adapter](https://huggingface.co/h94/IP-Adapter) repository:
|
||||
|
||||
* [ip-adapter-full-face_sd15.safetensors](https://huggingface.co/h94/IP-Adapter/blob/main/models/ip-adapter-full-face_sd15.safetensors) is conditioned with images of cropped faces and removed backgrounds
|
||||
* [ip-adapter-plus-face_sd15.safetensors](https://huggingface.co/h94/IP-Adapter/blob/main/models/ip-adapter-plus-face_sd15.safetensors) uses patch embeddings and is conditioned with images of cropped faces
|
||||
|
||||
Additionally, Diffusers supports all IP-Adapter checkpoints trained with face embeddings extracted by `insightface` face models. Supported models are from the [h94/IP-Adapter-FaceID](https://huggingface.co/h94/IP-Adapter-FaceID) repository.
|
||||
|
||||
For face models, use the [h94/IP-Adapter](https://huggingface.co/h94/IP-Adapter) checkpoint. It is also recommended to use [`DDIMScheduler`] or [`EulerDiscreteScheduler`] for face models.
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import StableDiffusionPipeline, DDIMScheduler
|
||||
from diffusers.utils import load_image
|
||||
|
||||
pipeline = StableDiffusionPipeline.from_pretrained(
|
||||
"runwayml/stable-diffusion-v1-5",
|
||||
torch_dtype=torch.float16,
|
||||
).to("cuda")
|
||||
pipeline.scheduler = DDIMScheduler.from_config(pipeline.scheduler.config)
|
||||
pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter-full-face_sd15.bin")
|
||||
|
||||
pipeline.set_ip_adapter_scale(0.5)
|
||||
|
||||
image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_adapter_einstein_base.png")
|
||||
generator = torch.Generator(device="cpu").manual_seed(26)
|
||||
|
||||
image = pipeline(
|
||||
prompt="A photo of Einstein as a chef, wearing an apron, cooking in a French restaurant",
|
||||
ip_adapter_image=image,
|
||||
negative_prompt="lowres, bad anatomy, worst quality, low quality",
|
||||
num_inference_steps=100,
|
||||
generator=generator,
|
||||
).images[0]
|
||||
image
|
||||
```
|
||||
|
||||
<div class="flex flex-row gap-4">
|
||||
<div class="flex-1">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_adapter_einstein_base.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">IP-Adapter image</figcaption>
|
||||
</div>
|
||||
<div class="flex-1">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_adapter_einstein.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">generated image</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
To use IP-Adapter FaceID models, first extract face embeddings with `insightface`. Then pass the list of tensors to the pipeline as `ip_adapter_image_embeds`.
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import StableDiffusionPipeline, DDIMScheduler
|
||||
from diffusers.utils import load_image
|
||||
from insightface.app import FaceAnalysis
|
||||
|
||||
pipeline = StableDiffusionPipeline.from_pretrained(
|
||||
"runwayml/stable-diffusion-v1-5",
|
||||
torch_dtype=torch.float16,
|
||||
).to("cuda")
|
||||
pipeline.scheduler = DDIMScheduler.from_config(pipeline.scheduler.config)
|
||||
pipeline.load_ip_adapter("h94/IP-Adapter-FaceID", subfolder=None, weight_name="ip-adapter-faceid_sd15.bin", image_encoder_folder=None)
|
||||
pipeline.set_ip_adapter_scale(0.6)
|
||||
|
||||
image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_mask_girl1.png")
|
||||
|
||||
ref_images_embeds = []
|
||||
app = FaceAnalysis(name="buffalo_l", providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
|
||||
app.prepare(ctx_id=0, det_size=(640, 640))
|
||||
image = cv2.cvtColor(np.asarray(image), cv2.COLOR_BGR2RGB)
|
||||
faces = app.get(image)
|
||||
image = torch.from_numpy(faces[0].normed_embedding)
|
||||
ref_images_embeds.append(image.unsqueeze(0))
|
||||
ref_images_embeds = torch.stack(ref_images_embeds, dim=0).unsqueeze(0)
|
||||
neg_ref_images_embeds = torch.zeros_like(ref_images_embeds)
|
||||
id_embeds = torch.cat([neg_ref_images_embeds, ref_images_embeds]).to(dtype=torch.float16, device="cuda")
|
||||
|
||||
generator = torch.Generator(device="cpu").manual_seed(42)
|
||||
|
||||
images = pipeline(
|
||||
prompt="A photo of a girl",
|
||||
ip_adapter_image_embeds=[id_embeds],
|
||||
negative_prompt="monochrome, lowres, bad anatomy, worst quality, low quality",
|
||||
num_inference_steps=20, num_images_per_prompt=1,
|
||||
generator=generator
|
||||
).images
|
||||
```
|
||||
|
||||
Both IP-Adapter FaceID Plus and Plus v2 models require CLIP image embeddings. You can prepare face embeddings as shown previously, then you can extract and pass CLIP embeddings to the hidden image projection layers.
|
||||
|
||||
```py
|
||||
from insightface.utils import face_align
|
||||
|
||||
ref_images_embeds = []
|
||||
ip_adapter_images = []
|
||||
app = FaceAnalysis(name="buffalo_l", providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
|
||||
app.prepare(ctx_id=0, det_size=(640, 640))
|
||||
image = cv2.cvtColor(np.asarray(image), cv2.COLOR_BGR2RGB)
|
||||
faces = app.get(image)
|
||||
ip_adapter_images.append(face_align.norm_crop(image, landmark=faces[0].kps, image_size=224))
|
||||
image = torch.from_numpy(faces[0].normed_embedding)
|
||||
ref_images_embeds.append(image.unsqueeze(0))
|
||||
ref_images_embeds = torch.stack(ref_images_embeds, dim=0).unsqueeze(0)
|
||||
neg_ref_images_embeds = torch.zeros_like(ref_images_embeds)
|
||||
id_embeds = torch.cat([neg_ref_images_embeds, ref_images_embeds]).to(dtype=torch.float16, device="cuda")
|
||||
|
||||
clip_embeds = pipeline.prepare_ip_adapter_image_embeds(
|
||||
[ip_adapter_images], None, torch.device("cuda"), num_images, True)[0]
|
||||
|
||||
pipeline.unet.encoder_hid_proj.image_projection_layers[0].clip_embeds = clip_embeds.to(dtype=torch.float16)
|
||||
pipeline.unet.encoder_hid_proj.image_projection_layers[0].shortcut = False # True if Plus v2
|
||||
```
|
||||
|
||||
### Multi IP-Adapter
|
||||
|
||||
More than one IP-Adapter can be used at the same time to generate specific images in more diverse styles. For example, you can use IP-Adapter-Face to generate consistent faces and characters, and IP-Adapter Plus to generate those faces in a specific style.
|
||||
|
||||
> [!TIP]
|
||||
> Read the [IP-Adapter Plus](../using-diffusers/loading_adapters#ip-adapter-plus) section to learn why you need to manually load the image encoder.
|
||||
|
||||
Load the image encoder with [`~transformers.CLIPVisionModelWithProjection`].
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import AutoPipelineForText2Image, DDIMScheduler
|
||||
from transformers import CLIPVisionModelWithProjection
|
||||
from diffusers.utils import load_image
|
||||
|
||||
image_encoder = CLIPVisionModelWithProjection.from_pretrained(
|
||||
"h94/IP-Adapter",
|
||||
subfolder="models/image_encoder",
|
||||
torch_dtype=torch.float16,
|
||||
)
|
||||
```
|
||||
|
||||
Next, you'll load a base model, scheduler, and the IP-Adapters. The IP-Adapters to use are passed as a list to the `weight_name` parameter:
|
||||
|
||||
* [ip-adapter-plus_sdxl_vit-h](https://huggingface.co/h94/IP-Adapter#ip-adapter-for-sdxl-10) uses patch embeddings and a ViT-H image encoder
|
||||
* [ip-adapter-plus-face_sdxl_vit-h](https://huggingface.co/h94/IP-Adapter#ip-adapter-for-sdxl-10) has the same architecture but it is conditioned with images of cropped faces
|
||||
|
||||
```py
|
||||
pipeline = AutoPipelineForText2Image.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0",
|
||||
torch_dtype=torch.float16,
|
||||
image_encoder=image_encoder,
|
||||
)
|
||||
pipeline.scheduler = DDIMScheduler.from_config(pipeline.scheduler.config)
|
||||
pipeline.load_ip_adapter(
|
||||
"h94/IP-Adapter",
|
||||
subfolder="sdxl_models",
|
||||
weight_name=["ip-adapter-plus_sdxl_vit-h.safetensors", "ip-adapter-plus-face_sdxl_vit-h.safetensors"]
|
||||
)
|
||||
pipeline.set_ip_adapter_scale([0.7, 0.3])
|
||||
pipeline.enable_model_cpu_offload()
|
||||
```
|
||||
|
||||
Load an image prompt and a folder containing images of a certain style you want to use.
|
||||
|
||||
```py
|
||||
face_image = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/women_input.png")
|
||||
style_folder = "https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/style_ziggy"
|
||||
style_images = [load_image(f"{style_folder}/img{i}.png") for i in range(10)]
|
||||
```
|
||||
|
||||
<div class="flex flex-row gap-4">
|
||||
<div class="flex-1">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/women_input.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">IP-Adapter image of face</figcaption>
|
||||
</div>
|
||||
<div class="flex-1">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/ip_style_grid.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">IP-Adapter style images</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
Pass the image prompt and style images as a list to the `ip_adapter_image` parameter, and run the pipeline!
|
||||
|
||||
```py
|
||||
generator = torch.Generator(device="cpu").manual_seed(0)
|
||||
|
||||
image = pipeline(
|
||||
prompt="wonderwoman",
|
||||
ip_adapter_image=[style_images, face_image],
|
||||
negative_prompt="monochrome, lowres, bad anatomy, worst quality, low quality",
|
||||
num_inference_steps=50, num_images_per_prompt=1,
|
||||
generator=generator,
|
||||
).images[0]
|
||||
image
|
||||
```
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/ip_multi_out.png" />
|
||||
</div>
|
||||
|
||||
### Instant generation
|
||||
|
||||
[Latent Consistency Models (LCM)](../using-diffusers/inference_with_lcm_lora) are diffusion models that can generate images in as little as 4 steps compared to other diffusion models like SDXL that typically require way more steps. This is why image generation with an LCM feels "instantaneous". IP-Adapters can be plugged into an LCM-LoRA model to instantly generate images with an image prompt.
|
||||
|
||||
The IP-Adapter weights need to be loaded first, then you can use [`~StableDiffusionPipeline.load_lora_weights`] to load the LoRA style and weight you want to apply to your image.
|
||||
|
||||
```py
|
||||
from diffusers import DiffusionPipeline, LCMScheduler
|
||||
import torch
|
||||
from diffusers.utils import load_image
|
||||
|
||||
model_id = "sd-dreambooth-library/herge-style"
|
||||
lcm_lora_id = "latent-consistency/lcm-lora-sdv1-5"
|
||||
|
||||
pipeline = DiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16)
|
||||
|
||||
pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin")
|
||||
pipeline.load_lora_weights(lcm_lora_id)
|
||||
pipeline.scheduler = LCMScheduler.from_config(pipeline.scheduler.config)
|
||||
pipeline.enable_model_cpu_offload()
|
||||
```
|
||||
|
||||
Try using with a lower IP-Adapter scale to condition image generation more on the [herge_style](https://huggingface.co/sd-dreambooth-library/herge-style) checkpoint, and remember to use the special token `herge_style` in your prompt to trigger and apply the style.
|
||||
|
||||
```py
|
||||
pipeline.set_ip_adapter_scale(0.4)
|
||||
|
||||
prompt = "herge_style woman in armor, best quality, high quality"
|
||||
generator = torch.Generator(device="cpu").manual_seed(0)
|
||||
|
||||
ip_adapter_image = load_image("https://user-images.githubusercontent.com/24734142/266492875-2d50d223-8475-44f0-a7c6-08b51cb53572.png")
|
||||
image = pipeline(
|
||||
prompt=prompt,
|
||||
ip_adapter_image=ip_adapter_image,
|
||||
num_inference_steps=4,
|
||||
guidance_scale=1,
|
||||
).images[0]
|
||||
image
|
||||
```
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_adapter_herge.png" />
|
||||
</div>
|
||||
|
||||
### Structural control
|
||||
|
||||
To control image generation to an even greater degree, you can combine IP-Adapter with a model like [ControlNet](../using-diffusers/controlnet). A ControlNet is also an adapter that can be inserted into a diffusion model to allow for conditioning on an additional control image. The control image can be depth maps, edge maps, pose estimations, and more.
|
||||
|
||||
Load a [`ControlNetModel`] checkpoint conditioned on depth maps, insert it into a diffusion model, and load the IP-Adapter.
|
||||
|
||||
```py
|
||||
from diffusers import StableDiffusionControlNetPipeline, ControlNetModel
|
||||
import torch
|
||||
from diffusers.utils import load_image
|
||||
|
||||
controlnet_model_path = "lllyasviel/control_v11f1p_sd15_depth"
|
||||
controlnet = ControlNetModel.from_pretrained(controlnet_model_path, torch_dtype=torch.float16)
|
||||
|
||||
pipeline = StableDiffusionControlNetPipeline.from_pretrained(
|
||||
"runwayml/stable-diffusion-v1-5", controlnet=controlnet, torch_dtype=torch.float16)
|
||||
pipeline.to("cuda")
|
||||
pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin")
|
||||
```
|
||||
|
||||
Now load the IP-Adapter image and depth map.
|
||||
|
||||
```py
|
||||
ip_adapter_image = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/statue.png")
|
||||
depth_map = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/depth.png")
|
||||
```
|
||||
|
||||
<div class="flex flex-row gap-4">
|
||||
<div class="flex-1">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/statue.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">IP-Adapter image</figcaption>
|
||||
</div>
|
||||
<div class="flex-1">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/depth.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">depth map</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
Pass the depth map and IP-Adapter image to the pipeline to generate an image.
|
||||
|
||||
```py
|
||||
generator = torch.Generator(device="cpu").manual_seed(33)
|
||||
image = pipeline(
|
||||
prompt="best quality, high quality",
|
||||
image=depth_map,
|
||||
ip_adapter_image=ip_adapter_image,
|
||||
negative_prompt="monochrome, lowres, bad anatomy, worst quality, low quality",
|
||||
num_inference_steps=50,
|
||||
generator=generator,
|
||||
).images[0]
|
||||
image
|
||||
```
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/ipa-controlnet-out.png" />
|
||||
</div>
|
||||
|
||||
### Style & layout control
|
||||
|
||||
[InstantStyle](https://arxiv.org/abs/2404.02733) is a plug-and-play method on top of IP-Adapter, which disentangles style and layout from image prompt to control image generation. This way, you can generate images following only the style or layout from image prompt, with significantly improved diversity. This is achieved by only activating IP-Adapters to specific parts of the model.
|
||||
|
||||
By default IP-Adapters are inserted to all layers of the model. Use the [`~loaders.IPAdapterMixin.set_ip_adapter_scale`] method with a dictionary to assign scales to IP-Adapter at different layers.
|
||||
|
||||
```py
|
||||
from diffusers import AutoPipelineForText2Image
|
||||
from diffusers.utils import load_image
|
||||
import torch
|
||||
|
||||
pipeline = AutoPipelineForText2Image.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16).to("cuda")
|
||||
pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="sdxl_models", weight_name="ip-adapter_sdxl.bin")
|
||||
|
||||
scale = {
|
||||
"down": {"block_2": [0.0, 1.0]},
|
||||
"up": {"block_0": [0.0, 1.0, 0.0]},
|
||||
}
|
||||
pipeline.set_ip_adapter_scale(scale)
|
||||
```
|
||||
|
||||
This will activate IP-Adapter at the second layer in the model's down-part block 2 and up-part block 0. The former is the layer where IP-Adapter injects layout information and the latter injects style. Inserting IP-Adapter to these two layers you can generate images following both the style and layout from image prompt, but with contents more aligned to text prompt.
|
||||
|
||||
```py
|
||||
style_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg")
|
||||
|
||||
generator = torch.Generator(device="cpu").manual_seed(26)
|
||||
image = pipeline(
|
||||
prompt="a cat, masterpiece, best quality, high quality",
|
||||
ip_adapter_image=style_image,
|
||||
negative_prompt="text, watermark, lowres, low quality, worst quality, deformed, glitch, low contrast, noisy, saturation, blurry",
|
||||
guidance_scale=5,
|
||||
num_inference_steps=30,
|
||||
generator=generator,
|
||||
).images[0]
|
||||
image
|
||||
```
|
||||
|
||||
<div class="flex flex-row gap-4">
|
||||
<div class="flex-1">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">IP-Adapter image</figcaption>
|
||||
</div>
|
||||
<div class="flex-1">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">generated image</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
In contrast, inserting IP-Adapter to all layers will often generate images that overly focus on image prompt and diminish diversity.
|
||||
|
||||
Activate IP-Adapter only in the style layer and then call the pipeline again.
|
||||
|
||||
```py
|
||||
scale = {
|
||||
"up": {"block_0": [0.0, 1.0, 0.0]},
|
||||
}
|
||||
pipeline.set_ip_adapter_scale(scale)
|
||||
|
||||
generator = torch.Generator(device="cpu").manual_seed(26)
|
||||
image = pipeline(
|
||||
prompt="a cat, masterpiece, best quality, high quality",
|
||||
ip_adapter_image=style_image,
|
||||
negative_prompt="text, watermark, lowres, low quality, worst quality, deformed, glitch, low contrast, noisy, saturation, blurry",
|
||||
guidance_scale=5,
|
||||
num_inference_steps=30,
|
||||
generator=generator,
|
||||
).images[0]
|
||||
image
|
||||
```
|
||||
|
||||
<div class="flex flex-row gap-4">
|
||||
<div class="flex-1">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_only.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">IP-Adapter only in style layer</figcaption>
|
||||
</div>
|
||||
<div class="flex-1">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_ip_adapter.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">IP-Adapter in all layers</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
Note that you don't have to specify all layers in the dictionary. Those not included in the dictionary will be set to scale 0 which means disable IP-Adapter by default.
|
||||
@@ -10,316 +10,397 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Load pipelines, models, and schedulers
|
||||
# Load pipelines
|
||||
|
||||
[[open-in-colab]]
|
||||
|
||||
Having an easy way to use a diffusion system for inference is essential to 🧨 Diffusers. Diffusion systems often consist of multiple components like parameterized models, tokenizers, and schedulers that interact in complex ways. That is why we designed the [`DiffusionPipeline`] to wrap the complexity of the entire diffusion system into an easy-to-use API, while remaining flexible enough to be adapted for other use cases, such as loading each component individually as building blocks to assemble your own diffusion system.
|
||||
|
||||
Everything you need for inference or training is accessible with the `from_pretrained()` method.
|
||||
Diffusion systems consist of multiple components like parameterized models and schedulers that interact in complex ways. That is why we designed the [`DiffusionPipeline`] to wrap the complexity of the entire diffusion system into an easy-to-use API. At the same time, the [`DiffusionPipeline`] is entirely customizable so you can modify each component to build a diffusion system for your use case.
|
||||
|
||||
This guide will show you how to load:
|
||||
|
||||
- pipelines from the Hub and locally
|
||||
- different components into a pipeline
|
||||
- multiple pipelines without increasing memory usage
|
||||
- checkpoint variants such as different floating point types or non-exponential mean averaged (EMA) weights
|
||||
- models and schedulers
|
||||
|
||||
## Diffusion Pipeline
|
||||
## Load a pipeline
|
||||
|
||||
<Tip>
|
||||
> [!TIP]
|
||||
> Skip to the [DiffusionPipeline explained](#diffusionpipeline-explained) section if you're interested in an explanation about how the [`DiffusionPipeline`] class works.
|
||||
|
||||
💡 Skip to the [DiffusionPipeline explained](#diffusionpipeline-explained) section if you are interested in learning in more detail about how the [`DiffusionPipeline`] class works.
|
||||
There are two ways to load a pipeline for a task:
|
||||
|
||||
</Tip>
|
||||
1. Load the generic [`DiffusionPipeline`] class and allow it to automatically detect the correct pipeline class from the checkpoint.
|
||||
2. Load a specific pipeline class for a specific task.
|
||||
|
||||
The [`DiffusionPipeline`] class is the simplest and most generic way to load the latest trending diffusion model from the [Hub](https://huggingface.co/models?library=diffusers&sort=trending). The [`DiffusionPipeline.from_pretrained`] method automatically detects the correct pipeline class from the checkpoint, downloads, and caches all the required configuration and weight files, and returns a pipeline instance ready for inference.
|
||||
<hfoptions id="pipelines">
|
||||
<hfoption id="generic pipeline">
|
||||
|
||||
The [`DiffusionPipeline`] class is a simple and generic way to load the latest trending diffusion model from the [Hub](https://huggingface.co/models?library=diffusers&sort=trending). It uses the [`~DiffusionPipeline.from_pretrained`] method to automatically detect the correct pipeline class for a task from the checkpoint, downloads and caches all the required configuration and weight files, and returns a pipeline ready for inference.
|
||||
|
||||
```python
|
||||
from diffusers import DiffusionPipeline
|
||||
|
||||
repo_id = "runwayml/stable-diffusion-v1-5"
|
||||
pipe = DiffusionPipeline.from_pretrained(repo_id, use_safetensors=True)
|
||||
pipeline = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", use_safetensors=True)
|
||||
```
|
||||
|
||||
You can also load a checkpoint with its specific pipeline class. The example above loaded a Stable Diffusion model; to get the same result, use the [`StableDiffusionPipeline`] class:
|
||||
This same checkpoint can also be used for an image-to-image task. The [`DiffusionPipeline`] class can handle any task as long as you provide the appropriate inputs. For example, for an image-to-image task, you need to pass an initial image to the pipeline.
|
||||
|
||||
```py
|
||||
from diffusers import DiffusionPipeline
|
||||
|
||||
pipeline = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", use_safetensors=True)
|
||||
|
||||
init_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/img2img-init.png")
|
||||
prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
|
||||
image = pipeline("Astronaut in a jungle, cold color palette, muted colors, detailed, 8k", image=init_image).images[0]
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="specific pipeline">
|
||||
|
||||
Checkpoints can be loaded by their specific pipeline class if you already know it. For example, to load a Stable Diffusion model, use the [`StableDiffusionPipeline`] class.
|
||||
|
||||
```python
|
||||
from diffusers import StableDiffusionPipeline
|
||||
|
||||
repo_id = "runwayml/stable-diffusion-v1-5"
|
||||
pipe = StableDiffusionPipeline.from_pretrained(repo_id, use_safetensors=True)
|
||||
pipeline = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", use_safetensors=True)
|
||||
```
|
||||
|
||||
A checkpoint (such as [`CompVis/stable-diffusion-v1-4`](https://huggingface.co/CompVis/stable-diffusion-v1-4) or [`runwayml/stable-diffusion-v1-5`](https://huggingface.co/runwayml/stable-diffusion-v1-5)) may also be used for more than one task, like text-to-image or image-to-image. To differentiate what task you want to use the checkpoint for, you have to load it directly with its corresponding task-specific pipeline class:
|
||||
This same checkpoint may also be used for another task like image-to-image. To differentiate what task you want to use the checkpoint for, you have to use the corresponding task-specific pipeline class. For example, to use the same checkpoint for image-to-image, use the [`StableDiffusionImg2ImgPipeline`] class.
|
||||
|
||||
```python
|
||||
```py
|
||||
from diffusers import StableDiffusionImg2ImgPipeline
|
||||
|
||||
repo_id = "runwayml/stable-diffusion-v1-5"
|
||||
pipe = StableDiffusionImg2ImgPipeline.from_pretrained(repo_id)
|
||||
pipeline = StableDiffusionImg2ImgPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", use_safetensors=True)
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
Use the Space below to gauge a pipeline's memory requirements before you download and load it to see if it runs on your hardware.
|
||||
|
||||
<div class="block dark:hidden">
|
||||
<iframe
|
||||
src="https://diffusers-compute-pipeline-size.hf.space?__theme=light"
|
||||
width="850"
|
||||
height="1600"
|
||||
></iframe>
|
||||
</div>
|
||||
<div class="hidden dark:block">
|
||||
<iframe
|
||||
src="https://diffusers-compute-pipeline-size.hf.space?__theme=dark"
|
||||
width="850"
|
||||
height="1600"
|
||||
></iframe>
|
||||
</div>
|
||||
|
||||
### Local pipeline
|
||||
|
||||
To load a diffusion pipeline locally, use [`git-lfs`](https://git-lfs.github.com/) to manually download the checkpoint (in this case, [`runwayml/stable-diffusion-v1-5`](https://huggingface.co/runwayml/stable-diffusion-v1-5)) to your local disk. This creates a local folder, `./stable-diffusion-v1-5`, on your disk:
|
||||
To load a pipeline locally, use [git-lfs](https://git-lfs.github.com/) to manually download a checkpoint to your local disk.
|
||||
|
||||
```bash
|
||||
git-lfs install
|
||||
git clone https://huggingface.co/runwayml/stable-diffusion-v1-5
|
||||
```
|
||||
|
||||
Then pass the local path to [`~DiffusionPipeline.from_pretrained`]:
|
||||
This creates a local folder, ./stable-diffusion-v1-5, on your disk and you should pass its path to [`~DiffusionPipeline.from_pretrained`].
|
||||
|
||||
```python
|
||||
from diffusers import DiffusionPipeline
|
||||
|
||||
repo_id = "./stable-diffusion-v1-5"
|
||||
stable_diffusion = DiffusionPipeline.from_pretrained(repo_id, use_safetensors=True)
|
||||
stable_diffusion = DiffusionPipeline.from_pretrained("./stable-diffusion-v1-5", use_safetensors=True)
|
||||
```
|
||||
|
||||
The [`~DiffusionPipeline.from_pretrained`] method won't download any files from the Hub when it detects a local path, but this also means it won't download and cache the latest changes to a checkpoint.
|
||||
The [`~DiffusionPipeline.from_pretrained`] method won't download files from the Hub when it detects a local path, but this also means it won't download and cache the latest changes to a checkpoint.
|
||||
|
||||
### Swap components in a pipeline
|
||||
## Customize a pipeline
|
||||
|
||||
You can customize the default components of any pipeline with another compatible component. Customization is important because:
|
||||
You can customize a pipeline by loading different components into it. This is important because you can:
|
||||
|
||||
- Changing the scheduler is important for exploring the trade-off between generation speed and quality.
|
||||
- Different components of a model are typically trained independently and you can swap out a component with a better-performing one.
|
||||
- During finetuning, usually only some components - like the UNet or text encoder - are trained.
|
||||
- change to a scheduler with faster generation speed or higher generation quality depending on your needs (call the `scheduler.compatibles` method on your pipeline to see compatible schedulers)
|
||||
- change a default pipeline component to a newer and better performing one
|
||||
|
||||
To find out which schedulers are compatible for customization, you can use the `compatibles` method:
|
||||
For example, let's customize the default [stabilityai/stable-diffusion-xl-base-1.0](https://hf.co/stabilityai/stable-diffusion-xl-base-1.0) checkpoint with:
|
||||
|
||||
- The [`HeunDiscreteScheduler`] to generate higher quality images at the expense of slower generation speed. You must pass the `subfolder="scheduler"` parameter in [`~HeunDiscreteScheduler.from_pretrained`] to load the scheduler configuration into the correct [subfolder](https://hf.co/stabilityai/stable-diffusion-xl-base-1.0/tree/main/scheduler) of the pipeline repository.
|
||||
- A more stable VAE that runs in fp16.
|
||||
|
||||
```py
|
||||
from diffusers import DiffusionPipeline
|
||||
from diffusers import StableDiffusionXLPipeline, HeunDiscreteScheduler, AutoencoderKL
|
||||
import torch
|
||||
|
||||
repo_id = "runwayml/stable-diffusion-v1-5"
|
||||
stable_diffusion = DiffusionPipeline.from_pretrained(repo_id, use_safetensors=True)
|
||||
stable_diffusion.scheduler.compatibles
|
||||
scheduler = HeunDiscreteScheduler.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", subfolder="scheduler")
|
||||
vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16, use_safetensors=True)
|
||||
```
|
||||
|
||||
Let's use the [`SchedulerMixin.from_pretrained`] method to replace the default [`PNDMScheduler`] with a more performant scheduler, [`EulerDiscreteScheduler`]. The `subfolder="scheduler"` argument is required to load the scheduler configuration from the correct [subfolder](https://huggingface.co/runwayml/stable-diffusion-v1-5/tree/main/scheduler) of the pipeline repository.
|
||||
Now pass the new scheduler and VAE to the [`StableDiffusionXLPipeline`].
|
||||
|
||||
Then you can pass the new [`EulerDiscreteScheduler`] instance to the `scheduler` argument in [`DiffusionPipeline`]:
|
||||
```py
|
||||
pipeline = StableDiffusionXLPipeline.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0",
|
||||
scheduler=scheduler,
|
||||
vae=vae,
|
||||
torch_dtype=torch.float16,
|
||||
variant="fp16",
|
||||
use_safetensors=True
|
||||
).to("cuda")
|
||||
```
|
||||
|
||||
## Reuse a pipeline
|
||||
|
||||
When you load multiple pipelines that share the same model components, it makes sense to reuse the shared components instead of reloading everything into memory again, especially if your hardware is memory-constrained. For example:
|
||||
|
||||
1. You generated an image with the [`StableDiffusionPipeline`] but you want to improve its quality with the [`StableDiffusionSAGPipeline`]. Both of these pipelines share the same pretrained model, so it'd be a waste of memory to load the same model twice.
|
||||
2. You want to add a model component, like a [`MotionAdapter`](../api/pipelines/animatediff#animatediffpipeline), to [`AnimateDiffPipeline`] which was instantiated from an existing [`StableDiffusionPipeline`]. Again, both pipelines share the same pretrained model, so it'd be a waste of memory to load an entirely new pipeline again.
|
||||
|
||||
With the [`DiffusionPipeline.from_pipe`] API, you can switch between multiple pipelines to take advantage of their different features without increasing memory-usage. It is similar to turning on and off a feature in your pipeline.
|
||||
|
||||
> [!TIP]
|
||||
> To switch between tasks (rather than features), use the [`~DiffusionPipeline.from_pipe`] method with the [AutoPipeline](../api/pipelines/auto_pipeline) class, which automatically identifies the pipeline class based on the task (learn more in the [AutoPipeline](../tutorials/autopipeline) tutorial).
|
||||
|
||||
Let's start with a [`StableDiffusionPipeline`] and then reuse the loaded model components to create a [`StableDiffusionSAGPipeline`] to increase generation quality. You'll use the [`StableDiffusionPipeline`] with an [IP-Adapter](./ip_adapter) to generate a bear eating pizza.
|
||||
|
||||
```python
|
||||
from diffusers import DiffusionPipeline, EulerDiscreteScheduler
|
||||
from diffusers import DiffusionPipeline, StableDiffusionSAGPipeline
|
||||
import torch
|
||||
import gc
|
||||
from diffusers.utils import load_image
|
||||
from accelerate.utils import compute_module_sizes
|
||||
|
||||
repo_id = "runwayml/stable-diffusion-v1-5"
|
||||
scheduler = EulerDiscreteScheduler.from_pretrained(repo_id, subfolder="scheduler")
|
||||
stable_diffusion = DiffusionPipeline.from_pretrained(repo_id, scheduler=scheduler, use_safetensors=True)
|
||||
image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/load_neg_embed.png")
|
||||
|
||||
pipe_sd = DiffusionPipeline.from_pretrained("SG161222/Realistic_Vision_V6.0_B1_noVAE", torch_dtype=torch.float16)
|
||||
pipe_sd.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin")
|
||||
pipe_sd.set_ip_adapter_scale(0.6)
|
||||
pipe_sd.to("cuda")
|
||||
|
||||
generator = torch.Generator(device="cpu").manual_seed(33)
|
||||
out_sd = pipe_sd(
|
||||
prompt="bear eats pizza",
|
||||
negative_prompt="wrong white balance, dark, sketches,worst quality,low quality",
|
||||
ip_adapter_image=image,
|
||||
num_inference_steps=50,
|
||||
generator=generator,
|
||||
).images[0]
|
||||
out_sd
|
||||
```
|
||||
|
||||
### Safety checker
|
||||
<div class="flex justify-center">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/from_pipe_out_sd_0.png"/>
|
||||
</div>
|
||||
|
||||
Diffusion models like Stable Diffusion can generate harmful content, which is why 🧨 Diffusers has a [safety checker](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/safety_checker.py) to check generated outputs against known hardcoded NSFW content. If you'd like to disable the safety checker for whatever reason, pass `None` to the `safety_checker` argument:
|
||||
For reference, you can check how much memory this process consumed.
|
||||
|
||||
```python
|
||||
def bytes_to_giga_bytes(bytes):
|
||||
return bytes / 1024 / 1024 / 1024
|
||||
print(f"Max memory allocated: {bytes_to_giga_bytes(torch.cuda.max_memory_allocated())} GB")
|
||||
"Max memory allocated: 4.406213283538818 GB"
|
||||
```
|
||||
|
||||
Now, reuse the same pipeline components from [`StableDiffusionPipeline`] in [`StableDiffusionSAGPipeline`] with the [`~DiffusionPipeline.from_pipe`] method.
|
||||
|
||||
> [!WARNING]
|
||||
> Some pipeline methods may not function properly on new pipelines created with [`~DiffusionPipeline.from_pipe`]. For instance, the [`~DiffusionPipeline.enable_model_cpu_offload`] method installs hooks on the model components based on a unique offloading sequence for each pipeline. If the models are executed in a different order in the new pipeline, the CPU offloading may not work correctly.
|
||||
>
|
||||
> To ensure everything works as expected, we recommend re-applying a pipeline method on a new pipeline created with [`~DiffusionPipeline.from_pipe`].
|
||||
|
||||
```python
|
||||
pipe_sag = StableDiffusionSAGPipeline.from_pipe(
|
||||
pipe_sd
|
||||
)
|
||||
|
||||
generator = torch.Generator(device="cpu").manual_seed(33)
|
||||
out_sag = pipe_sag(
|
||||
prompt="bear eats pizza",
|
||||
negative_prompt="wrong white balance, dark, sketches,worst quality,low quality",
|
||||
ip_adapter_image=image,
|
||||
num_inference_steps=50,
|
||||
generator=generator,
|
||||
guidance_scale=1.0,
|
||||
sag_scale=0.75
|
||||
).images[0]
|
||||
out_sag
|
||||
```
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/from_pipe_out_sag_1.png"/>
|
||||
</div>
|
||||
|
||||
If you check the memory usage, you'll see it remains the same as before because [`StableDiffusionPipeline`] and [`StableDiffusionSAGPipeline`] are sharing the same pipeline components. This allows you to use them interchangeably without any additional memory overhead.
|
||||
|
||||
```py
|
||||
print(f"Max memory allocated: {bytes_to_giga_bytes(torch.cuda.max_memory_allocated())} GB")
|
||||
"Max memory allocated: 4.406213283538818 GB"
|
||||
```
|
||||
|
||||
Let's animate the image with the [`AnimateDiffPipeline`] and also add a [`MotionAdapter`] module to the pipeline. For the [`AnimateDiffPipeline`], you need to unload the IP-Adapter first and reload it *after* you've created your new pipeline (this only applies to the [`AnimateDiffPipeline`]).
|
||||
|
||||
```py
|
||||
from diffusers import AnimateDiffPipeline, MotionAdapter, DDIMScheduler
|
||||
from diffusers.utils import export_to_gif
|
||||
|
||||
pipe_sag.unload_ip_adapter()
|
||||
adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2", torch_dtype=torch.float16)
|
||||
|
||||
pipe_animate = AnimateDiffPipeline.from_pipe(pipe_sd, motion_adapter=adapter)
|
||||
pipe_animate.scheduler = DDIMScheduler.from_config(pipe_animate.scheduler.config, beta_schedule="linear")
|
||||
# load IP-Adapter and LoRA weights again
|
||||
pipe_animate.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin")
|
||||
pipe_animate.load_lora_weights("guoyww/animatediff-motion-lora-zoom-out", adapter_name="zoom-out")
|
||||
pipe_animate.to("cuda")
|
||||
|
||||
generator = torch.Generator(device="cpu").manual_seed(33)
|
||||
pipe_animate.set_adapters("zoom-out", adapter_weights=0.75)
|
||||
out = pipe_animate(
|
||||
prompt="bear eats pizza",
|
||||
num_frames=16,
|
||||
num_inference_steps=50,
|
||||
ip_adapter_image=image,
|
||||
generator=generator,
|
||||
).frames[0]
|
||||
export_to_gif(out, "out_animate.gif")
|
||||
```
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/from_pipe_out_animate_3.gif"/>
|
||||
</div>
|
||||
|
||||
The [`AnimateDiffPipeline`] is more memory-intensive and consumes 15GB of memory (see the [Memory-usage of from_pipe](#memory-usage-of-from_pipe) section to learn what this means for your memory-usage).
|
||||
|
||||
```py
|
||||
print(f"Max memory allocated: {bytes_to_giga_bytes(torch.cuda.max_memory_allocated())} GB")
|
||||
"Max memory allocated: 15.178664207458496 GB"
|
||||
```
|
||||
|
||||
### Modify from_pipe components
|
||||
|
||||
Pipelines loaded with [`~DiffusionPipeline.from_pipe`] can be customized with different model components or methods. However, whenever you modify the *state* of the model components, it affects all the other pipelines that share the same components. For example, if you call [`~diffusers.loaders.IPAdapterMixin.unload_ip_adapter`] on the [`StableDiffusionSAGPipeline`], you won't be able to use IP-Adapter with the [`StableDiffusionPipeline`] because it's been removed from their shared components.
|
||||
|
||||
```py
|
||||
pipe.sag_unload_ip_adapter()
|
||||
|
||||
generator = torch.Generator(device="cpu").manual_seed(33)
|
||||
out_sd = pipe_sd(
|
||||
prompt="bear eats pizza",
|
||||
negative_prompt="wrong white balance, dark, sketches,worst quality,low quality",
|
||||
ip_adapter_image=image,
|
||||
num_inference_steps=50,
|
||||
generator=generator,
|
||||
).images[0]
|
||||
"AttributeError: 'NoneType' object has no attribute 'image_projection_layers'"
|
||||
```
|
||||
|
||||
### Memory usage of from_pipe
|
||||
|
||||
The memory requirement of loading multiple pipelines with [`~DiffusionPipeline.from_pipe`] is determined by the pipeline with the highest memory-usage regardless of the number of pipelines you create.
|
||||
|
||||
| Pipeline | Memory usage (GB) |
|
||||
|---|---|
|
||||
| StableDiffusionPipeline | 4.400 |
|
||||
| StableDiffusionSAGPipeline | 4.400 |
|
||||
| AnimateDiffPipeline | 15.178 |
|
||||
|
||||
The [`AnimateDiffPipeline`] has the highest memory requirement, so the *total memory-usage* is based only on the [`AnimateDiffPipeline`]. Your memory-usage will not increase if you create additional pipelines as long as their memory requirements doesn't exceed that of the [`AnimateDiffPipeline`]. Each pipeline can be used interchangeably without any additional memory overhead.
|
||||
|
||||
## Safety checker
|
||||
|
||||
Diffusers implements a [safety checker](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/safety_checker.py) for Stable Diffusion models which can generate harmful content. The safety checker screens the generated output against known hardcoded not-safe-for-work (NSFW) content. If for whatever reason you'd like to disable the safety checker, pass `safety_checker=None` to the [`~DiffusionPipeline.from_pretrained`] method.
|
||||
|
||||
```python
|
||||
from diffusers import DiffusionPipeline
|
||||
|
||||
repo_id = "runwayml/stable-diffusion-v1-5"
|
||||
stable_diffusion = DiffusionPipeline.from_pretrained(repo_id, safety_checker=None, use_safetensors=True)
|
||||
pipeline = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", safety_checker=None, use_safetensors=True)
|
||||
"""
|
||||
You have disabled the safety checker for <class 'diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline'> by passing `safety_checker=None`. Ensure that you abide by the conditions of the Stable Diffusion license and do not expose unfiltered results in services or applications open to the public. Both the diffusers team and Hugging Face strongly recommend keeping the safety filter enabled in all public-facing circumstances, disabling it only for use cases that involve analyzing network behavior or auditing its results. For more information, please have a look at https://github.com/huggingface/diffusers/pull/254 .
|
||||
"""
|
||||
```
|
||||
|
||||
### Reuse components across pipelines
|
||||
|
||||
You can also reuse the same components in multiple pipelines to avoid loading the weights into RAM twice. Use the [`~DiffusionPipeline.components`] method to save the components:
|
||||
|
||||
```python
|
||||
from diffusers import StableDiffusionPipeline, StableDiffusionImg2ImgPipeline
|
||||
|
||||
model_id = "runwayml/stable-diffusion-v1-5"
|
||||
stable_diffusion_txt2img = StableDiffusionPipeline.from_pretrained(model_id, use_safetensors=True)
|
||||
|
||||
components = stable_diffusion_txt2img.components
|
||||
```
|
||||
|
||||
Then you can pass the `components` to another pipeline without reloading the weights into RAM:
|
||||
|
||||
```py
|
||||
stable_diffusion_img2img = StableDiffusionImg2ImgPipeline(**components)
|
||||
```
|
||||
|
||||
You can also pass the components individually to the pipeline if you want more flexibility over which components to reuse or disable. For example, to reuse the same components in the text-to-image pipeline, except for the safety checker and feature extractor, in the image-to-image pipeline:
|
||||
|
||||
```py
|
||||
from diffusers import StableDiffusionPipeline, StableDiffusionImg2ImgPipeline
|
||||
|
||||
model_id = "runwayml/stable-diffusion-v1-5"
|
||||
stable_diffusion_txt2img = StableDiffusionPipeline.from_pretrained(model_id, use_safetensors=True)
|
||||
stable_diffusion_img2img = StableDiffusionImg2ImgPipeline(
|
||||
vae=stable_diffusion_txt2img.vae,
|
||||
text_encoder=stable_diffusion_txt2img.text_encoder,
|
||||
tokenizer=stable_diffusion_txt2img.tokenizer,
|
||||
unet=stable_diffusion_txt2img.unet,
|
||||
scheduler=stable_diffusion_txt2img.scheduler,
|
||||
safety_checker=None,
|
||||
feature_extractor=None,
|
||||
requires_safety_checker=False,
|
||||
)
|
||||
```
|
||||
|
||||
## Checkpoint variants
|
||||
|
||||
A checkpoint variant is usually a checkpoint whose weights are:
|
||||
|
||||
- Stored in a different floating point type for lower precision and lower storage, such as [`torch.float16`](https://pytorch.org/docs/stable/tensors.html#data-types), because it only requires half the bandwidth and storage to download. You can't use this variant if you're continuing training or using a CPU.
|
||||
- Non-exponential mean averaged (EMA) weights, which shouldn't be used for inference. You should use these to continue fine-tuning a model.
|
||||
- Stored in a different floating point type, such as [torch.float16](https://pytorch.org/docs/stable/tensors.html#data-types), because it only requires half the bandwidth and storage to download. You can't use this variant if you're continuing training or using a CPU.
|
||||
- Non-exponential mean averaged (EMA) weights which shouldn't be used for inference. You should use this variant to continue finetuning a model.
|
||||
|
||||
<Tip>
|
||||
> [!TIP]
|
||||
> When the checkpoints have identical model structures, but they were trained on different datasets and with a different training setup, they should be stored in separate repositories. For example, [stabilityai/stable-diffusion-2](https://hf.co/stabilityai/stable-diffusion-2) and [stabilityai/stable-diffusion-2-1](https://hf.co/stabilityai/stable-diffusion-2-1) are stored in separate repositories.
|
||||
|
||||
💡 When the checkpoints have identical model structures, but they were trained on different datasets and with a different training setup, they should be stored in separate repositories instead of variations (for example, [`stable-diffusion-v1-4`] and [`stable-diffusion-v1-5`]).
|
||||
Otherwise, a variant is **identical** to the original checkpoint. They have exactly the same serialization format (like [safetensors](./using_safetensors)), model structure, and their weights have identical tensor shapes.
|
||||
|
||||
</Tip>
|
||||
| **checkpoint type** | **weight name** | **argument for loading weights** |
|
||||
|---------------------|---------------------------------------------|----------------------------------|
|
||||
| original | diffusion_pytorch_model.safetensors | |
|
||||
| floating point | diffusion_pytorch_model.fp16.safetensors | `variant`, `torch_dtype` |
|
||||
| non-EMA | diffusion_pytorch_model.non_ema.safetensors | `variant` |
|
||||
|
||||
Otherwise, a variant is **identical** to the original checkpoint. They have exactly the same serialization format (like [Safetensors](./using_safetensors)), model structure, and weights that have identical tensor shapes.
|
||||
There are two important arguments for loading variants:
|
||||
|
||||
| **checkpoint type** | **weight name** | **argument for loading weights** |
|
||||
|---------------------|-------------------------------------|----------------------------------|
|
||||
| original | diffusion_pytorch_model.bin | |
|
||||
| floating point | diffusion_pytorch_model.fp16.bin | `variant`, `torch_dtype` |
|
||||
| non-EMA | diffusion_pytorch_model.non_ema.bin | `variant` |
|
||||
- `torch_dtype` specifies the floating point precision of the loaded checkpoint. For example, if you want to save bandwidth by loading a fp16 variant, you should set `variant="fp16"` and `torch_dtype=torch.float16` to *convert the weights* to fp16. Otherwise, the fp16 weights are converted to the default fp32 precision.
|
||||
|
||||
There are two important arguments to know for loading variants:
|
||||
If you only set `torch_dtype=torch.float16`, the default fp32 weights are downloaded first and then converted to fp16.
|
||||
|
||||
- `torch_dtype` defines the floating point precision of the loaded checkpoints. For example, if you want to save bandwidth by loading a `fp16` variant, you should specify `torch_dtype=torch.float16` to *convert the weights* to `fp16`. Otherwise, the `fp16` weights are converted to the default `fp32` precision. You can also load the original checkpoint without defining the `variant` argument, and convert it to `fp16` with `torch_dtype=torch.float16`. In this case, the default `fp32` weights are downloaded first, and then they're converted to `fp16` after loading.
|
||||
- `variant` specifies which files should be loaded from the repository. For example, if you want to load a non-EMA variant of a UNet from [runwayml/stable-diffusion-v1-5](https://hf.co/runwayml/stable-diffusion-v1-5/tree/main/unet), set `variant="non_ema"` to download the `non_ema` file.
|
||||
|
||||
- `variant` defines which files should be loaded from the repository. For example, if you want to load a `non_ema` variant from the [`diffusers/stable-diffusion-variants`](https://huggingface.co/diffusers/stable-diffusion-variants/tree/main/unet) repository, you should specify `variant="non_ema"` to download the `non_ema` files.
|
||||
<hfoptions id="variants">
|
||||
<hfoption id="fp16">
|
||||
|
||||
```python
|
||||
```py
|
||||
from diffusers import DiffusionPipeline
|
||||
import torch
|
||||
|
||||
# load fp16 variant
|
||||
stable_diffusion = DiffusionPipeline.from_pretrained(
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
"runwayml/stable-diffusion-v1-5", variant="fp16", torch_dtype=torch.float16, use_safetensors=True
|
||||
)
|
||||
# load non_ema variant
|
||||
stable_diffusion = DiffusionPipeline.from_pretrained(
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="non-EMA">
|
||||
|
||||
```py
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
"runwayml/stable-diffusion-v1-5", variant="non_ema", use_safetensors=True
|
||||
)
|
||||
```
|
||||
|
||||
To save a checkpoint stored in a different floating-point type or as a non-EMA variant, use the [`DiffusionPipeline.save_pretrained`] method and specify the `variant` argument. You should try and save a variant to the same folder as the original checkpoint, so you can load both from the same folder:
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
Use the `variant` parameter in the [`DiffusionPipeline.save_pretrained`] method to save a checkpoint as a different floating point type or as a non-EMA variant. You should try save a variant to the same folder as the original checkpoint, so you have the option of loading both from the same folder.
|
||||
|
||||
<hfoptions id="save">
|
||||
<hfoption id="fp16">
|
||||
|
||||
```python
|
||||
from diffusers import DiffusionPipeline
|
||||
|
||||
# save as fp16 variant
|
||||
stable_diffusion.save_pretrained("runwayml/stable-diffusion-v1-5", variant="fp16")
|
||||
# save as non-ema variant
|
||||
stable_diffusion.save_pretrained("runwayml/stable-diffusion-v1-5", variant="non_ema")
|
||||
pipeline.save_pretrained("runwayml/stable-diffusion-v1-5", variant="fp16")
|
||||
```
|
||||
|
||||
If you don't save the variant to an existing folder, you must specify the `variant` argument otherwise it'll throw an `Exception` because it can't find the original checkpoint:
|
||||
</hfoption>
|
||||
<hfoption id="non_ema">
|
||||
|
||||
```py
|
||||
pipeline.save_pretrained("runwayml/stable-diffusion-v1-5", variant="non_ema")
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
If you don't save the variant to an existing folder, you must specify the `variant` argument otherwise it'll throw an `Exception` because it can't find the original checkpoint.
|
||||
|
||||
```python
|
||||
# 👎 this won't work
|
||||
stable_diffusion = DiffusionPipeline.from_pretrained(
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
"./stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True
|
||||
)
|
||||
# 👍 this works
|
||||
stable_diffusion = DiffusionPipeline.from_pretrained(
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
"./stable-diffusion-v1-5", variant="fp16", torch_dtype=torch.float16, use_safetensors=True
|
||||
)
|
||||
```
|
||||
|
||||
<!--
|
||||
TODO(Patrick) - Make sure to uncomment this part as soon as things are deprecated.
|
||||
|
||||
#### Using `revision` to load pipeline variants is deprecated
|
||||
|
||||
Previously the `revision` argument of [`DiffusionPipeline.from_pretrained`] was heavily used to
|
||||
load model variants, e.g.:
|
||||
|
||||
```python
|
||||
from diffusers import DiffusionPipeline
|
||||
|
||||
pipe = DiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", revision="fp16", use_safetensors=True)
|
||||
```
|
||||
|
||||
However, this behavior is now deprecated since the "revision" argument should (just as it's done in GitHub) better be used to load model checkpoints from a specific commit or branch in development.
|
||||
|
||||
The above example is therefore deprecated and won't be supported anymore for `diffusers >= 1.0.0`.
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
If you load diffusers pipelines or models with `revision="fp16"` or `revision="non_ema"`,
|
||||
please make sure to update the code and use `variant="fp16"` or `variation="non_ema"` respectively
|
||||
instead.
|
||||
|
||||
</Tip>
|
||||
-->
|
||||
|
||||
## Models
|
||||
|
||||
Models are loaded from the [`ModelMixin.from_pretrained`] method, which downloads and caches the latest version of the model weights and configurations. If the latest files are available in the local cache, [`~ModelMixin.from_pretrained`] reuses files in the cache instead of re-downloading them.
|
||||
|
||||
Models can be loaded from a subfolder with the `subfolder` argument. For example, the model weights for `runwayml/stable-diffusion-v1-5` are stored in the [`unet`](https://huggingface.co/runwayml/stable-diffusion-v1-5/tree/main/unet) subfolder:
|
||||
|
||||
```python
|
||||
from diffusers import UNet2DConditionModel
|
||||
|
||||
repo_id = "runwayml/stable-diffusion-v1-5"
|
||||
model = UNet2DConditionModel.from_pretrained(repo_id, subfolder="unet", use_safetensors=True)
|
||||
```
|
||||
|
||||
Or directly from a repository's [directory](https://huggingface.co/google/ddpm-cifar10-32/tree/main):
|
||||
|
||||
```python
|
||||
from diffusers import UNet2DModel
|
||||
|
||||
repo_id = "google/ddpm-cifar10-32"
|
||||
model = UNet2DModel.from_pretrained(repo_id, use_safetensors=True)
|
||||
```
|
||||
|
||||
You can also load and save model variants by specifying the `variant` argument in [`ModelMixin.from_pretrained`] and [`ModelMixin.save_pretrained`]:
|
||||
|
||||
```python
|
||||
from diffusers import UNet2DConditionModel
|
||||
|
||||
model = UNet2DConditionModel.from_pretrained(
|
||||
"runwayml/stable-diffusion-v1-5", subfolder="unet", variant="non_ema", use_safetensors=True
|
||||
)
|
||||
model.save_pretrained("./local-unet", variant="non_ema")
|
||||
```
|
||||
|
||||
## Schedulers
|
||||
|
||||
Schedulers are loaded from the [`SchedulerMixin.from_pretrained`] method, and unlike models, schedulers are **not parameterized** or **trained**; they are defined by a configuration file.
|
||||
|
||||
Loading schedulers does not consume any significant amount of memory and the same configuration file can be used for a variety of different schedulers.
|
||||
For example, the following schedulers are compatible with [`StableDiffusionPipeline`], which means you can load the same scheduler configuration file in any of these classes:
|
||||
|
||||
```python
|
||||
from diffusers import StableDiffusionPipeline
|
||||
from diffusers import (
|
||||
DDPMScheduler,
|
||||
DDIMScheduler,
|
||||
PNDMScheduler,
|
||||
LMSDiscreteScheduler,
|
||||
EulerAncestralDiscreteScheduler,
|
||||
EulerDiscreteScheduler,
|
||||
DPMSolverMultistepScheduler,
|
||||
)
|
||||
|
||||
repo_id = "runwayml/stable-diffusion-v1-5"
|
||||
|
||||
ddpm = DDPMScheduler.from_pretrained(repo_id, subfolder="scheduler")
|
||||
ddim = DDIMScheduler.from_pretrained(repo_id, subfolder="scheduler")
|
||||
pndm = PNDMScheduler.from_pretrained(repo_id, subfolder="scheduler")
|
||||
lms = LMSDiscreteScheduler.from_pretrained(repo_id, subfolder="scheduler")
|
||||
euler_anc = EulerAncestralDiscreteScheduler.from_pretrained(repo_id, subfolder="scheduler")
|
||||
euler = EulerDiscreteScheduler.from_pretrained(repo_id, subfolder="scheduler")
|
||||
dpm = DPMSolverMultistepScheduler.from_pretrained(repo_id, subfolder="scheduler")
|
||||
|
||||
# replace `dpm` with any of `ddpm`, `ddim`, `pndm`, `lms`, `euler_anc`, `euler`
|
||||
pipeline = StableDiffusionPipeline.from_pretrained(repo_id, scheduler=dpm, use_safetensors=True)
|
||||
```
|
||||
|
||||
## DiffusionPipeline explained
|
||||
|
||||
As a class method, [`DiffusionPipeline.from_pretrained`] is responsible for two things:
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user