mirror of
https://github.com/huggingface/diffusers.git
synced 2025-12-07 04:54:47 +08:00
Compare commits
116 Commits
fix-single
...
fix-mirror
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
bae4e5f8af | ||
|
|
7f51f286a5 | ||
|
|
829f6defa4 | ||
|
|
24bdf4b215 | ||
|
|
95e0c3757d | ||
|
|
6cf0be5d3d | ||
|
|
ec068f9b5b | ||
|
|
0240d4191a | ||
|
|
04717fd861 | ||
|
|
6fd458e99d | ||
|
|
1066fe4cbc | ||
|
|
d38f69ea25 | ||
|
|
0a1c13af79 | ||
|
|
0028c34432 | ||
|
|
d457beed92 | ||
|
|
1d9a6a81b9 | ||
|
|
4e0984db6c | ||
|
|
83bc6c94ea | ||
|
|
0d68ddf327 | ||
|
|
7d887118b9 | ||
|
|
b63c956860 | ||
|
|
716b2062bf | ||
|
|
5fd6825d25 | ||
|
|
e0fae6fd73 | ||
|
|
ec1aded12e | ||
|
|
151a56b80e | ||
|
|
a3faf3f260 | ||
|
|
867a2b0cf9 | ||
|
|
98730c5dd7 | ||
|
|
7ebd359446 | ||
|
|
d3881f35b7 | ||
|
|
48207d6689 | ||
|
|
2f6f426f66 | ||
|
|
a0542c1917 | ||
|
|
a8ad6664c2 | ||
|
|
14f7b545bd | ||
|
|
07cd20041c | ||
|
|
6ddbf6222c | ||
|
|
3ff39e8e86 | ||
|
|
6be43bd855 | ||
|
|
dc89434bdc | ||
|
|
4d633bfe9a | ||
|
|
174cf868ea | ||
|
|
413604405f | ||
|
|
bc108e1533 | ||
|
|
86555c9f59 | ||
|
|
983dec3bf7 | ||
|
|
f9fa8a868c | ||
|
|
05be622b1c | ||
|
|
352d96eb82 | ||
|
|
3511a9623f | ||
|
|
42cae93b94 | ||
|
|
a2ecce26bc | ||
|
|
9e00b727ad | ||
|
|
f7a4626f4b | ||
|
|
f4a44b7707 | ||
|
|
3bc3b48c10 | ||
|
|
581d8aacf7 | ||
|
|
ba1bfac20b | ||
|
|
5edd0b34fa | ||
|
|
3a28e36aa1 | ||
|
|
3393c01c9d | ||
|
|
1fa8dbc63a | ||
|
|
0ab6dc0f23 | ||
|
|
b2030a249c | ||
|
|
67bef2027c | ||
|
|
aa676c641f | ||
|
|
e6df8edadc | ||
|
|
80cfaebaa1 | ||
|
|
ba82414106 | ||
|
|
fe5f035f79 | ||
|
|
b3d10d6d65 | ||
|
|
b82f9f5666 | ||
|
|
6a5ba1b719 | ||
|
|
4d40c9140c | ||
|
|
0ab63ff647 | ||
|
|
db33af065b | ||
|
|
1096f88e2b | ||
|
|
cef4a51223 | ||
|
|
edf5ba6a17 | ||
|
|
9941f1f61b | ||
|
|
46a9db0336 | ||
|
|
370146e4e0 | ||
|
|
5cd45c24bf | ||
|
|
67b3fe0aae | ||
|
|
baab065679 | ||
|
|
509741aea7 | ||
|
|
e1df77ee1e | ||
|
|
fdb1baa05c | ||
|
|
6529ee67ec | ||
|
|
df2bc5ef28 | ||
|
|
a7bf77fc28 | ||
|
|
0f0defdb65 | ||
|
|
19df9f3ec0 | ||
|
|
d6ca120987 | ||
|
|
fb7ae0184f | ||
|
|
70f8d4b488 | ||
|
|
6c60e430ee | ||
|
|
1221b28eac | ||
|
|
746f603b20 | ||
|
|
2afea72d29 | ||
|
|
0f111ab794 | ||
|
|
4dd7aaa06f | ||
|
|
d27e996ccd | ||
|
|
72780ff5b1 | ||
|
|
69fdb8720f | ||
|
|
b2140a895b | ||
|
|
e0e8c58f64 | ||
|
|
cbea5d1725 | ||
|
|
a1245c2c61 | ||
|
|
cdda94f412 | ||
|
|
5b830aa356 | ||
|
|
9e7bae9881 | ||
|
|
b41ce1e090 | ||
|
|
95d3748453 | ||
|
|
44aa9e566d |
2
.github/workflows/benchmark.yml
vendored
2
.github/workflows/benchmark.yml
vendored
@@ -39,7 +39,7 @@ jobs:
|
||||
python utils/print_env.py
|
||||
- name: Diffusers Benchmarking
|
||||
env:
|
||||
HUGGING_FACE_HUB_TOKEN: ${{ secrets.DIFFUSERS_BOT_TOKEN }}
|
||||
HF_TOKEN: ${{ secrets.DIFFUSERS_BOT_TOKEN }}
|
||||
BASE_PATH: benchmark_outputs
|
||||
run: |
|
||||
export TOTAL_GPU_MEMORY=$(python -c "import torch; print(torch.cuda.get_device_properties(0).total_memory / (1024**3))")
|
||||
|
||||
32
.github/workflows/build_docker_images.yml
vendored
32
.github/workflows/build_docker_images.yml
vendored
@@ -25,17 +25,17 @@ jobs:
|
||||
steps:
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v1
|
||||
|
||||
|
||||
- name: Check out code
|
||||
uses: actions/checkout@v3
|
||||
|
||||
|
||||
- name: Find Changed Dockerfiles
|
||||
id: file_changes
|
||||
uses: jitterbit/get-changed-files@v1
|
||||
with:
|
||||
format: 'space-delimited'
|
||||
token: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
|
||||
- name: Build Changed Docker Images
|
||||
run: |
|
||||
CHANGED_FILES="${{ steps.file_changes.outputs.all }}"
|
||||
@@ -52,7 +52,7 @@ jobs:
|
||||
build-and-push-docker-images:
|
||||
runs-on: [ self-hosted, intel-cpu, 8-cpu, ci ]
|
||||
if: github.event_name != 'pull_request'
|
||||
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
packages: write
|
||||
@@ -69,6 +69,7 @@ jobs:
|
||||
- diffusers-flax-tpu
|
||||
- diffusers-onnxruntime-cpu
|
||||
- diffusers-onnxruntime-cuda
|
||||
- diffusers-doc-builder
|
||||
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
@@ -90,24 +91,11 @@ jobs:
|
||||
|
||||
- name: Post to a Slack channel
|
||||
id: slack
|
||||
uses: slackapi/slack-github-action@6c661ce58804a1a20f6dc5fbee7f0381b469e001
|
||||
uses: huggingface/hf-workflows/.github/actions/post-slack@main
|
||||
with:
|
||||
# Slack channel id, channel name, or user id to post message.
|
||||
# See also: https://api.slack.com/methods/chat.postMessage#channels
|
||||
channel-id: ${{ env.CI_SLACK_CHANNEL }}
|
||||
# For posting a rich message using Block Kit
|
||||
payload: |
|
||||
{
|
||||
"text": "${{ matrix.image-name }} Docker Image build result: ${{ job.status }}\n${{ github.event.head_commit.url }}",
|
||||
"blocks": [
|
||||
{
|
||||
"type": "section",
|
||||
"text": {
|
||||
"type": "mrkdwn",
|
||||
"text": "${{ matrix.image-name }} Docker Image build result: ${{ job.status }}\n${{ github.event.head_commit.url }}"
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
env:
|
||||
SLACK_BOT_TOKEN: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
|
||||
slack_channel: ${{ env.CI_SLACK_CHANNEL }}
|
||||
title: "🤗 Results of the ${{ matrix.image-name }} Docker Image build"
|
||||
status: ${{ job.status }}
|
||||
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
|
||||
2
.github/workflows/build_documentation.yml
vendored
2
.github/workflows/build_documentation.yml
vendored
@@ -21,7 +21,7 @@ jobs:
|
||||
package: diffusers
|
||||
notebook_folder: diffusers_doc
|
||||
languages: en ko zh ja pt
|
||||
|
||||
custom_container: diffusers/diffusers-doc-builder
|
||||
secrets:
|
||||
token: ${{ secrets.HUGGINGFACE_PUSH }}
|
||||
hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }}
|
||||
|
||||
1
.github/workflows/build_pr_documentation.yml
vendored
1
.github/workflows/build_pr_documentation.yml
vendored
@@ -20,3 +20,4 @@ jobs:
|
||||
install_libgl1: true
|
||||
package: diffusers
|
||||
languages: en ko zh ja pt
|
||||
custom_container: diffusers/diffusers-doc-builder
|
||||
|
||||
89
.github/workflows/mirror_community_pipeline.yml
vendored
Normal file
89
.github/workflows/mirror_community_pipeline.yml
vendored
Normal file
@@ -0,0 +1,89 @@
|
||||
name: Mirror Community Pipeline
|
||||
|
||||
on:
|
||||
# Push changes on the main branch
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
paths:
|
||||
- 'examples/community/**.py'
|
||||
|
||||
# And on tag creation (e.g. `v0.28.1`)
|
||||
tags:
|
||||
- '*'
|
||||
|
||||
# Manual trigger with ref input
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
ref:
|
||||
description: "Either 'main' or a tag ref"
|
||||
required: true
|
||||
default: 'main'
|
||||
|
||||
jobs:
|
||||
mirror_community_pipeline:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
# Checkout to correct ref
|
||||
# If workflow dispatch
|
||||
# If ref is 'main', set:
|
||||
# CHECKOUT_REF=refs/heads/main
|
||||
# PATH_IN_REPO=main
|
||||
# Else it must be a tag. Set:
|
||||
# CHECKOUT_REF=refs/tags/{tag}
|
||||
# PATH_IN_REPO={tag}
|
||||
# If not workflow dispatch
|
||||
# If ref is 'refs/heads/main' => set 'main'
|
||||
# Else it must be a tag => set {tag}
|
||||
- name: Set checkout_ref and path_in_repo
|
||||
run: |
|
||||
if [ "${{ github.event_name }}" == "workflow_dispatch" ]; then
|
||||
if [ -z "${{ github.event.inputs.ref }}" ]; then
|
||||
echo "Error: Missing ref input"
|
||||
exit 1
|
||||
elif [ "${{ github.event.inputs.ref }}" == "main" ]; then
|
||||
echo "CHECKOUT_REF=refs/heads/main" >> $GITHUB_ENV
|
||||
echo "PATH_IN_REPO=main" >> $GITHUB_ENV
|
||||
else
|
||||
echo "CHECKOUT_REF=refs/tags/${{ github.event.inputs.ref }}" >> $GITHUB_ENV
|
||||
echo "PATH_IN_REPO=${{ github.event.inputs.ref }}" >> $GITHUB_ENV
|
||||
fi
|
||||
elif [ "${{ github.ref }}" == "refs/heads/main" ]; then
|
||||
echo "CHECKOUT_REF=${{ github.ref }}" >> $GITHUB_ENV
|
||||
echo "PATH_IN_REPO=main" >> $GITHUB_ENV
|
||||
else
|
||||
# e.g. refs/tags/v0.28.1 -> v0.28.1
|
||||
echo "CHECKOUT_REF=${{ github.ref }}" >> $GITHUB_ENV
|
||||
echo "PATH_IN_REPO=$(echo ${{ github.ref }} | sed 's/^refs\/tags\///')" >> $GITHUB_ENV
|
||||
fi
|
||||
- name: Print env vars
|
||||
run: |
|
||||
echo "CHECKOUT_REF: ${{ env.CHECKOUT_REF }}"
|
||||
echo "PATH_IN_REPO: ${{ env.PATH_IN_REPO }}"
|
||||
- uses: actions/checkout@v3
|
||||
with:
|
||||
ref: ${{ env.CHECKOUT_REF }}
|
||||
|
||||
# Setup + install dependencies
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: "3.10"
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install --upgrade huggingface_hub
|
||||
|
||||
# Check secret is set
|
||||
- name: whoami
|
||||
run: huggingface-cli whoami
|
||||
env:
|
||||
HF_TOKEN: ${{ secrets.HF_TOKEN_MIRROR_COMMUNITY_PIPELINES }}
|
||||
|
||||
# Push to HF! (under subfolder based on checkout ref)
|
||||
# https://huggingface.co/datasets/diffusers/community-pipelines-mirror
|
||||
- name: Mirror community pipeline to HF
|
||||
run: huggingface-cli upload diffusers/community-pipelines-mirror ./examples/community ${PATH_IN_REPO} --repo-type dataset
|
||||
env:
|
||||
PATH_IN_REPO: ${{ env.PATH_IN_REPO }}
|
||||
HF_TOKEN: ${{ secrets.HF_TOKEN_MIRROR_COMMUNITY_PIPELINES }}
|
||||
16
.github/workflows/nightly_tests.yml
vendored
16
.github/workflows/nightly_tests.yml
vendored
@@ -59,7 +59,7 @@ jobs:
|
||||
runs-on: [single-gpu, nvidia-gpu, t4, ci]
|
||||
container:
|
||||
image: diffusers/diffusers-pytorch-cuda
|
||||
options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0
|
||||
options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface/diffusers:/mnt/cache/ --gpus 0
|
||||
steps:
|
||||
- name: Checkout diffusers
|
||||
uses: actions/checkout@v3
|
||||
@@ -81,7 +81,7 @@ jobs:
|
||||
|
||||
- name: Nightly PyTorch CUDA checkpoint (pipelines) tests
|
||||
env:
|
||||
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
|
||||
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
||||
# https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
|
||||
CUBLAS_WORKSPACE_CONFIG: :16:8
|
||||
run: |
|
||||
@@ -141,7 +141,7 @@ jobs:
|
||||
- name: Run nightly PyTorch CUDA tests for non-pipeline modules
|
||||
if: ${{ matrix.module != 'examples'}}
|
||||
env:
|
||||
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
|
||||
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
||||
# https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
|
||||
CUBLAS_WORKSPACE_CONFIG: :16:8
|
||||
run: |
|
||||
@@ -154,7 +154,7 @@ jobs:
|
||||
- name: Run nightly example tests with Torch
|
||||
if: ${{ matrix.module == 'examples' }}
|
||||
env:
|
||||
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
|
||||
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
||||
# https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
|
||||
CUBLAS_WORKSPACE_CONFIG: :16:8
|
||||
run: |
|
||||
@@ -211,7 +211,7 @@ jobs:
|
||||
|
||||
- name: Run nightly LoRA tests with PEFT and Torch
|
||||
env:
|
||||
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
|
||||
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
||||
# https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
|
||||
CUBLAS_WORKSPACE_CONFIG: :16:8
|
||||
run: |
|
||||
@@ -269,7 +269,7 @@ jobs:
|
||||
|
||||
- name: Run nightly Flax TPU tests
|
||||
env:
|
||||
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
|
||||
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
||||
run: |
|
||||
python -m pytest -n 0 \
|
||||
-s -v -k "Flax" \
|
||||
@@ -324,7 +324,7 @@ jobs:
|
||||
|
||||
- name: Run nightly ONNXRuntime CUDA tests
|
||||
env:
|
||||
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
|
||||
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
||||
run: |
|
||||
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
|
||||
-s -v -k "Onnx" \
|
||||
@@ -390,7 +390,7 @@ jobs:
|
||||
shell: arch -arch arm64 bash {0}
|
||||
env:
|
||||
HF_HOME: /System/Volumes/Data/mnt/cache
|
||||
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
|
||||
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
||||
run: |
|
||||
${CONDA_RUN} python -m pytest -n 1 -s -v --make-reports=tests_torch_mps \
|
||||
--report-log=tests_torch_mps.log \
|
||||
|
||||
18
.github/workflows/pr_test_peft_backend.yml
vendored
18
.github/workflows/pr_test_peft_backend.yml
vendored
@@ -111,3 +111,21 @@ jobs:
|
||||
-s -v \
|
||||
--make-reports=tests_${{ matrix.config.report }} \
|
||||
tests/lora/
|
||||
python -m pytest -n 4 --max-worker-restart=0 --dist=loadfile \
|
||||
-s -v \
|
||||
--make-reports=tests_models_lora_${{ matrix.config.report }} \
|
||||
tests/models/ -k "lora"
|
||||
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
run: |
|
||||
cat reports/tests_${{ matrix.config.report }}_failures_short.txt
|
||||
cat reports/tests_models_lora_${{ matrix.config.report }}_failures_short.txt
|
||||
|
||||
- name: Test suite reports artifacts
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@v2
|
||||
with:
|
||||
name: pr_${{ matrix.config.report }}_test_reports
|
||||
path: reports
|
||||
2
.github/workflows/pr_tests.yml
vendored
2
.github/workflows/pr_tests.yml
vendored
@@ -156,7 +156,7 @@ jobs:
|
||||
if: ${{ matrix.config.framework == 'pytorch_examples' }}
|
||||
run: |
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m uv pip install peft
|
||||
python -m uv pip install peft timm
|
||||
python -m pytest -n 4 --max-worker-restart=0 --dist=loadfile \
|
||||
--make-reports=tests_${{ matrix.config.report }} \
|
||||
examples
|
||||
|
||||
37
.github/workflows/push_tests.yml
vendored
37
.github/workflows/push_tests.yml
vendored
@@ -62,7 +62,7 @@ jobs:
|
||||
runs-on: [single-gpu, nvidia-gpu, t4, ci]
|
||||
container:
|
||||
image: diffusers/diffusers-pytorch-cuda
|
||||
options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface/diffusers:/mnt/cache/ --gpus 0 --privileged
|
||||
options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface/diffusers:/mnt/cache/ --gpus 0
|
||||
steps:
|
||||
- name: Checkout diffusers
|
||||
uses: actions/checkout@v3
|
||||
@@ -71,12 +71,6 @@ jobs:
|
||||
- name: NVIDIA-SMI
|
||||
run: |
|
||||
nvidia-smi
|
||||
- name: Tailscale
|
||||
uses: huggingface/tailscale-action@v1
|
||||
with:
|
||||
authkey: ${{ secrets.TAILSCALE_SSH_AUTHKEY }}
|
||||
slackChannel: ${{ secrets.SLACK_CIFEEDBACK_CHANNEL }}
|
||||
slackToken: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
@@ -87,7 +81,7 @@ jobs:
|
||||
python utils/print_env.py
|
||||
- name: Slow PyTorch CUDA checkpoint tests on Ubuntu
|
||||
env:
|
||||
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
|
||||
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
||||
# https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
|
||||
CUBLAS_WORKSPACE_CONFIG: :16:8
|
||||
run: |
|
||||
@@ -95,18 +89,11 @@ jobs:
|
||||
-s -v -k "not Flax and not Onnx" \
|
||||
--make-reports=tests_pipeline_${{ matrix.module }}_cuda \
|
||||
tests/pipelines/${{ matrix.module }}
|
||||
- name: Tailscale Wait
|
||||
if: ${{ failure() || runner.debug == '1' }}
|
||||
uses: huggingface/tailscale-action@v1
|
||||
with:
|
||||
waitForSSH: true
|
||||
authkey: ${{ secrets.TAILSCALE_SSH_AUTHKEY }}
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
run: |
|
||||
cat reports/tests_pipeline_${{ matrix.module }}_cuda_stats.txt
|
||||
cat reports/tests_pipeline_${{ matrix.module }}_cuda_failures_short.txt
|
||||
|
||||
- name: Test suite reports artifacts
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@v2
|
||||
@@ -144,7 +131,7 @@ jobs:
|
||||
|
||||
- name: Run slow PyTorch CUDA tests
|
||||
env:
|
||||
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
|
||||
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
||||
# https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
|
||||
CUBLAS_WORKSPACE_CONFIG: :16:8
|
||||
run: |
|
||||
@@ -194,7 +181,7 @@ jobs:
|
||||
|
||||
- name: Run slow PEFT CUDA tests
|
||||
env:
|
||||
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
|
||||
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
||||
# https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
|
||||
CUBLAS_WORKSPACE_CONFIG: :16:8
|
||||
run: |
|
||||
@@ -202,12 +189,17 @@ jobs:
|
||||
-s -v -k "not Flax and not Onnx and not PEFTLoRALoading" \
|
||||
--make-reports=tests_peft_cuda \
|
||||
tests/lora/
|
||||
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
|
||||
-s -v -k "lora and not Flax and not Onnx and not PEFTLoRALoading" \
|
||||
--make-reports=tests_peft_cuda_models_lora \
|
||||
tests/models/
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
run: |
|
||||
cat reports/tests_peft_cuda_stats.txt
|
||||
cat reports/tests_peft_cuda_failures_short.txt
|
||||
cat reports/tests_peft_cuda_models_lora_failures_short.txt
|
||||
|
||||
- name: Test suite reports artifacts
|
||||
if: ${{ always() }}
|
||||
@@ -243,7 +235,7 @@ jobs:
|
||||
|
||||
- name: Run slow Flax TPU tests
|
||||
env:
|
||||
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
|
||||
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
||||
run: |
|
||||
python -m pytest -n 0 \
|
||||
-s -v -k "Flax" \
|
||||
@@ -290,7 +282,7 @@ jobs:
|
||||
|
||||
- name: Run slow ONNXRuntime CUDA tests
|
||||
env:
|
||||
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
|
||||
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
||||
run: |
|
||||
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
|
||||
-s -v -k "Onnx" \
|
||||
@@ -337,7 +329,7 @@ jobs:
|
||||
python utils/print_env.py
|
||||
- name: Run example tests on GPU
|
||||
env:
|
||||
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
|
||||
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
||||
run: |
|
||||
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v -k "compile" --make-reports=tests_torch_compile_cuda tests/
|
||||
- name: Failure short reports
|
||||
@@ -378,7 +370,7 @@ jobs:
|
||||
python utils/print_env.py
|
||||
- name: Run example tests on GPU
|
||||
env:
|
||||
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
|
||||
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
||||
run: |
|
||||
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v -k "xformers" --make-reports=tests_torch_xformers_cuda tests/
|
||||
- name: Failure short reports
|
||||
@@ -423,9 +415,10 @@ jobs:
|
||||
|
||||
- name: Run example tests on GPU
|
||||
env:
|
||||
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
|
||||
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
||||
run: |
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m uv pip install timm
|
||||
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v --make-reports=examples_torch_cuda examples/
|
||||
|
||||
- name: Failure short reports
|
||||
|
||||
2
.github/workflows/push_tests_fast.yml
vendored
2
.github/workflows/push_tests_fast.yml
vendored
@@ -107,7 +107,7 @@ jobs:
|
||||
if: ${{ matrix.config.framework == 'pytorch_examples' }}
|
||||
run: |
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m uv pip install peft
|
||||
python -m uv pip install peft timm
|
||||
python -m pytest -n 4 --max-worker-restart=0 --dist=loadfile \
|
||||
--make-reports=tests_${{ matrix.config.report }} \
|
||||
examples
|
||||
|
||||
4
.github/workflows/push_tests_mps.yml
vendored
4
.github/workflows/push_tests_mps.yml
vendored
@@ -23,7 +23,7 @@ concurrency:
|
||||
jobs:
|
||||
run_fast_tests_apple_m1:
|
||||
name: Fast PyTorch MPS tests on MacOS
|
||||
runs-on: [ self-hosted, apple-m1 ]
|
||||
runs-on: macos-13-xlarge
|
||||
|
||||
steps:
|
||||
- name: Checkout diffusers
|
||||
@@ -59,7 +59,7 @@ jobs:
|
||||
shell: arch -arch arm64 bash {0}
|
||||
env:
|
||||
HF_HOME: /System/Volumes/Data/mnt/cache
|
||||
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
|
||||
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
||||
run: |
|
||||
${CONDA_RUN} python -m pytest -n 0 -s -v --make-reports=tests_torch_mps tests/
|
||||
|
||||
|
||||
73
.github/workflows/run_tests_from_a_pr.yml
vendored
Normal file
73
.github/workflows/run_tests_from_a_pr.yml
vendored
Normal file
@@ -0,0 +1,73 @@
|
||||
name: Check running SLOW tests from a PR (only GPU)
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
docker_image:
|
||||
default: 'diffusers/diffusers-pytorch-cuda'
|
||||
description: 'Name of the Docker image'
|
||||
required: true
|
||||
branch:
|
||||
description: 'PR Branch to test on'
|
||||
required: true
|
||||
test:
|
||||
description: 'Tests to run (e.g.: `tests/models`).'
|
||||
required: true
|
||||
|
||||
env:
|
||||
DIFFUSERS_IS_CI: yes
|
||||
IS_GITHUB_CI: "1"
|
||||
HF_HOME: /mnt/cache
|
||||
OMP_NUM_THREADS: 8
|
||||
MKL_NUM_THREADS: 8
|
||||
PYTEST_TIMEOUT: 600
|
||||
RUN_SLOW: yes
|
||||
|
||||
jobs:
|
||||
run_tests:
|
||||
name: "Run a test on our runner from a PR"
|
||||
runs-on: [single-gpu, nvidia-gpu, t4, ci]
|
||||
container:
|
||||
image: ${{ github.event.inputs.docker_image }}
|
||||
options: --gpus 0 --privileged --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
|
||||
steps:
|
||||
- name: Validate test files input
|
||||
id: validate_test_files
|
||||
env:
|
||||
PY_TEST: ${{ github.event.inputs.test }}
|
||||
run: |
|
||||
if [[ ! "$PY_TEST" =~ ^tests/ ]]; then
|
||||
echo "Error: The input string must start with 'tests/'."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [[ ! "$PY_TEST" =~ ^tests/(models|pipelines) ]]; then
|
||||
echo "Error: The input string must contain either 'models' or 'pipelines' after 'tests/'."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [[ "$PY_TEST" == *";"* ]]; then
|
||||
echo "Error: The input string must not contain ';'."
|
||||
exit 1
|
||||
fi
|
||||
echo "$PY_TEST"
|
||||
|
||||
- name: Checkout PR branch
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
ref: ${{ github.event.inputs.branch }}
|
||||
repository: ${{ github.event.pull_request.head.repo.full_name }}
|
||||
|
||||
|
||||
- name: Install pytest
|
||||
run: |
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m uv pip install -e [quality,test]
|
||||
python -m uv pip install peft
|
||||
|
||||
- name: Run tests
|
||||
env:
|
||||
PY_TEST: ${{ github.event.inputs.test }}
|
||||
run: |
|
||||
pytest "$PY_TEST"
|
||||
4
.github/workflows/ssh-runner.yml
vendored
4
.github/workflows/ssh-runner.yml
vendored
@@ -25,7 +25,7 @@ jobs:
|
||||
runs-on: [single-gpu, nvidia-gpu, "${{ github.event.inputs.runner_type }}", ci]
|
||||
container:
|
||||
image: ${{ github.event.inputs.docker_image }}
|
||||
options: --gpus all --privileged --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface/diffusers:/mnt/cache/ --gpus 0 --privileged
|
||||
|
||||
steps:
|
||||
- name: Checkout diffusers
|
||||
@@ -38,7 +38,7 @@ jobs:
|
||||
nvidia-smi
|
||||
|
||||
- name: Tailscale # In order to be able to SSH when a test fails
|
||||
uses: huggingface/tailscale-action@v1
|
||||
uses: huggingface/tailscale-action@main
|
||||
with:
|
||||
authkey: ${{ secrets.TAILSCALE_SSH_AUTHKEY }}
|
||||
slackChannel: ${{ secrets.SLACK_CIFEEDBACK_CHANNEL }}
|
||||
|
||||
15
.github/workflows/trufflehog.yml
vendored
Normal file
15
.github/workflows/trufflehog.yml
vendored
Normal file
@@ -0,0 +1,15 @@
|
||||
on:
|
||||
push:
|
||||
|
||||
name: Secret Leaks
|
||||
|
||||
jobs:
|
||||
trufflehog:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
- name: Secret Scanning
|
||||
uses: trufflesecurity/trufflehog@main
|
||||
2
.github/workflows/update_metadata.yml
vendored
2
.github/workflows/update_metadata.yml
vendored
@@ -25,6 +25,6 @@ jobs:
|
||||
|
||||
- name: Update metadata
|
||||
env:
|
||||
HUGGING_FACE_HUB_TOKEN: ${{ secrets.SAYAK_HF_TOKEN }}
|
||||
HF_TOKEN: ${{ secrets.SAYAK_HF_TOKEN }}
|
||||
run: |
|
||||
python utils/update_metadata.py --commit_sha ${{ github.sha }}
|
||||
|
||||
@@ -77,7 +77,7 @@ Please refer to the [How to use Stable Diffusion in Apple Silicon](https://huggi
|
||||
|
||||
## Quickstart
|
||||
|
||||
Generating outputs is super easy with 🤗 Diffusers. To generate an image from text, use the `from_pretrained` method to load any pretrained diffusion model (browse the [Hub](https://huggingface.co/models?library=diffusers&sort=downloads) for 22000+ checkpoints):
|
||||
Generating outputs is super easy with 🤗 Diffusers. To generate an image from text, use the `from_pretrained` method to load any pretrained diffusion model (browse the [Hub](https://huggingface.co/models?library=diffusers&sort=downloads) for 25.000+ checkpoints):
|
||||
|
||||
```python
|
||||
from diffusers import DiffusionPipeline
|
||||
@@ -219,7 +219,7 @@ Also, say 👋 in our public Discord channel <a href="https://discord.gg/G7tWnz9
|
||||
- https://github.com/deep-floyd/IF
|
||||
- https://github.com/bentoml/BentoML
|
||||
- https://github.com/bmaltais/kohya_ss
|
||||
- +9000 other amazing GitHub repositories 💪
|
||||
- +11.000 other amazing GitHub repositories 💪
|
||||
|
||||
Thank you for using us ❤️.
|
||||
|
||||
|
||||
52
docker/diffusers-doc-builder/Dockerfile
Normal file
52
docker/diffusers-doc-builder/Dockerfile
Normal file
@@ -0,0 +1,52 @@
|
||||
FROM ubuntu:20.04
|
||||
LABEL maintainer="Hugging Face"
|
||||
LABEL repository="diffusers"
|
||||
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
|
||||
RUN apt-get -y update \
|
||||
&& apt-get install -y software-properties-common \
|
||||
&& add-apt-repository ppa:deadsnakes/ppa
|
||||
|
||||
RUN apt install -y bash \
|
||||
build-essential \
|
||||
git \
|
||||
git-lfs \
|
||||
curl \
|
||||
ca-certificates \
|
||||
libsndfile1-dev \
|
||||
python3.10 \
|
||||
python3-pip \
|
||||
libgl1 \
|
||||
zip \
|
||||
wget \
|
||||
python3.10-venv && \
|
||||
rm -rf /var/lib/apt/lists
|
||||
|
||||
# make sure to use venv
|
||||
RUN python3.10 -m venv /opt/venv
|
||||
ENV PATH="/opt/venv/bin:$PATH"
|
||||
|
||||
# pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
|
||||
RUN python3.10 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
|
||||
python3.10 -m uv pip install --no-cache-dir \
|
||||
torch \
|
||||
torchvision \
|
||||
torchaudio \
|
||||
invisible_watermark \
|
||||
--extra-index-url https://download.pytorch.org/whl/cpu && \
|
||||
python3.10 -m uv pip install --no-cache-dir \
|
||||
accelerate \
|
||||
datasets \
|
||||
hf-doc-builder \
|
||||
huggingface-hub \
|
||||
Jinja2 \
|
||||
librosa \
|
||||
numpy \
|
||||
scipy \
|
||||
tensorboard \
|
||||
transformers \
|
||||
matplotlib \
|
||||
setuptools==69.5.1
|
||||
|
||||
CMD ["/bin/bash"]
|
||||
@@ -29,10 +29,8 @@
|
||||
title: Load community pipelines and components
|
||||
- local: using-diffusers/schedulers
|
||||
title: Load schedulers and models
|
||||
- local: using-diffusers/using_safetensors
|
||||
title: Load safetensors
|
||||
- local: using-diffusers/other-formats
|
||||
title: Load different Stable Diffusion formats
|
||||
title: Model files and layouts
|
||||
- local: using-diffusers/loading_adapters
|
||||
title: Load adapters
|
||||
- local: using-diffusers/push_to_hub
|
||||
@@ -59,6 +57,8 @@
|
||||
title: Distributed inference with multiple GPUs
|
||||
- local: using-diffusers/merge_loras
|
||||
title: Merge LoRAs
|
||||
- local: using-diffusers/scheduler_features
|
||||
title: Scheduler features
|
||||
- local: using-diffusers/callback
|
||||
title: Pipeline callbacks
|
||||
- local: using-diffusers/reusing_seeds
|
||||
@@ -68,6 +68,10 @@
|
||||
- local: using-diffusers/weighted_prompts
|
||||
title: Prompt techniques
|
||||
title: Inference techniques
|
||||
- sections:
|
||||
- local: advanced_inference/outpaint
|
||||
title: Outpainting
|
||||
title: Advanced inference
|
||||
- sections:
|
||||
- local: using-diffusers/sdxl
|
||||
title: Stable Diffusion XL
|
||||
@@ -93,6 +97,8 @@
|
||||
title: Trajectory Consistency Distillation-LoRA
|
||||
- local: using-diffusers/svd
|
||||
title: Stable Video Diffusion
|
||||
- local: using-diffusers/marigold_usage
|
||||
title: Marigold Computer Vision
|
||||
title: Specific pipeline examples
|
||||
- sections:
|
||||
- local: training/overview
|
||||
@@ -101,7 +107,8 @@
|
||||
title: Create a dataset for training
|
||||
- local: training/adapt_a_model
|
||||
title: Adapt a model to a new task
|
||||
- sections:
|
||||
- isExpanded: false
|
||||
sections:
|
||||
- local: training/unconditional_training
|
||||
title: Unconditional image generation
|
||||
- local: training/text2image
|
||||
@@ -119,8 +126,8 @@
|
||||
- local: training/instructpix2pix
|
||||
title: InstructPix2Pix
|
||||
title: Models
|
||||
isExpanded: false
|
||||
- sections:
|
||||
- isExpanded: false
|
||||
sections:
|
||||
- local: training/text_inversion
|
||||
title: Textual Inversion
|
||||
- local: training/dreambooth
|
||||
@@ -134,7 +141,6 @@
|
||||
- local: training/ddpo
|
||||
title: Reinforcement learning training with DDPO
|
||||
title: Methods
|
||||
isExpanded: false
|
||||
title: Training
|
||||
- sections:
|
||||
- local: optimization/fp16
|
||||
@@ -181,7 +187,8 @@
|
||||
title: Evaluating Diffusion Models
|
||||
title: Conceptual Guides
|
||||
- sections:
|
||||
- sections:
|
||||
- isExpanded: false
|
||||
sections:
|
||||
- local: api/configuration
|
||||
title: Configuration
|
||||
- local: api/logging
|
||||
@@ -189,8 +196,8 @@
|
||||
- local: api/outputs
|
||||
title: Outputs
|
||||
title: Main Classes
|
||||
isExpanded: false
|
||||
- sections:
|
||||
- isExpanded: false
|
||||
sections:
|
||||
- local: api/loaders/ip_adapter
|
||||
title: IP-Adapter
|
||||
- local: api/loaders/lora
|
||||
@@ -204,8 +211,8 @@
|
||||
- local: api/loaders/peft
|
||||
title: PEFT
|
||||
title: Loaders
|
||||
isExpanded: false
|
||||
- sections:
|
||||
- isExpanded: false
|
||||
sections:
|
||||
- local: api/models/overview
|
||||
title: Overview
|
||||
- local: api/models/unet
|
||||
@@ -231,16 +238,24 @@
|
||||
- local: api/models/consistency_decoder_vae
|
||||
title: ConsistencyDecoderVAE
|
||||
- local: api/models/transformer2d
|
||||
title: Transformer2D
|
||||
title: Transformer2DModel
|
||||
- local: api/models/pixart_transformer2d
|
||||
title: PixArtTransformer2DModel
|
||||
- local: api/models/dit_transformer2d
|
||||
title: DiTTransformer2DModel
|
||||
- local: api/models/hunyuan_transformer2d
|
||||
title: HunyuanDiT2DModel
|
||||
- local: api/models/transformer_temporal
|
||||
title: Transformer Temporal
|
||||
title: TransformerTemporalModel
|
||||
- local: api/models/sd3_transformer2d
|
||||
title: SD3Transformer2DModel
|
||||
- local: api/models/prior_transformer
|
||||
title: Prior Transformer
|
||||
title: PriorTransformer
|
||||
- local: api/models/controlnet
|
||||
title: ControlNet
|
||||
title: ControlNetModel
|
||||
title: Models
|
||||
isExpanded: false
|
||||
- sections:
|
||||
- isExpanded: false
|
||||
sections:
|
||||
- local: api/pipelines/overview
|
||||
title: Overview
|
||||
- local: api/pipelines/amused
|
||||
@@ -279,6 +294,8 @@
|
||||
title: DiffEdit
|
||||
- local: api/pipelines/dit
|
||||
title: DiT
|
||||
- local: api/pipelines/hunyuandit
|
||||
title: Hunyuan-DiT
|
||||
- local: api/pipelines/i2vgenxl
|
||||
title: I2VGen-XL
|
||||
- local: api/pipelines/pix2pix
|
||||
@@ -295,6 +312,8 @@
|
||||
title: Latent Diffusion
|
||||
- local: api/pipelines/ledits_pp
|
||||
title: LEDITS++
|
||||
- local: api/pipelines/marigold
|
||||
title: Marigold
|
||||
- local: api/pipelines/panorama
|
||||
title: MultiDiffusion
|
||||
- local: api/pipelines/musicldm
|
||||
@@ -305,6 +324,8 @@
|
||||
title: Personalized Image Animator (PIA)
|
||||
- local: api/pipelines/pixart
|
||||
title: PixArt-α
|
||||
- local: api/pipelines/pixart_sigma
|
||||
title: PixArt-Σ
|
||||
- local: api/pipelines/self_attention_guidance
|
||||
title: Self-Attention Guidance
|
||||
- local: api/pipelines/semantic_stable_diffusion
|
||||
@@ -332,6 +353,8 @@
|
||||
title: Safe Stable Diffusion
|
||||
- local: api/pipelines/stable_diffusion/stable_diffusion_2
|
||||
title: Stable Diffusion 2
|
||||
- local: api/pipelines/stable_diffusion/stable_diffusion_3
|
||||
title: Stable Diffusion 3
|
||||
- local: api/pipelines/stable_diffusion/stable_diffusion_xl
|
||||
title: Stable Diffusion XL
|
||||
- local: api/pipelines/stable_diffusion/sdxl_turbo
|
||||
@@ -364,8 +387,8 @@
|
||||
- local: api/pipelines/wuerstchen
|
||||
title: Wuerstchen
|
||||
title: Pipelines
|
||||
isExpanded: false
|
||||
- sections:
|
||||
- isExpanded: false
|
||||
sections:
|
||||
- local: api/schedulers/overview
|
||||
title: Overview
|
||||
- local: api/schedulers/cm_stochastic_iterative
|
||||
@@ -396,6 +419,8 @@
|
||||
title: EulerAncestralDiscreteScheduler
|
||||
- local: api/schedulers/euler
|
||||
title: EulerDiscreteScheduler
|
||||
- local: api/schedulers/flow_match_euler_discrete
|
||||
title: FlowMatchEulerDiscreteScheduler
|
||||
- local: api/schedulers/heun
|
||||
title: HeunDiscreteScheduler
|
||||
- local: api/schedulers/ipndm
|
||||
@@ -425,8 +450,8 @@
|
||||
- local: api/schedulers/vq_diffusion
|
||||
title: VQDiffusionScheduler
|
||||
title: Schedulers
|
||||
isExpanded: false
|
||||
- sections:
|
||||
- isExpanded: false
|
||||
sections:
|
||||
- local: api/internal_classes_overview
|
||||
title: Overview
|
||||
- local: api/attnprocessor
|
||||
@@ -442,5 +467,4 @@
|
||||
- local: api/video_processor
|
||||
title: Video Processor
|
||||
title: Internal classes
|
||||
isExpanded: false
|
||||
title: API
|
||||
|
||||
231
docs/source/en/advanced_inference/outpaint.md
Normal file
231
docs/source/en/advanced_inference/outpaint.md
Normal file
@@ -0,0 +1,231 @@
|
||||
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Outpainting
|
||||
|
||||
Outpainting extends an image beyond its original boundaries, allowing you to add, replace, or modify visual elements in an image while preserving the original image. Like [inpainting](../using-diffusers/inpaint), you want to fill the white area (in this case, the area outside of the original image) with new visual elements while keeping the original image (represented by a mask of black pixels). There are a couple of ways to outpaint, such as with a [ControlNet](https://hf.co/blog/OzzyGT/outpainting-controlnet) or with [Differential Diffusion](https://hf.co/blog/OzzyGT/outpainting-differential-diffusion).
|
||||
|
||||
This guide will show you how to outpaint with an inpainting model, ControlNet, and a ZoeDepth estimator.
|
||||
|
||||
Before you begin, make sure you have the [controlnet_aux](https://github.com/huggingface/controlnet_aux) library installed so you can use the ZoeDepth estimator.
|
||||
|
||||
```py
|
||||
!pip install -q controlnet_aux
|
||||
```
|
||||
|
||||
## Image preparation
|
||||
|
||||
Start by picking an image to outpaint with and remove the background with a Space like [BRIA-RMBG-1.4](https://hf.co/spaces/briaai/BRIA-RMBG-1.4).
|
||||
|
||||
<iframe
|
||||
src="https://briaai-bria-rmbg-1-4.hf.space"
|
||||
frameborder="0"
|
||||
width="850"
|
||||
height="450"
|
||||
></iframe>
|
||||
|
||||
For example, remove the background from this image of a pair of shoes.
|
||||
|
||||
<div class="flex flex-row gap-4">
|
||||
<div class="flex-1">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/stevhliu/testing-images/resolve/main/original-jordan.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">original image</figcaption>
|
||||
</div>
|
||||
<div class="flex-1">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/stevhliu/testing-images/resolve/main/no-background-jordan.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">background removed</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
[Stable Diffusion XL (SDXL)](../using-diffusers/sdxl) models work best with 1024x1024 images, but you can resize the image to any size as long as your hardware has enough memory to support it. The transparent background in the image should also be replaced with a white background. Create a function (like the one below) that scales and pastes the image onto a white background.
|
||||
|
||||
```py
|
||||
import random
|
||||
|
||||
import requests
|
||||
import torch
|
||||
from controlnet_aux import ZoeDetector
|
||||
from PIL import Image, ImageOps
|
||||
|
||||
from diffusers import (
|
||||
AutoencoderKL,
|
||||
ControlNetModel,
|
||||
StableDiffusionXLControlNetPipeline,
|
||||
StableDiffusionXLInpaintPipeline,
|
||||
)
|
||||
|
||||
def scale_and_paste(original_image):
|
||||
aspect_ratio = original_image.width / original_image.height
|
||||
|
||||
if original_image.width > original_image.height:
|
||||
new_width = 1024
|
||||
new_height = round(new_width / aspect_ratio)
|
||||
else:
|
||||
new_height = 1024
|
||||
new_width = round(new_height * aspect_ratio)
|
||||
|
||||
resized_original = original_image.resize((new_width, new_height), Image.LANCZOS)
|
||||
white_background = Image.new("RGBA", (1024, 1024), "white")
|
||||
x = (1024 - new_width) // 2
|
||||
y = (1024 - new_height) // 2
|
||||
white_background.paste(resized_original, (x, y), resized_original)
|
||||
|
||||
return resized_original, white_background
|
||||
|
||||
original_image = Image.open(
|
||||
requests.get(
|
||||
"https://huggingface.co/datasets/stevhliu/testing-images/resolve/main/no-background-jordan.png",
|
||||
stream=True,
|
||||
).raw
|
||||
).convert("RGBA")
|
||||
resized_img, white_bg_image = scale_and_paste(original_image)
|
||||
```
|
||||
|
||||
To avoid adding unwanted extra details, use the ZoeDepth estimator to provide additional guidance during generation and to ensure the shoes remain consistent with the original image.
|
||||
|
||||
```py
|
||||
zoe = ZoeDetector.from_pretrained("lllyasviel/Annotators")
|
||||
image_zoe = zoe(white_bg_image, detect_resolution=512, image_resolution=1024)
|
||||
image_zoe
|
||||
```
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/stevhliu/testing-images/resolve/main/zoedepth-jordan.png"/>
|
||||
</div>
|
||||
|
||||
## Outpaint
|
||||
|
||||
Once your image is ready, you can generate content in the white area around the shoes with [controlnet-inpaint-dreamer-sdxl](https://hf.co/destitech/controlnet-inpaint-dreamer-sdxl), a SDXL ControlNet trained for inpainting.
|
||||
|
||||
Load the inpainting ControlNet, ZoeDepth model, VAE and pass them to the [`StableDiffusionXLControlNetPipeline`]. Then you can create an optional `generate_image` function (for convenience) to outpaint an initial image.
|
||||
|
||||
```py
|
||||
controlnets = [
|
||||
ControlNetModel.from_pretrained(
|
||||
"destitech/controlnet-inpaint-dreamer-sdxl", torch_dtype=torch.float16, variant="fp16"
|
||||
),
|
||||
ControlNetModel.from_pretrained(
|
||||
"diffusers/controlnet-zoe-depth-sdxl-1.0", torch_dtype=torch.float16
|
||||
),
|
||||
]
|
||||
vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16).to("cuda")
|
||||
pipeline = StableDiffusionXLControlNetPipeline.from_pretrained(
|
||||
"SG161222/RealVisXL_V4.0", torch_dtype=torch.float16, variant="fp16", controlnet=controlnets, vae=vae
|
||||
).to("cuda")
|
||||
|
||||
def generate_image(prompt, negative_prompt, inpaint_image, zoe_image, seed: int = None):
|
||||
if seed is None:
|
||||
seed = random.randint(0, 2**32 - 1)
|
||||
|
||||
generator = torch.Generator(device="cpu").manual_seed(seed)
|
||||
|
||||
image = pipeline(
|
||||
prompt,
|
||||
negative_prompt=negative_prompt,
|
||||
image=[inpaint_image, zoe_image],
|
||||
guidance_scale=6.5,
|
||||
num_inference_steps=25,
|
||||
generator=generator,
|
||||
controlnet_conditioning_scale=[0.5, 0.8],
|
||||
control_guidance_end=[0.9, 0.6],
|
||||
).images[0]
|
||||
|
||||
return image
|
||||
|
||||
prompt = "nike air jordans on a basketball court"
|
||||
negative_prompt = ""
|
||||
|
||||
temp_image = generate_image(prompt, negative_prompt, white_bg_image, image_zoe, 908097)
|
||||
```
|
||||
|
||||
Paste the original image over the initial outpainted image. You'll improve the outpainted background in a later step.
|
||||
|
||||
```py
|
||||
x = (1024 - resized_img.width) // 2
|
||||
y = (1024 - resized_img.height) // 2
|
||||
temp_image.paste(resized_img, (x, y), resized_img)
|
||||
temp_image
|
||||
```
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/stevhliu/testing-images/resolve/main/initial-outpaint.png"/>
|
||||
</div>
|
||||
|
||||
> [!TIP]
|
||||
> Now is a good time to free up some memory if you're running low!
|
||||
>
|
||||
> ```py
|
||||
> pipeline=None
|
||||
> torch.cuda.empty_cache()
|
||||
> ```
|
||||
|
||||
Now that you have an initial outpainted image, load the [`StableDiffusionXLInpaintPipeline`] with the [RealVisXL](https://hf.co/SG161222/RealVisXL_V4.0) model to generate the final outpainted image with better quality.
|
||||
|
||||
```py
|
||||
pipeline = StableDiffusionXLInpaintPipeline.from_pretrained(
|
||||
"OzzyGT/RealVisXL_V4.0_inpainting",
|
||||
torch_dtype=torch.float16,
|
||||
variant="fp16",
|
||||
vae=vae,
|
||||
).to("cuda")
|
||||
```
|
||||
|
||||
Prepare a mask for the final outpainted image. To create a more natural transition between the original image and the outpainted background, blur the mask to help it blend better.
|
||||
|
||||
```py
|
||||
mask = Image.new("L", temp_image.size)
|
||||
mask.paste(resized_img.split()[3], (x, y))
|
||||
mask = ImageOps.invert(mask)
|
||||
final_mask = mask.point(lambda p: p > 128 and 255)
|
||||
mask_blurred = pipeline.mask_processor.blur(final_mask, blur_factor=20)
|
||||
mask_blurred
|
||||
```
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/stevhliu/testing-images/resolve/main/blurred-mask.png"/>
|
||||
</div>
|
||||
|
||||
Create a better prompt and pass it to the `generate_outpaint` function to generate the final outpainted image. Again, paste the original image over the final outpainted background.
|
||||
|
||||
```py
|
||||
def generate_outpaint(prompt, negative_prompt, image, mask, seed: int = None):
|
||||
if seed is None:
|
||||
seed = random.randint(0, 2**32 - 1)
|
||||
|
||||
generator = torch.Generator(device="cpu").manual_seed(seed)
|
||||
|
||||
image = pipeline(
|
||||
prompt,
|
||||
negative_prompt=negative_prompt,
|
||||
image=image,
|
||||
mask_image=mask,
|
||||
guidance_scale=10.0,
|
||||
strength=0.8,
|
||||
num_inference_steps=30,
|
||||
generator=generator,
|
||||
).images[0]
|
||||
|
||||
return image
|
||||
|
||||
prompt = "high quality photo of nike air jordans on a basketball court, highly detailed"
|
||||
negative_prompt = ""
|
||||
|
||||
final_image = generate_outpaint(prompt, negative_prompt, temp_image, mask_blurred, 7688778)
|
||||
x = (1024 - resized_img.width) // 2
|
||||
y = (1024 - resized_img.height) // 2
|
||||
final_image.paste(resized_img, (x, y), resized_img)
|
||||
final_image
|
||||
```
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/stevhliu/testing-images/resolve/main/final-outpaint.png"/>
|
||||
</div>
|
||||
@@ -10,13 +10,17 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Loading Pipelines and Models via `from_single_file`
|
||||
# Single files
|
||||
|
||||
The `from_single_file` method allows you to load supported pipelines using a single checkpoint file as opposed to the folder format used by Diffusers. This is useful if you are working with many of the Stable Diffusion Web UI's (such as A1111) that extensively rely on a single file to distribute all the components of a diffusion model.
|
||||
The [`~loaders.FromSingleFileMixin.from_single_file`] method allows you to load:
|
||||
|
||||
The `from_single_file` method also supports loading models in their originally distributed format. This means that supported models that have been finetuned with other services can be loaded directly into supported Diffusers model objects and pipelines.
|
||||
* a model stored in a single file, which is useful if you're working with models from the diffusion ecosystem, like Automatic1111, and commonly rely on a single-file layout to store and share models
|
||||
* a model stored in their originally distributed layout, which is useful if you're working with models finetuned with other services, and want to load it directly into Diffusers model objects and pipelines
|
||||
|
||||
## Pipelines that currently support `from_single_file` loading
|
||||
> [!TIP]
|
||||
> Read the [Model files and layouts](../../using-diffusers/other-formats) guide to learn more about the Diffusers-multifolder layout versus the single-file layout, and how to load models stored in these different layouts.
|
||||
|
||||
## Supported pipelines
|
||||
|
||||
- [`StableDiffusionPipeline`]
|
||||
- [`StableDiffusionImg2ImgPipeline`]
|
||||
@@ -39,206 +43,13 @@ The `from_single_file` method also supports loading models in their originally d
|
||||
- [`LEditsPPPipelineStableDiffusionXL`]
|
||||
- [`PIAPipeline`]
|
||||
|
||||
## Models that currently support `from_single_file` loading
|
||||
## Supported models
|
||||
|
||||
- [`UNet2DConditionModel`]
|
||||
- [`StableCascadeUNet`]
|
||||
- [`AutoencoderKL`]
|
||||
- [`ControlNetModel`]
|
||||
|
||||
## Usage Examples
|
||||
|
||||
## Loading a Pipeline using `from_single_file`
|
||||
|
||||
```python
|
||||
from diffusers import StableDiffusionXLPipeline
|
||||
|
||||
ckpt_path = "https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/blob/main/sd_xl_base_1.0_0.9vae.safetensors"
|
||||
pipe = StableDiffusionXLPipeline.from_single_file(ckpt_path)
|
||||
```
|
||||
|
||||
## Setting components in a Pipeline using `from_single_file`
|
||||
|
||||
Swap components of the pipeline by passing them directly to the `from_single_file` method. e.g If you would like use a different scheduler than the pipeline default.
|
||||
|
||||
```python
|
||||
from diffusers import StableDiffusionXLPipeline, DDIMScheduler
|
||||
|
||||
ckpt_path = "https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/blob/main/sd_xl_base_1.0_0.9vae.safetensors"
|
||||
|
||||
scheduler = DDIMScheduler()
|
||||
pipe = StableDiffusionXLPipeline.from_single_file(ckpt_path, scheduler=scheduler)
|
||||
|
||||
```
|
||||
|
||||
```python
|
||||
from diffusers import StableDiffusionPipeline, ControlNetModel
|
||||
|
||||
ckpt_path = "https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.safetensors"
|
||||
|
||||
controlnet = ControlNetModel.from_pretrained("https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.safetensors")
|
||||
pipe = StableDiffusionPipeline.from_single_file(ckpt_path, controlnet=controlnet)
|
||||
|
||||
```
|
||||
|
||||
## Loading a Model using `from_single_file`
|
||||
|
||||
```python
|
||||
from diffusers import StableCascadeUNet
|
||||
|
||||
ckpt_path = "https://huggingface.co/stabilityai/stable-cascade/blob/main/stage_b_lite.safetensors"
|
||||
model = StableCascadeUNet.from_single_file(ckpt_path)
|
||||
|
||||
```
|
||||
|
||||
## Using a Diffusers model repository to configure single file loading
|
||||
|
||||
Under the hood, `from_single_file` will try to determine a model repository to use to configure the components of the pipeline. You can also pass in a repository id to the `config` argument of the `from_single_file` method to explicitly set the repository to use.
|
||||
|
||||
```python
|
||||
from diffusers import StableDiffusionXLPipeline
|
||||
|
||||
ckpt_path = "https://huggingface.co/segmind/SSD-1B/blob/main/SSD-1B.safetensors"
|
||||
repo_id = "segmind/SSD-1B"
|
||||
|
||||
pipe = StableDiffusionXLPipeline.from_single_file(ckpt_path, config=repo_id)
|
||||
|
||||
```
|
||||
|
||||
## Override configuration options when using single file loading
|
||||
|
||||
Override the default model or pipeline configuration options when using `from_single_file` by passing in the relevant arguments directly to the `from_single_file` method. Any argument that is supported by the model or pipeline class can be configured in this way:
|
||||
|
||||
```python
|
||||
from diffusers import StableDiffusionXLInstructPix2PixPipeline
|
||||
|
||||
ckpt_path = "https://huggingface.co/stabilityai/cosxl/blob/main/cosxl_edit.safetensors"
|
||||
pipe = StableDiffusionXLInstructPix2PixPipeline.from_single_file(ckpt_path, config="diffusers/sdxl-instructpix2pix-768", is_cosxl_edit=True)
|
||||
|
||||
```
|
||||
|
||||
```python
|
||||
from diffusers import UNet2DConditionModel
|
||||
|
||||
ckpt_path = "https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/blob/main/sd_xl_base_1.0_0.9vae.safetensors"
|
||||
model = UNet2DConditionModel.from_single_file(ckpt_path, upcast_attention=True)
|
||||
|
||||
```
|
||||
|
||||
In the example above, since we explicitly passed `repo_id="segmind/SSD-1B"`, it will use this [configuration file](https://huggingface.co/segmind/SSD-1B/blob/main/unet/config.json) from the "unet" subfolder in `"segmind/SSD-1B"` to configure the unet component included in the checkpoint; Similarly, it will use the `config.json` file from `"vae"` subfolder to configure the vae model, `config.json` file from text_encoder folder to configure text_encoder and so on.
|
||||
|
||||
Note that most of the time you do not need to explicitly a `config` argument, `from_single_file` will automatically map the checkpoint to a repo id (we will discuss this in more details in next section). However, this can be useful in cases where model components might have been changed from what was originally distributed or in cases where a checkpoint file might not have the necessary metadata to correctly determine the configuration to use for the pipeline.
|
||||
|
||||
<Tip>
|
||||
|
||||
To learn more about how to load single file weights, see the [Load different Stable Diffusion formats](../../using-diffusers/other-formats) loading guide.
|
||||
|
||||
</Tip>
|
||||
|
||||
## Working with local files
|
||||
|
||||
As of `diffusers>=0.28.0` the `from_single_file` method will attempt to configure a pipeline or model by first inferring the model type from the checkpoint file and then using the model type to determine the appropriate model repo configuration to use from the Hugging Face Hub. For example, any single file checkpoint based on the Stable Diffusion XL base model will use the [`stabilityai/stable-diffusion-xl-base-1.0`](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) model repo to configure the pipeline.
|
||||
|
||||
If you are working in an environment with restricted internet access, it is recommended to download the config files and checkpoints for the model to your preferred directory and pass the local paths to the `pretrained_model_link_or_path` and `config` arguments of the `from_single_file` method.
|
||||
|
||||
```python
|
||||
from huggingface_hub import hf_hub_download, snapshot_download
|
||||
|
||||
my_local_checkpoint_path = hf_hub_download(
|
||||
repo_id="segmind/SSD-1B",
|
||||
filename="SSD-1B.safetensors"
|
||||
)
|
||||
|
||||
my_local_config_path = snapshot_download(
|
||||
repo_id="segmind/SSD-1B",
|
||||
allowed_patterns=["*.json", "**/*.json", "*.txt", "**/*.txt"]
|
||||
)
|
||||
|
||||
pipe = StableDiffusionXLPipeline.from_single_file(my_local_checkpoint_path, config=my_local_config_path, local_files_only=True)
|
||||
|
||||
```
|
||||
|
||||
By default this will download the checkpoints and config files to the [Hugging Face Hub cache directory](https://huggingface.co/docs/huggingface_hub/en/guides/manage-cache). You can also specify a local directory to download the files to by passing the `local_dir` argument to the `hf_hub_download` and `snapshot_download` functions.
|
||||
|
||||
```python
|
||||
from huggingface_hub import hf_hub_download, snapshot_download
|
||||
|
||||
my_local_checkpoint_path = hf_hub_download(
|
||||
repo_id="segmind/SSD-1B",
|
||||
filename="SSD-1B.safetensors"
|
||||
local_dir="my_local_checkpoints"
|
||||
)
|
||||
|
||||
my_local_config_path = snapshot_download(
|
||||
repo_id="segmind/SSD-1B",
|
||||
allowed_patterns=["*.json", "**/*.json", "*.txt", "**/*.txt"]
|
||||
local_dir="my_local_config"
|
||||
)
|
||||
|
||||
pipe = StableDiffusionXLPipeline.from_single_file(my_local_checkpoint_path, config=my_local_config_path, local_files_only=True)
|
||||
|
||||
```
|
||||
|
||||
## Working with local files on file systems that do not support symlinking
|
||||
|
||||
By default the `from_single_file` method relies on the `huggingface_hub` caching mechanism to fetch and store checkpoints and config files for models and pipelines. If you are working with a file system that does not support symlinking, it is recommended that you first download the checkpoint file to a local directory and disable symlinking by passing the `local_dir_use_symlink=False` argument to the `hf_hub_download` and `snapshot_download` functions.
|
||||
|
||||
```python
|
||||
from huggingface_hub import hf_hub_download, snapshot_download
|
||||
|
||||
my_local_checkpoint_path = hf_hub_download(
|
||||
repo_id="segmind/SSD-1B",
|
||||
filename="SSD-1B.safetensors"
|
||||
local_dir="my_local_checkpoints",
|
||||
local_dir_use_symlinks=False
|
||||
)
|
||||
print("My local checkpoint: ", my_local_checkpoint_path)
|
||||
|
||||
my_local_config_path = snapshot_download(
|
||||
repo_id="segmind/SSD-1B",
|
||||
allowed_patterns=["*.json", "**/*.json", "*.txt", "**/*.txt"]
|
||||
local_dir_use_symlinks=False,
|
||||
)
|
||||
print("My local config: ", my_local_config_path)
|
||||
|
||||
```
|
||||
|
||||
Then pass the local paths to the `pretrained_model_link_or_path` and `config` arguments of the `from_single_file` method.
|
||||
|
||||
```python
|
||||
pipe = StableDiffusionXLPipeline.from_single_file(my_local_checkpoint_path, config=my_local_config_path, local_files_only=True)
|
||||
|
||||
```
|
||||
|
||||
<Tip>
|
||||
Disabling symlinking means that the `huggingface_hub` caching mechanism has no way to determine whether a file has already been downloaded to the local directory. This means that the `hf_hub_download` and `snapshot_download` functions will download files to the local directory each time they are executed. If you are disabling symlinking, it is recommended that you separate the model download and loading steps to avoid downloading the same file multiple times.
|
||||
|
||||
</Tip>
|
||||
|
||||
## Using the original configuration file of a model
|
||||
|
||||
If you would like to configure the parameters of the model components in the pipeline using the orignal YAML configuration file, you can pass a local path or url to the original configuration file to the `original_config` argument of the `from_single_file` method.
|
||||
|
||||
```python
|
||||
from diffusers import StableDiffusionXLPipeline
|
||||
|
||||
ckpt_path = "https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/blob/main/sd_xl_base_1.0_0.9vae.safetensors"
|
||||
repo_id = "stabilityai/stable-diffusion-xl-base-1.0"
|
||||
original_config = "https://raw.githubusercontent.com/Stability-AI/generative-models/main/configs/inference/sd_xl_base.yaml"
|
||||
|
||||
pipe = StableDiffusionXLPipeline.from_single_file(ckpt_path, original_config=original_config)
|
||||
```
|
||||
|
||||
In the example above, the `original_config` file is only used to configure the parameters of the individual model components of the pipeline. For example it will be used to configure parameters such as the `in_channels` of the `vae` model and `unet` model. It is not used to determine the type of component objects in the pipeline.
|
||||
|
||||
|
||||
<Tip>
|
||||
When using `original_config` with local_files_only=True`, Diffusers will attempt to infer the components based on the type signatures of pipeline class, rather than attempting to fetch the pipeline config from the Hugging Face Hub. This is to prevent backwards breaking changes in existing code that might not be able to connect to the internet to fetch the necessary pipeline config files.
|
||||
|
||||
This is not as reliable as providing a path to a local config repo and might lead to errors when configuring the pipeline. To avoid this, please run the pipeline with `local_files_only=False` once to download the appropriate pipeline config files to the local cache.
|
||||
</Tip>
|
||||
|
||||
|
||||
## FromSingleFileMixin
|
||||
|
||||
[[autodoc]] loaders.single_file.FromSingleFileMixin
|
||||
|
||||
@@ -10,7 +10,7 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# ControlNet
|
||||
# ControlNetModel
|
||||
|
||||
The ControlNet model was introduced in [Adding Conditional Control to Text-to-Image Diffusion Models](https://huggingface.co/papers/2302.05543) by Lvmin Zhang, Anyi Rao, Maneesh Agrawala. It provides a greater degree of control over text-to-image generation by conditioning the model on additional inputs such as edge maps, depth maps, segmentation maps, and keypoints for pose detection.
|
||||
|
||||
|
||||
19
docs/source/en/api/models/dit_transformer2d.md
Normal file
19
docs/source/en/api/models/dit_transformer2d.md
Normal file
@@ -0,0 +1,19 @@
|
||||
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# DiTTransformer2DModel
|
||||
|
||||
A Transformer model for image-like data from [DiT](https://huggingface.co/papers/2212.09748).
|
||||
|
||||
## DiTTransformer2DModel
|
||||
|
||||
[[autodoc]] DiTTransformer2DModel
|
||||
20
docs/source/en/api/models/hunyuan_transformer2d.md
Normal file
20
docs/source/en/api/models/hunyuan_transformer2d.md
Normal file
@@ -0,0 +1,20 @@
|
||||
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# HunyuanDiT2DModel
|
||||
|
||||
A Diffusion Transformer model for 2D data from [Hunyuan-DiT](https://github.com/Tencent/HunyuanDiT).
|
||||
|
||||
## HunyuanDiT2DModel
|
||||
|
||||
[[autodoc]] HunyuanDiT2DModel
|
||||
|
||||
19
docs/source/en/api/models/pixart_transformer2d.md
Normal file
19
docs/source/en/api/models/pixart_transformer2d.md
Normal file
@@ -0,0 +1,19 @@
|
||||
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# PixArtTransformer2DModel
|
||||
|
||||
A Transformer model for image-like data from [PixArt-Alpha](https://huggingface.co/papers/2310.00426) and [PixArt-Sigma](https://huggingface.co/papers/2403.04692).
|
||||
|
||||
## PixArtTransformer2DModel
|
||||
|
||||
[[autodoc]] PixArtTransformer2DModel
|
||||
@@ -10,7 +10,7 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Prior Transformer
|
||||
# PriorTransformer
|
||||
|
||||
The Prior Transformer was originally introduced in [Hierarchical Text-Conditional Image Generation with CLIP Latents](https://huggingface.co/papers/2204.06125) by Ramesh et al. It is used to predict CLIP image embeddings from CLIP text embeddings; image embeddings are predicted through a denoising diffusion process.
|
||||
|
||||
|
||||
19
docs/source/en/api/models/sd3_transformer2d.md
Normal file
19
docs/source/en/api/models/sd3_transformer2d.md
Normal file
@@ -0,0 +1,19 @@
|
||||
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# SD3 Transformer Model
|
||||
|
||||
The Transformer model introduced in [Stable Diffusion 3](https://hf.co/papers/2403.03206). Its novelty lies in the MMDiT transformer block.
|
||||
|
||||
## SD3Transformer2DModel
|
||||
|
||||
[[autodoc]] SD3Transformer2DModel
|
||||
@@ -10,7 +10,7 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Transformer2D
|
||||
# Transformer2DModel
|
||||
|
||||
A Transformer model for image-like data from [CompVis](https://huggingface.co/CompVis) that is based on the [Vision Transformer](https://huggingface.co/papers/2010.11929) introduced by Dosovitskiy et al. The [`Transformer2DModel`] accepts discrete (classes of vector embeddings) or continuous (actual embeddings) inputs.
|
||||
|
||||
|
||||
@@ -10,7 +10,7 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Transformer Temporal
|
||||
# TransformerTemporalModel
|
||||
|
||||
A Transformer model for video-like data.
|
||||
|
||||
|
||||
@@ -24,4 +24,4 @@ The abstract from the paper is:
|
||||
|
||||
## VQEncoderOutput
|
||||
|
||||
[[autodoc]] models.vq_model.VQEncoderOutput
|
||||
[[autodoc]] models.autoencoders.vq_model.VQEncoderOutput
|
||||
|
||||
@@ -16,7 +16,7 @@ aMUSEd was introduced in [aMUSEd: An Open MUSE Reproduction](https://huggingface
|
||||
|
||||
Amused is a lightweight text to image model based off of the [MUSE](https://arxiv.org/abs/2301.00704) architecture. Amused is particularly useful in applications that require a lightweight and fast model such as generating many images quickly at once.
|
||||
|
||||
Amused is a vqvae token based transformer that can generate an image in fewer forward passes than many diffusion models. In contrast with muse, it uses the smaller text encoder CLIP-L/14 instead of t5-xxl. Due to its small parameter count and few forward pass generation process, amused can generate many images quickly. This benefit is seen particularly at larger batch sizes.
|
||||
Amused is a vqvae token based transformer that can generate an image in fewer forward passes than many diffusion models. In contrast with muse, it uses the smaller text encoder CLIP-L/14 instead of t5-xxl. Due to its small parameter count and few forward pass generation process, amused can generate many images quickly. This benefit is seen particularly at larger batch sizes.
|
||||
|
||||
The abstract from the paper is:
|
||||
|
||||
|
||||
@@ -165,7 +165,7 @@ from PIL import Image
|
||||
adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2", torch_dtype=torch.float16)
|
||||
# load SD 1.5 based finetuned model
|
||||
model_id = "SG161222/Realistic_Vision_V5.1_noVAE"
|
||||
pipe = AnimateDiffVideoToVideoPipeline.from_pretrained(model_id, motion_adapter=adapter, torch_dtype=torch.float16).to("cuda")
|
||||
pipe = AnimateDiffVideoToVideoPipeline.from_pretrained(model_id, motion_adapter=adapter, torch_dtype=torch.float16)
|
||||
scheduler = DDIMScheduler.from_pretrained(
|
||||
model_id,
|
||||
subfolder="scheduler",
|
||||
|
||||
95
docs/source/en/api/pipelines/hunyuandit.md
Normal file
95
docs/source/en/api/pipelines/hunyuandit.md
Normal file
@@ -0,0 +1,95 @@
|
||||
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Hunyuan-DiT
|
||||

|
||||
|
||||
[Hunyuan-DiT : A Powerful Multi-Resolution Diffusion Transformer with Fine-Grained Chinese Understanding](https://arxiv.org/abs/2405.08748) from Tencent Hunyuan.
|
||||
|
||||
The abstract from the paper is:
|
||||
|
||||
*We present Hunyuan-DiT, a text-to-image diffusion transformer with fine-grained understanding of both English and Chinese. To construct Hunyuan-DiT, we carefully design the transformer structure, text encoder, and positional encoding. We also build from scratch a whole data pipeline to update and evaluate data for iterative model optimization. For fine-grained language understanding, we train a Multimodal Large Language Model to refine the captions of the images. Finally, Hunyuan-DiT can perform multi-turn multimodal dialogue with users, generating and refining images according to the context. Through our holistic human evaluation protocol with more than 50 professional human evaluators, Hunyuan-DiT sets a new state-of-the-art in Chinese-to-image generation compared with other open-source models.*
|
||||
|
||||
|
||||
You can find the original codebase at [Tencent/HunyuanDiT](https://github.com/Tencent/HunyuanDiT) and all the available checkpoints at [Tencent-Hunyuan](https://huggingface.co/Tencent-Hunyuan/HunyuanDiT).
|
||||
|
||||
**Highlights**: HunyuanDiT supports Chinese/English-to-image, multi-resolution generation.
|
||||
|
||||
HunyuanDiT has the following components:
|
||||
* It uses a diffusion transformer as the backbone
|
||||
* It combines two text encoders, a bilingual CLIP and a multilingual T5 encoder
|
||||
|
||||
<Tip>
|
||||
|
||||
Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers.md) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading.md#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines.
|
||||
|
||||
</Tip>
|
||||
|
||||
## Optimization
|
||||
|
||||
You can optimize the pipeline's runtime and memory consumption with torch.compile and feed-forward chunking. To learn about other optimization methods, check out the [Speed up inference](../../optimization/fp16) and [Reduce memory usage](../../optimization/memory) guides.
|
||||
|
||||
### Inference
|
||||
|
||||
Use [`torch.compile`](https://huggingface.co/docs/diffusers/main/en/tutorials/fast_diffusion#torchcompile) to reduce the inference latency.
|
||||
|
||||
First, load the pipeline:
|
||||
|
||||
```python
|
||||
from diffusers import HunyuanDiTPipeline
|
||||
import torch
|
||||
|
||||
pipeline = HunyuanDiTPipeline.from_pretrained(
|
||||
"Tencent-Hunyuan/HunyuanDiT-Diffusers", torch_dtype=torch.float16
|
||||
).to("cuda")
|
||||
```
|
||||
|
||||
Then change the memory layout of the pipelines `transformer` and `vae` components to `torch.channels-last`:
|
||||
|
||||
```python
|
||||
pipeline.transformer.to(memory_format=torch.channels_last)
|
||||
pipeline.vae.to(memory_format=torch.channels_last)
|
||||
```
|
||||
|
||||
Finally, compile the components and run inference:
|
||||
|
||||
```python
|
||||
pipeline.transformer = torch.compile(pipeline.transformer, mode="max-autotune", fullgraph=True)
|
||||
pipeline.vae.decode = torch.compile(pipeline.vae.decode, mode="max-autotune", fullgraph=True)
|
||||
|
||||
image = pipeline(prompt="一个宇航员在骑马").images[0]
|
||||
```
|
||||
|
||||
The [benchmark](https://gist.github.com/sayakpaul/29d3a14905cfcbf611fe71ebd22e9b23) results on a 80GB A100 machine are:
|
||||
|
||||
```bash
|
||||
With torch.compile(): Average inference time: 12.470 seconds.
|
||||
Without torch.compile(): Average inference time: 20.570 seconds.
|
||||
```
|
||||
|
||||
### Memory optimization
|
||||
|
||||
By loading the T5 text encoder in 8 bits, you can run the pipeline in just under 6 GBs of GPU VRAM. Refer to [this script](https://gist.github.com/sayakpaul/3154605f6af05b98a41081aaba5ca43e) for details.
|
||||
|
||||
Furthermore, you can use the [`~HunyuanDiT2DModel.enable_forward_chunking`] method to reduce memory usage. Feed-forward chunking runs the feed-forward layers in a transformer block in a loop instead of all at once. This gives you a trade-off between memory consumption and inference runtime.
|
||||
|
||||
```diff
|
||||
+ pipeline.transformer.enable_forward_chunking(chunk_size=1, dim=1)
|
||||
```
|
||||
|
||||
|
||||
## HunyuanDiTPipeline
|
||||
|
||||
[[autodoc]] HunyuanDiTPipeline
|
||||
- all
|
||||
- __call__
|
||||
|
||||
@@ -47,6 +47,7 @@ Sample output with I2VGenXL:
|
||||
* Unlike SVD, it additionally accepts text prompts as inputs.
|
||||
* It can generate higher resolution videos.
|
||||
* When using the [`DDIMScheduler`] (which is default for this pipeline), less than 50 steps for inference leads to bad results.
|
||||
* This implementation is 1-stage variant of I2VGenXL. The main figure in the [I2VGen-XL](https://arxiv.org/abs/2311.04145) paper shows a 2-stage variant, however, 1-stage variant works well. See [this discussion](https://github.com/huggingface/diffusers/discussions/7952) for more details.
|
||||
|
||||
## I2VGenXLPipeline
|
||||
[[autodoc]] I2VGenXLPipeline
|
||||
|
||||
@@ -11,12 +11,12 @@ specific language governing permissions and limitations under the License.
|
||||
|
||||
Kandinsky 3 is created by [Vladimir Arkhipkin](https://github.com/oriBetelgeuse),[Anastasia Maltseva](https://github.com/NastyaMittseva),[Igor Pavlov](https://github.com/boomb0om),[Andrei Filatov](https://github.com/anvilarth),[Arseniy Shakhmatov](https://github.com/cene555),[Andrey Kuznetsov](https://github.com/kuznetsoffandrey),[Denis Dimitrov](https://github.com/denndimitrov), [Zein Shaheen](https://github.com/zeinsh)
|
||||
|
||||
The description from it's Github page:
|
||||
The description from it's Github page:
|
||||
|
||||
*Kandinsky 3.0 is an open-source text-to-image diffusion model built upon the Kandinsky2-x model family. In comparison to its predecessors, enhancements have been made to the text understanding and visual quality of the model, achieved by increasing the size of the text encoder and Diffusion U-Net models, respectively.*
|
||||
|
||||
Its architecture includes 3 main components:
|
||||
1. [FLAN-UL2](https://huggingface.co/google/flan-ul2), which is an encoder decoder model based on the T5 architecture.
|
||||
1. [FLAN-UL2](https://huggingface.co/google/flan-ul2), which is an encoder decoder model based on the T5 architecture.
|
||||
2. New U-Net architecture featuring BigGAN-deep blocks doubles depth while maintaining the same number of parameters.
|
||||
3. Sber-MoVQGAN is a decoder proven to have superior results in image restoration.
|
||||
|
||||
|
||||
@@ -25,11 +25,11 @@ You can find additional information about LEDITS++ on the [project page](https:/
|
||||
</Tip>
|
||||
|
||||
<Tip warning={true}>
|
||||
Due to some backward compatability issues with the current diffusers implementation of [`~schedulers.DPMSolverMultistepScheduler`] this implementation of LEdits++ can no longer guarantee perfect inversion.
|
||||
This issue is unlikely to have any noticeable effects on applied use-cases. However, we provide an alternative implementation that guarantees perfect inversion in a dedicated [GitHub repo](https://github.com/ml-research/ledits_pp).
|
||||
Due to some backward compatability issues with the current diffusers implementation of [`~schedulers.DPMSolverMultistepScheduler`] this implementation of LEdits++ can no longer guarantee perfect inversion.
|
||||
This issue is unlikely to have any noticeable effects on applied use-cases. However, we provide an alternative implementation that guarantees perfect inversion in a dedicated [GitHub repo](https://github.com/ml-research/ledits_pp).
|
||||
</Tip>
|
||||
|
||||
We provide two distinct pipelines based on different pre-trained models.
|
||||
We provide two distinct pipelines based on different pre-trained models.
|
||||
|
||||
## LEditsPPPipelineStableDiffusion
|
||||
[[autodoc]] pipelines.ledits_pp.LEditsPPPipelineStableDiffusion
|
||||
|
||||
76
docs/source/en/api/pipelines/marigold.md
Normal file
76
docs/source/en/api/pipelines/marigold.md
Normal file
@@ -0,0 +1,76 @@
|
||||
<!--Copyright 2024 Marigold authors and The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Marigold Pipelines for Computer Vision Tasks
|
||||
|
||||

|
||||
|
||||
Marigold was proposed in [Repurposing Diffusion-Based Image Generators for Monocular Depth Estimation](https://huggingface.co/papers/2312.02145), a CVPR 2024 Oral paper by [Bingxin Ke](http://www.kebingxin.com/), [Anton Obukhov](https://www.obukhov.ai/), [Shengyu Huang](https://shengyuh.github.io/), [Nando Metzger](https://nandometzger.github.io/), [Rodrigo Caye Daudt](https://rcdaudt.github.io/), and [Konrad Schindler](https://scholar.google.com/citations?user=FZuNgqIAAAAJ&hl=en).
|
||||
The idea is to repurpose the rich generative prior of Text-to-Image Latent Diffusion Models (LDMs) for traditional computer vision tasks.
|
||||
Initially, this idea was explored to fine-tune Stable Diffusion for Monocular Depth Estimation, as shown in the teaser above.
|
||||
Later,
|
||||
- [Tianfu Wang](https://tianfwang.github.io/) trained the first Latent Consistency Model (LCM) of Marigold, which unlocked fast single-step inference;
|
||||
- [Kevin Qu](https://www.linkedin.com/in/kevin-qu-b3417621b/?locale=en_US) extended the approach to Surface Normals Estimation;
|
||||
- [Anton Obukhov](https://www.obukhov.ai/) contributed the pipelines and documentation into diffusers (enabled and supported by [YiYi Xu](https://yiyixuxu.github.io/) and [Sayak Paul](https://sayak.dev/)).
|
||||
|
||||
The abstract from the paper is:
|
||||
|
||||
*Monocular depth estimation is a fundamental computer vision task. Recovering 3D depth from a single image is geometrically ill-posed and requires scene understanding, so it is not surprising that the rise of deep learning has led to a breakthrough. The impressive progress of monocular depth estimators has mirrored the growth in model capacity, from relatively modest CNNs to large Transformer architectures. Still, monocular depth estimators tend to struggle when presented with images with unfamiliar content and layout, since their knowledge of the visual world is restricted by the data seen during training, and challenged by zero-shot generalization to new domains. This motivates us to explore whether the extensive priors captured in recent generative diffusion models can enable better, more generalizable depth estimation. We introduce Marigold, a method for affine-invariant monocular depth estimation that is derived from Stable Diffusion and retains its rich prior knowledge. The estimator can be fine-tuned in a couple of days on a single GPU using only synthetic training data. It delivers state-of-the-art performance across a wide range of datasets, including over 20% performance gains in specific cases. Project page: https://marigoldmonodepth.github.io.*
|
||||
|
||||
## Available Pipelines
|
||||
|
||||
Each pipeline supports one Computer Vision task, which takes an input RGB image as input and produces a *prediction* of the modality of interest, such as a depth map of the input image.
|
||||
Currently, the following tasks are implemented:
|
||||
|
||||
| Pipeline | Predicted Modalities | Demos |
|
||||
|---------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------|:--------------------------------------------------------------------------------------------------------------------------------------------------:|
|
||||
| [MarigoldDepthPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/marigold/pipeline_marigold_depth.py) | [Depth](https://en.wikipedia.org/wiki/Depth_map), [Disparity](https://en.wikipedia.org/wiki/Binocular_disparity) | [Fast Demo (LCM)](https://huggingface.co/spaces/prs-eth/marigold-lcm), [Slow Original Demo (DDIM)](https://huggingface.co/spaces/prs-eth/marigold) |
|
||||
| [MarigoldNormalsPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/marigold/pipeline_marigold_normals.py) | [Surface normals](https://en.wikipedia.org/wiki/Normal_mapping) | [Fast Demo (LCM)](https://huggingface.co/spaces/prs-eth/marigold-normals-lcm) |
|
||||
|
||||
|
||||
## Available Checkpoints
|
||||
|
||||
The original checkpoints can be found under the [PRS-ETH](https://huggingface.co/prs-eth/) Hugging Face organization.
|
||||
|
||||
<Tip>
|
||||
|
||||
Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-components-across-pipelines) section to learn how to efficiently load the same components into multiple pipelines. Also, to know more about reducing the memory usage of this pipeline, refer to the ["Reduce memory usage"] section [here](../../using-diffusers/svd#reduce-memory-usage).
|
||||
|
||||
</Tip>
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
Marigold pipelines were designed and tested only with `DDIMScheduler` and `LCMScheduler`.
|
||||
Depending on the scheduler, the number of inference steps required to get reliable predictions varies, and there is no universal value that works best across schedulers.
|
||||
Because of that, the default value of `num_inference_steps` in the `__call__` method of the pipeline is set to `None` (see the API reference).
|
||||
Unless set explicitly, its value will be taken from the checkpoint configuration `model_index.json`.
|
||||
This is done to ensure high-quality predictions when calling the pipeline with just the `image` argument.
|
||||
|
||||
</Tip>
|
||||
|
||||
See also Marigold [usage examples](marigold_usage).
|
||||
|
||||
## MarigoldDepthPipeline
|
||||
[[autodoc]] MarigoldDepthPipeline
|
||||
- all
|
||||
- __call__
|
||||
|
||||
## MarigoldNormalsPipeline
|
||||
[[autodoc]] MarigoldNormalsPipeline
|
||||
- all
|
||||
- __call__
|
||||
|
||||
## MarigoldDepthOutput
|
||||
[[autodoc]] pipelines.marigold.pipeline_marigold_depth.MarigoldDepthOutput
|
||||
|
||||
## MarigoldNormalsOutput
|
||||
[[autodoc]] pipelines.marigold.pipeline_marigold_normals.MarigoldNormalsOutput
|
||||
@@ -31,13 +31,13 @@ Some notes about this pipeline:
|
||||
|
||||
<Tip>
|
||||
|
||||
Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-components-across-pipelines) section to learn how to efficiently load the same components into multiple pipelines.
|
||||
Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers.md) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading.md#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines.
|
||||
|
||||
</Tip>
|
||||
|
||||
## Inference with under 8GB GPU VRAM
|
||||
|
||||
Run the [`PixArtAlphaPipeline`] with under 8GB GPU VRAM by loading the text encoder in 8-bit precision. Let's walk through a full-fledged example.
|
||||
Run the [`PixArtAlphaPipeline`] with under 8GB GPU VRAM by loading the text encoder in 8-bit precision. Let's walk through a full-fledged example.
|
||||
|
||||
First, install the [bitsandbytes](https://github.com/TimDettmers/bitsandbytes) library:
|
||||
|
||||
@@ -75,10 +75,10 @@ with torch.no_grad():
|
||||
prompt_embeds, prompt_attention_mask, negative_embeds, negative_prompt_attention_mask = pipe.encode_prompt(prompt)
|
||||
```
|
||||
|
||||
Since text embeddings have been computed, remove the `text_encoder` and `pipe` from the memory, and free up som GPU VRAM:
|
||||
Since text embeddings have been computed, remove the `text_encoder` and `pipe` from the memory, and free up some GPU VRAM:
|
||||
|
||||
```python
|
||||
import gc
|
||||
import gc
|
||||
|
||||
def flush():
|
||||
gc.collect()
|
||||
@@ -99,7 +99,7 @@ pipe = PixArtAlphaPipeline.from_pretrained(
|
||||
).to("cuda")
|
||||
|
||||
latents = pipe(
|
||||
negative_prompt=None,
|
||||
negative_prompt=None,
|
||||
prompt_embeds=prompt_embeds,
|
||||
negative_prompt_embeds=negative_embeds,
|
||||
prompt_attention_mask=prompt_attention_mask,
|
||||
@@ -146,4 +146,3 @@ While loading the `text_encoder`, you set `load_in_8bit` to `True`. You could al
|
||||
[[autodoc]] PixArtAlphaPipeline
|
||||
- all
|
||||
- __call__
|
||||
|
||||
149
docs/source/en/api/pipelines/pixart_sigma.md
Normal file
149
docs/source/en/api/pipelines/pixart_sigma.md
Normal file
@@ -0,0 +1,149 @@
|
||||
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# PixArt-Σ
|
||||
|
||||

|
||||
|
||||
[PixArt-Σ: Weak-to-Strong Training of Diffusion Transformer for 4K Text-to-Image Generation](https://huggingface.co/papers/2403.04692) is Junsong Chen, Jincheng Yu, Chongjian Ge, Lewei Yao, Enze Xie, Yue Wu, Zhongdao Wang, James Kwok, Ping Luo, Huchuan Lu, and Zhenguo Li.
|
||||
|
||||
The abstract from the paper is:
|
||||
|
||||
*In this paper, we introduce PixArt-Σ, a Diffusion Transformer model (DiT) capable of directly generating images at 4K resolution. PixArt-Σ represents a significant advancement over its predecessor, PixArt-α, offering images of markedly higher fidelity and improved alignment with text prompts. A key feature of PixArt-Σ is its training efficiency. Leveraging the foundational pre-training of PixArt-α, it evolves from the ‘weaker’ baseline to a ‘stronger’ model via incorporating higher quality data, a process we term “weak-to-strong training”. The advancements in PixArt-Σ are twofold: (1) High-Quality Training Data: PixArt-Σ incorporates superior-quality image data, paired with more precise and detailed image captions. (2) Efficient Token Compression: we propose a novel attention module within the DiT framework that compresses both keys and values, significantly improving efficiency and facilitating ultra-high-resolution image generation. Thanks to these improvements, PixArt-Σ achieves superior image quality and user prompt adherence capabilities with significantly smaller model size (0.6B parameters) than existing text-to-image diffusion models, such as SDXL (2.6B parameters) and SD Cascade (5.1B parameters). Moreover, PixArt-Σ’s capability to generate 4K images supports the creation of high-resolution posters and wallpapers, efficiently bolstering the production of highquality visual content in industries such as film and gaming.*
|
||||
|
||||
You can find the original codebase at [PixArt-alpha/PixArt-sigma](https://github.com/PixArt-alpha/PixArt-sigma) and all the available checkpoints at [PixArt-alpha](https://huggingface.co/PixArt-alpha).
|
||||
|
||||
Some notes about this pipeline:
|
||||
|
||||
* It uses a Transformer backbone (instead of a UNet) for denoising. As such it has a similar architecture as [DiT](https://hf.co/docs/transformers/model_doc/dit).
|
||||
* It was trained using text conditions computed from T5. This aspect makes the pipeline better at following complex text prompts with intricate details.
|
||||
* It is good at producing high-resolution images at different aspect ratios. To get the best results, the authors recommend some size brackets which can be found [here](https://github.com/PixArt-alpha/PixArt-sigma/blob/master/diffusion/data/datasets/utils.py).
|
||||
* It rivals the quality of state-of-the-art text-to-image generation systems (as of this writing) such as PixArt-α, Stable Diffusion XL, Playground V2.0 and DALL-E 3, while being more efficient than them.
|
||||
* It shows the ability of generating super high resolution images, such as 2048px or even 4K.
|
||||
* It shows that text-to-image models can grow from a weak model to a stronger one through several improvements (VAEs, datasets, and so on.)
|
||||
|
||||
<Tip>
|
||||
|
||||
Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines.
|
||||
|
||||
</Tip>
|
||||
|
||||
## Inference with under 8GB GPU VRAM
|
||||
|
||||
Run the [`PixArtSigmaPipeline`] with under 8GB GPU VRAM by loading the text encoder in 8-bit precision. Let's walk through a full-fledged example.
|
||||
|
||||
First, install the [bitsandbytes](https://github.com/TimDettmers/bitsandbytes) library:
|
||||
|
||||
```bash
|
||||
pip install -U bitsandbytes
|
||||
```
|
||||
|
||||
Then load the text encoder in 8-bit:
|
||||
|
||||
```python
|
||||
from transformers import T5EncoderModel
|
||||
from diffusers import PixArtSigmaPipeline
|
||||
import torch
|
||||
|
||||
text_encoder = T5EncoderModel.from_pretrained(
|
||||
"PixArt-alpha/PixArt-Sigma-XL-2-1024-MS",
|
||||
subfolder="text_encoder",
|
||||
load_in_8bit=True,
|
||||
device_map="auto",
|
||||
)
|
||||
pipe = PixArtSigmaPipeline.from_pretrained(
|
||||
"PixArt-alpha/PixArt-Sigma-XL-2-1024-MS",
|
||||
text_encoder=text_encoder,
|
||||
transformer=None,
|
||||
device_map="balanced"
|
||||
)
|
||||
```
|
||||
|
||||
Now, use the `pipe` to encode a prompt:
|
||||
|
||||
```python
|
||||
with torch.no_grad():
|
||||
prompt = "cute cat"
|
||||
prompt_embeds, prompt_attention_mask, negative_embeds, negative_prompt_attention_mask = pipe.encode_prompt(prompt)
|
||||
```
|
||||
|
||||
Since text embeddings have been computed, remove the `text_encoder` and `pipe` from the memory, and free up some GPU VRAM:
|
||||
|
||||
```python
|
||||
import gc
|
||||
|
||||
def flush():
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
del text_encoder
|
||||
del pipe
|
||||
flush()
|
||||
```
|
||||
|
||||
Then compute the latents with the prompt embeddings as inputs:
|
||||
|
||||
```python
|
||||
pipe = PixArtSigmaPipeline.from_pretrained(
|
||||
"PixArt-alpha/PixArt-Sigma-XL-2-1024-MS",
|
||||
text_encoder=None,
|
||||
torch_dtype=torch.float16,
|
||||
).to("cuda")
|
||||
|
||||
latents = pipe(
|
||||
negative_prompt=None,
|
||||
prompt_embeds=prompt_embeds,
|
||||
negative_prompt_embeds=negative_embeds,
|
||||
prompt_attention_mask=prompt_attention_mask,
|
||||
negative_prompt_attention_mask=negative_prompt_attention_mask,
|
||||
num_images_per_prompt=1,
|
||||
output_type="latent",
|
||||
).images
|
||||
|
||||
del pipe.transformer
|
||||
flush()
|
||||
```
|
||||
|
||||
<Tip>
|
||||
|
||||
Notice that while initializing `pipe`, you're setting `text_encoder` to `None` so that it's not loaded.
|
||||
|
||||
</Tip>
|
||||
|
||||
Once the latents are computed, pass it off to the VAE to decode into a real image:
|
||||
|
||||
```python
|
||||
with torch.no_grad():
|
||||
image = pipe.vae.decode(latents / pipe.vae.config.scaling_factor, return_dict=False)[0]
|
||||
image = pipe.image_processor.postprocess(image, output_type="pil")[0]
|
||||
image.save("cat.png")
|
||||
```
|
||||
|
||||
By deleting components you aren't using and flushing the GPU VRAM, you should be able to run [`PixArtSigmaPipeline`] with under 8GB GPU VRAM.
|
||||
|
||||

|
||||
|
||||
If you want a report of your memory-usage, run this [script](https://gist.github.com/sayakpaul/3ae0f847001d342af27018a96f467e4e).
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
Text embeddings computed in 8-bit can impact the quality of the generated images because of the information loss in the representation space caused by the reduced precision. It's recommended to compare the outputs with and without 8-bit.
|
||||
|
||||
</Tip>
|
||||
|
||||
While loading the `text_encoder`, you set `load_in_8bit` to `True`. You could also specify `load_in_4bit` to bring your memory requirements down even further to under 7GB.
|
||||
|
||||
## PixArtSigmaPipeline
|
||||
|
||||
[[autodoc]] PixArtSigmaPipeline
|
||||
- all
|
||||
- __call__
|
||||
@@ -177,7 +177,7 @@ inpaint = StableDiffusionInpaintPipeline(**text2img.components)
|
||||
|
||||
The Stable Diffusion pipelines are automatically supported in [Gradio](https://github.com/gradio-app/gradio/), a library that makes creating beautiful and user-friendly machine learning apps on the web a breeze. First, make sure you have Gradio installed:
|
||||
|
||||
```
|
||||
```sh
|
||||
pip install -U gradio
|
||||
```
|
||||
|
||||
@@ -209,4 +209,4 @@ gr.Interface.from_pipeline(pipe).launch()
|
||||
```
|
||||
|
||||
By default, the web demo runs on a local server. If you'd like to share it with others, you can generate a temporary public
|
||||
link by setting `share=True` in `launch()`. Or, you can host your demo on [Hugging Face Spaces](https://huggingface.co/spaces)https://huggingface.co/spaces for a permanent link.
|
||||
link by setting `share=True` in `launch()`. Or, you can host your demo on [Hugging Face Spaces](https://huggingface.co/spaces)https://huggingface.co/spaces for a permanent link.
|
||||
@@ -0,0 +1,230 @@
|
||||
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Stable Diffusion 3
|
||||
|
||||
Stable Diffusion 3 (SD3) was proposed in [Scaling Rectified Flow Transformers for High-Resolution Image Synthesis](https://arxiv.org/pdf/2403.03206.pdf) by Patrick Esser, Sumith Kulal, Andreas Blattmann, Rahim Entezari, Jonas Muller, Harry Saini, Yam Levi, Dominik Lorenz, Axel Sauer, Frederic Boesel, Dustin Podell, Tim Dockhorn, Zion English, Kyle Lacey, Alex Goodwin, Yannik Marek, and Robin Rombach.
|
||||
|
||||
The abstract from the paper is:
|
||||
|
||||
*Diffusion models create data from noise by inverting the forward paths of data towards noise and have emerged as a powerful generative modeling technique for high-dimensional, perceptual data such as images and videos. Rectified flow is a recent generative model formulation that connects data and noise in a straight line. Despite its better theoretical properties and conceptual simplicity, it is not yet decisively established as standard practice. In this work, we improve existing noise sampling techniques for training rectified flow models by biasing them towards perceptually relevant scales. Through a large-scale study, we demonstrate the superior performance of this approach compared to established diffusion formulations for high-resolution text-to-image synthesis. Additionally, we present a novel transformer-based architecture for text-to-image generation that uses separate weights for the two modalities and enables a bidirectional flow of information between image and text tokens, improving text comprehension typography, and human preference ratings. We demonstrate that this architecture follows predictable scaling trends and correlates lower validation loss to improved text-to-image synthesis as measured by various metrics and human evaluations.*
|
||||
|
||||
|
||||
## Usage Example
|
||||
|
||||
_As the model is gated, before using it with diffusers you first need to go to the [Stable Diffusion 3 Medium Hugging Face page](https://huggingface.co/stabilityai/stable-diffusion-3-medium-diffusers), fill in the form and accept the gate. Once you are in, you need to login so that your system knows you’ve accepted the gate._
|
||||
|
||||
Use the command below to log in:
|
||||
|
||||
```bash
|
||||
huggingface-cli login
|
||||
```
|
||||
|
||||
<Tip>
|
||||
|
||||
The SD3 pipeline uses three text encoders to generate an image. Model offloading is necessary in order for it to run on most commodity hardware. Please use the `torch.float16` data type for additional memory savings.
|
||||
|
||||
</Tip>
|
||||
|
||||
|
||||
```python
|
||||
import torch
|
||||
from diffusers import StableDiffusion3Pipeline
|
||||
|
||||
pipe = StableDiffusion3Pipeline.from_pretrained("stabilityai/stable-diffusion-3-medium-diffusers", torch_dtype=torch.float16)
|
||||
pipe.to("cuda")
|
||||
|
||||
image = pipe(
|
||||
prompt="a photo of a cat holding a sign that says hello world",
|
||||
negative_prompt="",
|
||||
num_inference_steps=28,
|
||||
height=1024,
|
||||
width=1024,
|
||||
guidance_scale=7.0,
|
||||
).images[0]
|
||||
|
||||
image.save("sd3_hello_world.png")
|
||||
```
|
||||
|
||||
## Memory Optimisations for SD3
|
||||
|
||||
SD3 uses three text encoders, one if which is the very large T5-XXL model. This makes it challenging to run the model on GPUs with less than 24GB of VRAM, even when using `fp16` precision. The following section outlines a few memory optimizations in Diffusers that make it easier to run SD3 on low resource hardware.
|
||||
|
||||
### Running Inference with Model Offloading
|
||||
|
||||
The most basic memory optimization available in Diffusers allows you to offload the components of the model to CPU during inference in order to save memory, while seeing a slight increase in inference latency. Model offloading will only move a model component onto the GPU when it needs to be executed, while keeping the remaining components on the CPU.
|
||||
|
||||
```python
|
||||
import torch
|
||||
from diffusers import StableDiffusion3Pipeline
|
||||
|
||||
pipe = StableDiffusion3Pipeline.from_pretrained("stabilityai/stable-diffusion-3-medium-diffusers", torch_dtype=torch.float16)
|
||||
pipe.enable_model_cpu_offload()
|
||||
|
||||
image = pipe(
|
||||
prompt="a photo of a cat holding a sign that says hello world",
|
||||
negative_prompt="",
|
||||
num_inference_steps=28,
|
||||
height=1024,
|
||||
width=1024,
|
||||
guidance_scale=7.0,
|
||||
).images[0]
|
||||
|
||||
image.save("sd3_hello_world.png")
|
||||
```
|
||||
|
||||
### Dropping the T5 Text Encoder during Inference
|
||||
|
||||
Removing the memory-intensive 4.7B parameter T5-XXL text encoder during inference can significantly decrease the memory requirements for SD3 with only a slight loss in performance.
|
||||
|
||||
```python
|
||||
import torch
|
||||
from diffusers import StableDiffusion3Pipeline
|
||||
|
||||
pipe = StableDiffusion3Pipeline.from_pretrained(
|
||||
"stabilityai/stable-diffusion-3-medium-diffusers",
|
||||
text_encoder_3=None,
|
||||
tokenizer_3=None,
|
||||
torch_dtype=torch.float16
|
||||
)
|
||||
pipe.to("cuda")
|
||||
|
||||
image = pipe(
|
||||
prompt="a photo of a cat holding a sign that says hello world",
|
||||
negative_prompt="",
|
||||
num_inference_steps=28,
|
||||
height=1024,
|
||||
width=1024,
|
||||
guidance_scale=7.0,
|
||||
).images[0]
|
||||
|
||||
image.save("sd3_hello_world-no-T5.png")
|
||||
```
|
||||
|
||||
### Using a Quantized Version of the T5 Text Encoder
|
||||
|
||||
We can leverage the `bitsandbytes` library to load and quantize the T5-XXL text encoder to 8-bit precision. This allows you to keep using all three text encoders while only slightly impacting performance.
|
||||
|
||||
First install the `bitsandbytes` library.
|
||||
|
||||
```shell
|
||||
pip install bitsandbytes
|
||||
```
|
||||
|
||||
Then load the T5-XXL model using the `BitsAndBytesConfig`.
|
||||
|
||||
```python
|
||||
import torch
|
||||
from diffusers import StableDiffusion3Pipeline
|
||||
from transformers import T5EncoderModel, BitsAndBytesConfig
|
||||
|
||||
quantization_config = BitsAndBytesConfig(load_in_8bit=True)
|
||||
|
||||
model_id = "stabilityai/stable-diffusion-3-medium-diffusers"
|
||||
text_encoder = T5EncoderModel.from_pretrained(
|
||||
model_id,
|
||||
subfolder="text_encoder_3",
|
||||
quantization_config=quantization_config,
|
||||
)
|
||||
pipe = StableDiffusion3Pipeline.from_pretrained(
|
||||
model_id,
|
||||
text_encoder_3=text_encoder,
|
||||
device_map="balanced",
|
||||
torch_dtype=torch.float16
|
||||
)
|
||||
|
||||
image = pipe(
|
||||
prompt="a photo of a cat holding a sign that says hello world",
|
||||
negative_prompt="",
|
||||
num_inference_steps=28,
|
||||
height=1024,
|
||||
width=1024,
|
||||
guidance_scale=7.0,
|
||||
).images[0]
|
||||
|
||||
image.save("sd3_hello_world-8bit-T5.png")
|
||||
```
|
||||
|
||||
You can find the end-to-end script [here](https://gist.github.com/sayakpaul/82acb5976509851f2db1a83456e504f1).
|
||||
|
||||
## Performance Optimizations for SD3
|
||||
|
||||
### Using Torch Compile to Speed Up Inference
|
||||
|
||||
Using compiled components in the SD3 pipeline can speed up inference by as much as 4X. The following code snippet demonstrates how to compile the Transformer and VAE components of the SD3 pipeline.
|
||||
|
||||
```python
|
||||
import torch
|
||||
from diffusers import StableDiffusion3Pipeline
|
||||
|
||||
torch.set_float32_matmul_precision("high")
|
||||
|
||||
torch._inductor.config.conv_1x1_as_mm = True
|
||||
torch._inductor.config.coordinate_descent_tuning = True
|
||||
torch._inductor.config.epilogue_fusion = False
|
||||
torch._inductor.config.coordinate_descent_check_all_directions = True
|
||||
|
||||
pipe = StableDiffusion3Pipeline.from_pretrained(
|
||||
"stabilityai/stable-diffusion-3-medium-diffusers",
|
||||
torch_dtype=torch.float16
|
||||
).to("cuda")
|
||||
pipe.set_progress_bar_config(disable=True)
|
||||
|
||||
pipe.transformer.to(memory_format=torch.channels_last)
|
||||
pipe.vae.to(memory_format=torch.channels_last)
|
||||
|
||||
pipe.transformer = torch.compile(pipe.transformer, mode="max-autotune", fullgraph=True)
|
||||
pipe.vae.decode = torch.compile(pipe.vae.decode, mode="max-autotune", fullgraph=True)
|
||||
|
||||
# Warm Up
|
||||
prompt = "a photo of a cat holding a sign that says hello world"
|
||||
for _ in range(3):
|
||||
_ = pipe(prompt=prompt, generator=torch.manual_seed(1))
|
||||
|
||||
# Run Inference
|
||||
image = pipe(prompt=prompt, generator=torch.manual_seed(1)).images[0]
|
||||
image.save("sd3_hello_world.png")
|
||||
```
|
||||
|
||||
Check out the full script [here](https://gist.github.com/sayakpaul/508d89d7aad4f454900813da5d42ca97).
|
||||
|
||||
## Loading the original checkpoints via `from_single_file`
|
||||
|
||||
The `SD3Transformer2DModel` and `StableDiffusion3Pipeline` classes support loading the original checkpoints via the `from_single_file` method. This method allows you to load the original checkpoint files that were used to train the models.
|
||||
|
||||
## Loading the original checkpoints for the `SD3Transformer2DModel`
|
||||
|
||||
```python
|
||||
from diffusers import SD3Transformer2DModel
|
||||
|
||||
model = SD3Transformer2DModel.from_single_file("https://huggingface.co/stabilityai/stable-diffusion-3-medium/blob/main/sd3_medium.safetensors")
|
||||
```
|
||||
|
||||
## Loading the single checkpoint for the `StableDiffusion3Pipeline`
|
||||
|
||||
```python
|
||||
from diffusers import StableDiffusion3Pipeline
|
||||
from transformers import T5EncoderModel
|
||||
|
||||
text_encoder_3 = T5EncoderModel.from_pretrained("stabilityai/stable-diffusion-3-medium-diffusers", subfolder="text_encoder_3", torch_dtype=torch.float16)
|
||||
pipe = StableDiffusion3Pipeline.from_single_file("https://huggingface.co/stabilityai/stable-diffusion-3-medium/blob/main/sd3_medium_incl_clips.safetensors", torch_dtype=torch.float16, text_encoder_3=text_encoder_3)
|
||||
```
|
||||
|
||||
<Tip>
|
||||
`from_single_file` support for the `fp8` version of the checkpoints is coming soon. Watch this space.
|
||||
</Tip>
|
||||
|
||||
## StableDiffusion3Pipeline
|
||||
|
||||
[[autodoc]] StableDiffusion3Pipeline
|
||||
- all
|
||||
- __call__
|
||||
@@ -12,7 +12,7 @@ specific language governing permissions and limitations under the License.
|
||||
|
||||
# EDMDPMSolverMultistepScheduler
|
||||
|
||||
`EDMDPMSolverMultistepScheduler` is a [Karras formulation](https://huggingface.co/papers/2206.00364) of `DPMSolverMultistep`, a multistep scheduler from [DPM-Solver: A Fast ODE Solver for Diffusion Probabilistic Model Sampling in Around 10 Steps](https://huggingface.co/papers/2206.00927) and [DPM-Solver++: Fast Solver for Guided Sampling of Diffusion Probabilistic Models](https://huggingface.co/papers/2211.01095) by Cheng Lu, Yuhao Zhou, Fan Bao, Jianfei Chen, Chongxuan Li, and Jun Zhu.
|
||||
`EDMDPMSolverMultistepScheduler` is a [Karras formulation](https://huggingface.co/papers/2206.00364) of `DPMSolverMultistepScheduler`, a multistep scheduler from [DPM-Solver: A Fast ODE Solver for Diffusion Probabilistic Model Sampling in Around 10 Steps](https://huggingface.co/papers/2206.00927) and [DPM-Solver++: Fast Solver for Guided Sampling of Diffusion Probabilistic Models](https://huggingface.co/papers/2211.01095) by Cheng Lu, Yuhao Zhou, Fan Bao, Jianfei Chen, Chongxuan Li, and Jun Zhu.
|
||||
|
||||
DPMSolver (and the improved version DPMSolver++) is a fast dedicated high-order solver for diffusion ODEs with convergence order guarantee. Empirically, DPMSolver sampling with only 20 steps can generate high-quality
|
||||
samples, and it can generate quite good samples even in 10 steps.
|
||||
|
||||
18
docs/source/en/api/schedulers/flow_match_euler_discrete.md
Normal file
18
docs/source/en/api/schedulers/flow_match_euler_discrete.md
Normal file
@@ -0,0 +1,18 @@
|
||||
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# FlowMatchEulerDiscreteScheduler
|
||||
|
||||
`FlowMatchEulerDiscreteScheduler` is based on the flow-matching sampling introduced in [Stable Diffusion 3](https://arxiv.org/abs/2403.03206).
|
||||
|
||||
## FlowMatchEulerDiscreteScheduler
|
||||
[[autodoc]] FlowMatchEulerDiscreteScheduler
|
||||
@@ -12,7 +12,7 @@ specific language governing permissions and limitations under the License.
|
||||
|
||||
# DPMSolverMultistepScheduler
|
||||
|
||||
`DPMSolverMultistep` is a multistep scheduler from [DPM-Solver: A Fast ODE Solver for Diffusion Probabilistic Model Sampling in Around 10 Steps](https://huggingface.co/papers/2206.00927) and [DPM-Solver++: Fast Solver for Guided Sampling of Diffusion Probabilistic Models](https://huggingface.co/papers/2211.01095) by Cheng Lu, Yuhao Zhou, Fan Bao, Jianfei Chen, Chongxuan Li, and Jun Zhu.
|
||||
`DPMSolverMultistepScheduler` is a multistep scheduler from [DPM-Solver: A Fast ODE Solver for Diffusion Probabilistic Model Sampling in Around 10 Steps](https://huggingface.co/papers/2206.00927) and [DPM-Solver++: Fast Solver for Guided Sampling of Diffusion Probabilistic Models](https://huggingface.co/papers/2211.01095) by Cheng Lu, Yuhao Zhou, Fan Bao, Jianfei Chen, Chongxuan Li, and Jun Zhu.
|
||||
|
||||
DPMSolver (and the improved version DPMSolver++) is a fast dedicated high-order solver for diffusion ODEs with convergence order guarantee. Empirically, DPMSolver sampling with only 20 steps can generate high-quality
|
||||
samples, and it can generate quite good samples even in 10 steps.
|
||||
|
||||
@@ -12,4 +12,10 @@ specific language governing permissions and limitations under the License.
|
||||
|
||||
# Video Processor
|
||||
|
||||
The `VideoProcessor` provides a unified API for video pipelines to prepare inputs for VAE encoding and post-processing outputs once they're decoded. The class inherits [`VaeImageProcessor`] so it includes transformations such as resizing, normalization, and conversion between PIL Image, PyTorch, and NumPy arrays.
|
||||
The [`VideoProcessor`] provides a unified API for video pipelines to prepare inputs for VAE encoding and post-processing outputs once they're decoded. The class inherits [`VaeImageProcessor`] so it includes transformations such as resizing, normalization, and conversion between PIL Image, PyTorch, and NumPy arrays.
|
||||
|
||||
## VideoProcessor
|
||||
|
||||
[[autodoc]] video_processor.VideoProcessor.preprocess_video
|
||||
|
||||
[[autodoc]] video_processor.VideoProcessor.postprocess_video
|
||||
|
||||
@@ -70,7 +70,7 @@ The following design principles are followed:
|
||||
- Pipelines should be used **only** for inference.
|
||||
- Pipelines should be very readable, self-explanatory, and easy to tweak.
|
||||
- Pipelines should be designed to build on top of each other and be easy to integrate into higher-level APIs.
|
||||
- Pipelines are **not** intended to be feature-complete user interfaces. For future complete user interfaces one should rather have a look at [InvokeAI](https://github.com/invoke-ai/InvokeAI), [Diffuzers](https://github.com/abhishekkrthakur/diffuzers), and [lama-cleaner](https://github.com/Sanster/lama-cleaner).
|
||||
- Pipelines are **not** intended to be feature-complete user interfaces. For feature-complete user interfaces one should rather have a look at [InvokeAI](https://github.com/invoke-ai/InvokeAI), [Diffuzers](https://github.com/abhishekkrthakur/diffuzers), and [lama-cleaner](https://github.com/Sanster/lama-cleaner).
|
||||
- Every pipeline should have one and only one way to run it via a `__call__` method. The naming of the `__call__` arguments should be shared across all pipelines.
|
||||
- Pipelines should be named after the task they are intended to solve.
|
||||
- In almost all cases, novel diffusion pipelines shall be implemented in a new pipeline folder/file.
|
||||
|
||||
@@ -36,7 +36,7 @@ Then load and enable the [`DeepCacheSDHelper`](https://github.com/horseee/DeepCa
|
||||
image = pipe("a photo of an astronaut on a moon").images[0]
|
||||
```
|
||||
|
||||
The `set_params` method accepts two arguments: `cache_interval` and `cache_branch_id`. `cache_interval` means the frequency of feature caching, specified as the number of steps between each cache operation. `cache_branch_id` identifies which branch of the network (ordered from the shallowest to the deepest layer) is responsible for executing the caching processes.
|
||||
The `set_params` method accepts two arguments: `cache_interval` and `cache_branch_id`. `cache_interval` means the frequency of feature caching, specified as the number of steps between each cache operation. `cache_branch_id` identifies which branch of the network (ordered from the shallowest to the deepest layer) is responsible for executing the caching processes.
|
||||
Opting for a lower `cache_branch_id` or a larger `cache_interval` can lead to faster inference speed at the expense of reduced image quality (ablation experiments of these two hyperparameters can be found in the [paper](https://arxiv.org/abs/2312.00858)). Once those arguments are set, use the `enable` or `disable` methods to activate or deactivate the `DeepCacheSDHelper`.
|
||||
|
||||
<div class="flex justify-center">
|
||||
|
||||
@@ -12,7 +12,7 @@ specific language governing permissions and limitations under the License.
|
||||
|
||||
# Speed up inference
|
||||
|
||||
There are several ways to optimize Diffusers for inference speed, such as reducing the computational burden by lowering the data precision or using a lightweight distilled model. There are also memory-efficient attention implementations, [xFormers](xformers) and [scaled dot product attetntion](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html) in PyTorch 2.0, that reduce memory usage which also indirectly speeds up inference. Different speed optimizations can be stacked together to get the fastest inference times.
|
||||
There are several ways to optimize Diffusers for inference speed, such as reducing the computational burden by lowering the data precision or using a lightweight distilled model. There are also memory-efficient attention implementations, [xFormers](xformers) and [scaled dot product attention](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html) in PyTorch 2.0, that reduce memory usage which also indirectly speeds up inference. Different speed optimizations can be stacked together to get the fastest inference times.
|
||||
|
||||
> [!TIP]
|
||||
> Optimizing for inference speed or reduced memory usage can lead to improved performance in the other category, so you should try to optimize for both whenever you can. This guide focuses on inference speed, but you can learn more about lowering memory usage in the [Reduce memory usage](memory) guide.
|
||||
|
||||
@@ -6,7 +6,7 @@ Before you begin, make sure you install T-GATE.
|
||||
|
||||
```bash
|
||||
pip install tgate
|
||||
pip install -U pytorch diffusers transformers accelerate DeepCache
|
||||
pip install -U torch diffusers transformers accelerate DeepCache
|
||||
```
|
||||
|
||||
|
||||
@@ -46,12 +46,12 @@ pipe = TgatePixArtLoader(
|
||||
|
||||
image = pipe.tgate(
|
||||
"An alpaca made of colorful building blocks, cyberpunk.",
|
||||
gate_step=gate_step,
|
||||
gate_step=gate_step,
|
||||
num_inference_steps=inference_step,
|
||||
).images[0]
|
||||
```
|
||||
</hfoption>
|
||||
<hfoption id="Stable Diffusion XL">
|
||||
<hfoption id="Stable Diffusion XL">
|
||||
|
||||
Accelerate `StableDiffusionXLPipeline` with T-GATE:
|
||||
|
||||
@@ -78,9 +78,9 @@ pipe = TgateSDXLLoader(
|
||||
).to("cuda")
|
||||
|
||||
image = pipe.tgate(
|
||||
"Astronaut in a jungle, cold color palette, muted colors, detailed, 8k.",
|
||||
gate_step=gate_step,
|
||||
num_inference_steps=inference_step
|
||||
"Astronaut in a jungle, cold color palette, muted colors, detailed, 8k.",
|
||||
gate_step=gate_step,
|
||||
num_inference_steps=inference_step
|
||||
).images[0]
|
||||
```
|
||||
</hfoption>
|
||||
@@ -111,9 +111,9 @@ pipe = TgateSDXLDeepCacheLoader(
|
||||
).to("cuda")
|
||||
|
||||
image = pipe.tgate(
|
||||
"Astronaut in a jungle, cold color palette, muted colors, detailed, 8k.",
|
||||
gate_step=gate_step,
|
||||
num_inference_steps=inference_step
|
||||
"Astronaut in a jungle, cold color palette, muted colors, detailed, 8k.",
|
||||
gate_step=gate_step,
|
||||
num_inference_steps=inference_step
|
||||
).images[0]
|
||||
```
|
||||
</hfoption>
|
||||
@@ -151,9 +151,9 @@ pipe = TgateSDXLLoader(
|
||||
).to("cuda")
|
||||
|
||||
image = pipe.tgate(
|
||||
"Astronaut in a jungle, cold color palette, muted colors, detailed, 8k.",
|
||||
gate_step=gate_step,
|
||||
num_inference_steps=inference_step
|
||||
"Astronaut in a jungle, cold color palette, muted colors, detailed, 8k.",
|
||||
gate_step=gate_step,
|
||||
num_inference_steps=inference_step
|
||||
).images[0]
|
||||
```
|
||||
</hfoption>
|
||||
|
||||
@@ -440,6 +440,198 @@ Stable Diffusion XL (SDXL) is a powerful text-to-image model that generates high
|
||||
|
||||
The SDXL training script is discussed in more detail in the [SDXL training](sdxl) guide.
|
||||
|
||||
## DeepFloyd IF
|
||||
|
||||
DeepFloyd IF is a cascading pixel diffusion model with three stages. The first stage generates a base image and the second and third stages progressively upscales the base image into a high-resolution 1024x1024 image. Use the [train_dreambooth_lora.py](https://github.com/huggingface/diffusers/blob/main/examples/dreambooth/train_dreambooth_lora.py) or [train_dreambooth.py](https://github.com/huggingface/diffusers/blob/main/examples/dreambooth/train_dreambooth.py) scripts to train a DeepFloyd IF model with LoRA or the full model.
|
||||
|
||||
DeepFloyd IF uses predicted variance, but the Diffusers training scripts uses predicted error so the trained DeepFloyd IF models are switched to a fixed variance schedule. The training scripts will update the scheduler config of the fully trained model for you. However, when you load the saved LoRA weights you must also update the pipeline's scheduler config.
|
||||
|
||||
```py
|
||||
from diffusers import DiffusionPipeline
|
||||
|
||||
pipe = DiffusionPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0", use_safetensors=True)
|
||||
|
||||
pipe.load_lora_weights("<lora weights path>")
|
||||
|
||||
# Update scheduler config to fixed variance schedule
|
||||
pipe.scheduler = pipe.scheduler.__class__.from_config(pipe.scheduler.config, variance_type="fixed_small")
|
||||
```
|
||||
|
||||
The stage 2 model requires additional validation images to upscale. You can download and use a downsized version of the training images for this.
|
||||
|
||||
```py
|
||||
from huggingface_hub import snapshot_download
|
||||
|
||||
local_dir = "./dog_downsized"
|
||||
snapshot_download(
|
||||
"diffusers/dog-example-downsized",
|
||||
local_dir=local_dir,
|
||||
repo_type="dataset",
|
||||
ignore_patterns=".gitattributes",
|
||||
)
|
||||
```
|
||||
|
||||
The code samples below provide a brief overview of how to train a DeepFloyd IF model with a combination of DreamBooth and LoRA. Some important parameters to note are:
|
||||
|
||||
* `--resolution=64`, a much smaller resolution is required because DeepFloyd IF is a pixel diffusion model and to work on uncompressed pixels, the input images must be smaller
|
||||
* `--pre_compute_text_embeddings`, compute the text embeddings ahead of time to save memory because the [`~transformers.T5Model`] can take up a lot of memory
|
||||
* `--tokenizer_max_length=77`, you can use a longer default text length with T5 as the text encoder but the default model encoding procedure uses a shorter text length
|
||||
* `--text_encoder_use_attention_mask`, to pass the attention mask to the text encoder
|
||||
|
||||
<hfoptions id="IF-DreamBooth">
|
||||
<hfoption id="Stage 1 LoRA DreamBooth">
|
||||
|
||||
Training stage 1 of DeepFloyd IF with LoRA and DreamBooth requires ~28GB of memory.
|
||||
|
||||
```bash
|
||||
export MODEL_NAME="DeepFloyd/IF-I-XL-v1.0"
|
||||
export INSTANCE_DIR="dog"
|
||||
export OUTPUT_DIR="dreambooth_dog_lora"
|
||||
|
||||
accelerate launch train_dreambooth_lora.py \
|
||||
--report_to wandb \
|
||||
--pretrained_model_name_or_path=$MODEL_NAME \
|
||||
--instance_data_dir=$INSTANCE_DIR \
|
||||
--output_dir=$OUTPUT_DIR \
|
||||
--instance_prompt="a sks dog" \
|
||||
--resolution=64 \
|
||||
--train_batch_size=4 \
|
||||
--gradient_accumulation_steps=1 \
|
||||
--learning_rate=5e-6 \
|
||||
--scale_lr \
|
||||
--max_train_steps=1200 \
|
||||
--validation_prompt="a sks dog" \
|
||||
--validation_epochs=25 \
|
||||
--checkpointing_steps=100 \
|
||||
--pre_compute_text_embeddings \
|
||||
--tokenizer_max_length=77 \
|
||||
--text_encoder_use_attention_mask
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="Stage 2 LoRA DreamBooth">
|
||||
|
||||
For stage 2 of DeepFloyd IF with LoRA and DreamBooth, pay attention to these parameters:
|
||||
|
||||
* `--validation_images`, the images to upscale during validation
|
||||
* `--class_labels_conditioning=timesteps`, to additionally conditional the UNet as needed in stage 2
|
||||
* `--learning_rate=1e-6`, a lower learning rate is used compared to stage 1
|
||||
* `--resolution=256`, the expected resolution for the upscaler
|
||||
|
||||
```bash
|
||||
export MODEL_NAME="DeepFloyd/IF-II-L-v1.0"
|
||||
export INSTANCE_DIR="dog"
|
||||
export OUTPUT_DIR="dreambooth_dog_upscale"
|
||||
export VALIDATION_IMAGES="dog_downsized/image_1.png dog_downsized/image_2.png dog_downsized/image_3.png dog_downsized/image_4.png"
|
||||
|
||||
python train_dreambooth_lora.py \
|
||||
--report_to wandb \
|
||||
--pretrained_model_name_or_path=$MODEL_NAME \
|
||||
--instance_data_dir=$INSTANCE_DIR \
|
||||
--output_dir=$OUTPUT_DIR \
|
||||
--instance_prompt="a sks dog" \
|
||||
--resolution=256 \
|
||||
--train_batch_size=4 \
|
||||
--gradient_accumulation_steps=1 \
|
||||
--learning_rate=1e-6 \
|
||||
--max_train_steps=2000 \
|
||||
--validation_prompt="a sks dog" \
|
||||
--validation_epochs=100 \
|
||||
--checkpointing_steps=500 \
|
||||
--pre_compute_text_embeddings \
|
||||
--tokenizer_max_length=77 \
|
||||
--text_encoder_use_attention_mask \
|
||||
--validation_images $VALIDATION_IMAGES \
|
||||
--class_labels_conditioning=timesteps
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="Stage 1 DreamBooth">
|
||||
|
||||
For stage 1 of DeepFloyd IF with DreamBooth, pay attention to these parameters:
|
||||
|
||||
* `--skip_save_text_encoder`, to skip saving the full T5 text encoder with the finetuned model
|
||||
* `--use_8bit_adam`, to use 8-bit Adam optimizer to save memory due to the size of the optimizer state when training the full model
|
||||
* `--learning_rate=1e-7`, a really low learning rate should be used for full model training otherwise the model quality is degraded (you can use a higher learning rate with a larger batch size)
|
||||
|
||||
Training with 8-bit Adam and a batch size of 4, the full model can be trained with ~48GB of memory.
|
||||
|
||||
```bash
|
||||
export MODEL_NAME="DeepFloyd/IF-I-XL-v1.0"
|
||||
export INSTANCE_DIR="dog"
|
||||
export OUTPUT_DIR="dreambooth_if"
|
||||
|
||||
accelerate launch train_dreambooth.py \
|
||||
--pretrained_model_name_or_path=$MODEL_NAME \
|
||||
--instance_data_dir=$INSTANCE_DIR \
|
||||
--output_dir=$OUTPUT_DIR \
|
||||
--instance_prompt="a photo of sks dog" \
|
||||
--resolution=64 \
|
||||
--train_batch_size=4 \
|
||||
--gradient_accumulation_steps=1 \
|
||||
--learning_rate=1e-7 \
|
||||
--max_train_steps=150 \
|
||||
--validation_prompt "a photo of sks dog" \
|
||||
--validation_steps 25 \
|
||||
--text_encoder_use_attention_mask \
|
||||
--tokenizer_max_length 77 \
|
||||
--pre_compute_text_embeddings \
|
||||
--use_8bit_adam \
|
||||
--set_grads_to_none \
|
||||
--skip_save_text_encoder \
|
||||
--push_to_hub
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="Stage 2 DreamBooth">
|
||||
|
||||
For stage 2 of DeepFloyd IF with DreamBooth, pay attention to these parameters:
|
||||
|
||||
* `--learning_rate=5e-6`, use a lower learning rate with a smaller effective batch size
|
||||
* `--resolution=256`, the expected resolution for the upscaler
|
||||
* `--train_batch_size=2` and `--gradient_accumulation_steps=6`, to effectively train on images wiht faces requires larger batch sizes
|
||||
|
||||
```bash
|
||||
export MODEL_NAME="DeepFloyd/IF-II-L-v1.0"
|
||||
export INSTANCE_DIR="dog"
|
||||
export OUTPUT_DIR="dreambooth_dog_upscale"
|
||||
export VALIDATION_IMAGES="dog_downsized/image_1.png dog_downsized/image_2.png dog_downsized/image_3.png dog_downsized/image_4.png"
|
||||
|
||||
accelerate launch train_dreambooth.py \
|
||||
--report_to wandb \
|
||||
--pretrained_model_name_or_path=$MODEL_NAME \
|
||||
--instance_data_dir=$INSTANCE_DIR \
|
||||
--output_dir=$OUTPUT_DIR \
|
||||
--instance_prompt="a sks dog" \
|
||||
--resolution=256 \
|
||||
--train_batch_size=2 \
|
||||
--gradient_accumulation_steps=6 \
|
||||
--learning_rate=5e-6 \
|
||||
--max_train_steps=2000 \
|
||||
--validation_prompt="a sks dog" \
|
||||
--validation_steps=150 \
|
||||
--checkpointing_steps=500 \
|
||||
--pre_compute_text_embeddings \
|
||||
--tokenizer_max_length=77 \
|
||||
--text_encoder_use_attention_mask \
|
||||
--validation_images $VALIDATION_IMAGES \
|
||||
--class_labels_conditioning timesteps \
|
||||
--push_to_hub
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
### Training tips
|
||||
|
||||
Training the DeepFloyd IF model can be challenging, but here are some tips that we've found helpful:
|
||||
|
||||
- LoRA is sufficient for training the stage 1 model because the model's low resolution makes representing finer details difficult regardless.
|
||||
- For common or simple objects, you don't necessarily need to finetune the upscaler. Make sure the prompt passed to the upscaler is adjusted to remove the new token from the instance prompt. For example, if your stage 1 prompt is "a sks dog" then your stage 2 prompt should be "a dog".
|
||||
- For finer details like faces, fully training the stage 2 upscaler is better than training the stage 2 model with LoRA. It also helps to use lower learning rates with larger batch sizes.
|
||||
- Lower learning rates should be used to train the stage 2 model.
|
||||
- The [`DDPMScheduler`] works better than the DPMSolver used in the training scripts.
|
||||
|
||||
## Next steps
|
||||
|
||||
Congratulations on training your DreamBooth model! To learn more about how to use your new model, the following guide may be helpful:
|
||||
|
||||
@@ -260,7 +260,7 @@ Then, you'll need a way to evaluate the model. For evaluation, you can use the [
|
||||
... # The default pipeline output type is `List[PIL.Image]`
|
||||
... images = pipeline(
|
||||
... batch_size=config.eval_batch_size,
|
||||
... generator=torch.manual_seed(config.seed),
|
||||
... generator=torch.Generator(device='cpu').manual_seed(config.seed), # Use a separate torch generator to avoid rewinding the random state of the main training loop
|
||||
... ).images
|
||||
|
||||
... # Make a grid out of the images
|
||||
|
||||
@@ -188,7 +188,7 @@ def latents_to_rgb(latents):
|
||||
```py
|
||||
def decode_tensors(pipe, step, timestep, callback_kwargs):
|
||||
latents = callback_kwargs["latents"]
|
||||
|
||||
|
||||
image = latents_to_rgb(latents)
|
||||
image.save(f"{step}.png")
|
||||
|
||||
|
||||
@@ -12,54 +12,10 @@ specific language governing permissions and limitations under the License.
|
||||
|
||||
# Controlling image quality
|
||||
|
||||
The components of a diffusion model, like the UNet and scheduler, can be optimized to improve the quality of generated images leading to better image lighting and details. These techniques are especially useful if you don't have the resources to simply use a larger model for inference. You can enable these techniques during inference without any additional training.
|
||||
The components of a diffusion model, like the UNet and scheduler, can be optimized to improve the quality of generated images leading to better details. These techniques are especially useful if you don't have the resources to simply use a larger model for inference. You can enable these techniques during inference without any additional training.
|
||||
|
||||
This guide will show you how to turn these techniques on in your pipeline and how to configure them to improve the quality of your generated images.
|
||||
|
||||
## Lighting
|
||||
|
||||
The Stable Diffusion models aren't very good at generating images that are very bright or dark because the scheduler doesn't start sampling from the last timestep and it doesn't enforce a zero signal-to-noise ratio (SNR). The [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://hf.co/papers/2305.08891) paper fixes these issues which are now available in some Diffusers schedulers.
|
||||
|
||||
> [!TIP]
|
||||
> For inference, you need a model that has been trained with *v_prediction*. To train your own model with *v_prediction*, add the following flag to the [train_text_to_image.py](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image.py) or [train_text_to_image_lora.py](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image_lora.py) scripts.
|
||||
>
|
||||
> ```bash
|
||||
> --prediction_type="v_prediction"
|
||||
> ```
|
||||
|
||||
For example, load the [ptx0/pseudo-journey-v2](https://hf.co/ptx0/pseudo-journey-v2) checkpoint which was trained with `v_prediction` and the [`DDIMScheduler`]. Now you should configure the following parameters in the [`DDIMScheduler`].
|
||||
|
||||
* `rescale_betas_zero_snr=True` to rescale the noise schedule to zero SNR
|
||||
* `timestep_spacing="trailing"` to start sampling from the last timestep
|
||||
|
||||
Set `guidance_rescale` in the pipeline to prevent over-exposure. A lower value increases brightness but some of the details may appear washed out.
|
||||
|
||||
```py
|
||||
from diffusers import DiffusionPipeline, DDIMScheduler
|
||||
|
||||
pipeline = DiffusionPipeline.from_pretrained("ptx0/pseudo-journey-v2", use_safetensors=True)
|
||||
|
||||
pipeline.scheduler = DDIMScheduler.from_config(
|
||||
pipeline.scheduler.config, rescale_betas_zero_snr=True, timestep_spacing="trailing"
|
||||
)
|
||||
pipeline.to("cuda")
|
||||
prompt = "cinematic photo of a snowy mountain at night with the northern lights aurora borealis overhead, 35mm photograph, film, professional, 4k, highly detailed"
|
||||
generator = torch.Generator(device="cpu").manual_seed(23)
|
||||
image = pipeline(prompt, guidance_rescale=0.7, generator=generator).images[0]
|
||||
image
|
||||
```
|
||||
|
||||
<div class="flex gap-4">
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/no-zero-snr.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">default Stable Diffusion v2-1 image</figcaption>
|
||||
</div>
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/zero-snr.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">image with zero SNR and trailing timestep spacing enabled</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
## Details
|
||||
|
||||
[FreeU](https://hf.co/papers/2309.11497) improves image details by rebalancing the UNet's backbone and skip connection weights. The skip connections can cause the model to overlook some of the backbone semantics which may lead to unnatural image details in the generated image. This technique does not require any additional training and can be applied on the fly during inference for tasks like image-to-image and text-to-video.
|
||||
|
||||
@@ -78,7 +78,7 @@ image = pipe(
|
||||
prompt=prompt,
|
||||
num_inference_steps=4,
|
||||
guidance_scale=0,
|
||||
eta=0.3,
|
||||
eta=0.3,
|
||||
generator=torch.Generator(device=device).manual_seed(0),
|
||||
).images[0]
|
||||
```
|
||||
@@ -156,14 +156,14 @@ image = pipe(
|
||||
prompt=prompt,
|
||||
num_inference_steps=8,
|
||||
guidance_scale=0,
|
||||
eta=0.3,
|
||||
eta=0.3,
|
||||
generator=torch.Generator(device=device).manual_seed(0),
|
||||
).images[0]
|
||||
```
|
||||
|
||||

|
||||
|
||||
TCD-LoRA also supports other LoRAs trained on different styles. For example, let's load the [TheLastBen/Papercut_SDXL](https://huggingface.co/TheLastBen/Papercut_SDXL) LoRA and fuse it with the TCD-LoRA with the [`~loaders.UNet2DConditionLoadersMixin.set_adapters`] method.
|
||||
TCD-LoRA also supports other LoRAs trained on different styles. For example, let's load the [TheLastBen/Papercut_SDXL](https://huggingface.co/TheLastBen/Papercut_SDXL) LoRA and fuse it with the TCD-LoRA with the [`~loaders.UNet2DConditionLoadersMixin.set_adapters`] method.
|
||||
|
||||
> [!TIP]
|
||||
> Check out the [Merge LoRAs](merge_loras) guide to learn more about efficient merging methods.
|
||||
@@ -171,7 +171,7 @@ TCD-LoRA also supports other LoRAs trained on different styles. For example, let
|
||||
```python
|
||||
import torch
|
||||
from diffusers import StableDiffusionXLPipeline
|
||||
from scheduling_tcd import TCDScheduler
|
||||
from scheduling_tcd import TCDScheduler
|
||||
|
||||
device = "cuda"
|
||||
base_model_id = "stabilityai/stable-diffusion-xl-base-1.0"
|
||||
@@ -191,7 +191,7 @@ image = pipe(
|
||||
prompt=prompt,
|
||||
num_inference_steps=4,
|
||||
guidance_scale=0,
|
||||
eta=0.3,
|
||||
eta=0.3,
|
||||
generator=torch.Generator(device=device).manual_seed(0),
|
||||
).images[0]
|
||||
```
|
||||
@@ -215,7 +215,7 @@ from PIL import Image
|
||||
from transformers import DPTFeatureExtractor, DPTForDepthEstimation
|
||||
from diffusers import ControlNetModel, StableDiffusionXLControlNetPipeline
|
||||
from diffusers.utils import load_image, make_image_grid
|
||||
from scheduling_tcd import TCDScheduler
|
||||
from scheduling_tcd import TCDScheduler
|
||||
|
||||
device = "cuda"
|
||||
depth_estimator = DPTForDepthEstimation.from_pretrained("Intel/dpt-hybrid-midas").to(device)
|
||||
@@ -249,13 +249,13 @@ controlnet = ControlNetModel.from_pretrained(
|
||||
controlnet_id,
|
||||
torch_dtype=torch.float16,
|
||||
variant="fp16",
|
||||
).to(device)
|
||||
)
|
||||
pipe = StableDiffusionXLControlNetPipeline.from_pretrained(
|
||||
base_model_id,
|
||||
controlnet=controlnet,
|
||||
torch_dtype=torch.float16,
|
||||
variant="fp16",
|
||||
).to(device)
|
||||
)
|
||||
pipe.enable_model_cpu_offload()
|
||||
|
||||
pipe.scheduler = TCDScheduler.from_config(pipe.scheduler.config)
|
||||
@@ -271,9 +271,9 @@ depth_image = get_depth_map(image)
|
||||
controlnet_conditioning_scale = 0.5 # recommended for good generalization
|
||||
|
||||
image = pipe(
|
||||
prompt,
|
||||
image=depth_image,
|
||||
num_inference_steps=4,
|
||||
prompt,
|
||||
image=depth_image,
|
||||
num_inference_steps=4,
|
||||
guidance_scale=0,
|
||||
eta=0.3,
|
||||
controlnet_conditioning_scale=controlnet_conditioning_scale,
|
||||
@@ -290,7 +290,7 @@ grid_image = make_image_grid([depth_image, image], rows=1, cols=2)
|
||||
import torch
|
||||
from diffusers import ControlNetModel, StableDiffusionXLControlNetPipeline
|
||||
from diffusers.utils import load_image, make_image_grid
|
||||
from scheduling_tcd import TCDScheduler
|
||||
from scheduling_tcd import TCDScheduler
|
||||
|
||||
device = "cuda"
|
||||
base_model_id = "stabilityai/stable-diffusion-xl-base-1.0"
|
||||
@@ -301,13 +301,13 @@ controlnet = ControlNetModel.from_pretrained(
|
||||
controlnet_id,
|
||||
torch_dtype=torch.float16,
|
||||
variant="fp16",
|
||||
).to(device)
|
||||
)
|
||||
pipe = StableDiffusionXLControlNetPipeline.from_pretrained(
|
||||
base_model_id,
|
||||
controlnet=controlnet,
|
||||
torch_dtype=torch.float16,
|
||||
variant="fp16",
|
||||
).to(device)
|
||||
)
|
||||
pipe.enable_model_cpu_offload()
|
||||
|
||||
pipe.scheduler = TCDScheduler.from_config(pipe.scheduler.config)
|
||||
@@ -322,9 +322,9 @@ canny_image = load_image("https://huggingface.co/datasets/hf-internal-testing/di
|
||||
controlnet_conditioning_scale = 0.5 # recommended for good generalization
|
||||
|
||||
image = pipe(
|
||||
prompt,
|
||||
image=canny_image,
|
||||
num_inference_steps=4,
|
||||
prompt,
|
||||
image=canny_image,
|
||||
num_inference_steps=4,
|
||||
guidance_scale=0,
|
||||
eta=0.3,
|
||||
controlnet_conditioning_scale=controlnet_conditioning_scale,
|
||||
@@ -336,7 +336,7 @@ grid_image = make_image_grid([canny_image, image], rows=1, cols=2)
|
||||

|
||||
|
||||
<Tip>
|
||||
The inference parameters in this example might not work for all examples, so we recommend you to try different values for `num_inference_steps`, `guidance_scale`, `controlnet_conditioning_scale` and `cross_attention_kwargs` parameters and choose the best one.
|
||||
The inference parameters in this example might not work for all examples, so we recommend you to try different values for `num_inference_steps`, `guidance_scale`, `controlnet_conditioning_scale` and `cross_attention_kwargs` parameters and choose the best one.
|
||||
</Tip>
|
||||
|
||||
</hfoption>
|
||||
@@ -350,7 +350,7 @@ from diffusers import StableDiffusionXLPipeline
|
||||
from diffusers.utils import load_image, make_image_grid
|
||||
|
||||
from ip_adapter import IPAdapterXL
|
||||
from scheduling_tcd import TCDScheduler
|
||||
from scheduling_tcd import TCDScheduler
|
||||
|
||||
device = "cuda"
|
||||
base_model_path = "stabilityai/stable-diffusion-xl-base-1.0"
|
||||
@@ -359,8 +359,8 @@ ip_ckpt = "sdxl_models/ip-adapter_sdxl.bin"
|
||||
tcd_lora_id = "h1t/TCD-SDXL-LoRA"
|
||||
|
||||
pipe = StableDiffusionXLPipeline.from_pretrained(
|
||||
base_model_path,
|
||||
torch_dtype=torch.float16,
|
||||
base_model_path,
|
||||
torch_dtype=torch.float16,
|
||||
variant="fp16"
|
||||
)
|
||||
pipe.scheduler = TCDScheduler.from_config(pipe.scheduler.config)
|
||||
@@ -375,13 +375,13 @@ ref_image = load_image("https://raw.githubusercontent.com/tencent-ailab/IP-Adapt
|
||||
prompt = "best quality, high quality, wearing sunglasses"
|
||||
|
||||
image = ip_model.generate(
|
||||
pil_image=ref_image,
|
||||
pil_image=ref_image,
|
||||
prompt=prompt,
|
||||
scale=0.5,
|
||||
num_samples=1,
|
||||
num_inference_steps=4,
|
||||
num_samples=1,
|
||||
num_inference_steps=4,
|
||||
guidance_scale=0,
|
||||
eta=0.3,
|
||||
eta=0.3,
|
||||
seed=0,
|
||||
)[0]
|
||||
|
||||
|
||||
@@ -230,7 +230,7 @@ from diffusers.utils import load_image, make_image_grid
|
||||
|
||||
pipeline = AutoPipelineForInpainting.from_pretrained(
|
||||
"runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, variant="fp16"
|
||||
).to("cuda")
|
||||
)
|
||||
pipeline.enable_model_cpu_offload()
|
||||
# remove following line if xFormers is not installed or you have PyTorch 2.0 or higher installed
|
||||
pipeline.enable_xformers_memory_efficient_attention()
|
||||
@@ -255,7 +255,7 @@ from diffusers.utils import load_image, make_image_grid
|
||||
|
||||
pipeline = AutoPipelineForInpainting.from_pretrained(
|
||||
"runwayml/stable-diffusion-inpainting", torch_dtype=torch.float16, variant="fp16"
|
||||
).to("cuda")
|
||||
)
|
||||
pipeline.enable_model_cpu_offload()
|
||||
# remove following line if xFormers is not installed or you have PyTorch 2.0 or higher installed
|
||||
pipeline.enable_xformers_memory_efficient_attention()
|
||||
@@ -296,7 +296,7 @@ from diffusers.utils import load_image, make_image_grid
|
||||
|
||||
pipeline = AutoPipelineForInpainting.from_pretrained(
|
||||
"runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, variant="fp16"
|
||||
).to("cuda")
|
||||
)
|
||||
pipeline.enable_model_cpu_offload()
|
||||
# remove following line if xFormers is not installed or you have PyTorch 2.0 or higher installed
|
||||
pipeline.enable_xformers_memory_efficient_attention()
|
||||
@@ -319,7 +319,7 @@ from diffusers.utils import load_image, make_image_grid
|
||||
|
||||
pipeline = AutoPipelineForInpainting.from_pretrained(
|
||||
"runwayml/stable-diffusion-inpainting", torch_dtype=torch.float16, variant="fp16"
|
||||
).to("cuda")
|
||||
)
|
||||
pipeline.enable_model_cpu_offload()
|
||||
# remove following line if xFormers is not installed or you have PyTorch 2.0 or higher installed
|
||||
pipeline.enable_xformers_memory_efficient_attention()
|
||||
|
||||
466
docs/source/en/using-diffusers/marigold_usage.md
Normal file
466
docs/source/en/using-diffusers/marigold_usage.md
Normal file
@@ -0,0 +1,466 @@
|
||||
<!--Copyright 2024 Marigold authors and The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Marigold Pipelines for Computer Vision Tasks
|
||||
|
||||
[Marigold](../api/pipelines/marigold) is a novel diffusion-based dense prediction approach, and a set of pipelines for various computer vision tasks, such as monocular depth estimation.
|
||||
|
||||
This guide will show you how to use Marigold to obtain fast and high-quality predictions for images and videos.
|
||||
|
||||
Each pipeline supports one Computer Vision task, which takes an input RGB image as input and produces a *prediction* of the modality of interest, such as a depth map of the input image.
|
||||
Currently, the following tasks are implemented:
|
||||
|
||||
| Pipeline | Predicted Modalities | Demos |
|
||||
|---------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------|:--------------------------------------------------------------------------------------------------------------------------------------------------:|
|
||||
| [MarigoldDepthPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/marigold/pipeline_marigold_depth.py) | [Depth](https://en.wikipedia.org/wiki/Depth_map), [Disparity](https://en.wikipedia.org/wiki/Binocular_disparity) | [Fast Demo (LCM)](https://huggingface.co/spaces/prs-eth/marigold-lcm), [Slow Original Demo (DDIM)](https://huggingface.co/spaces/prs-eth/marigold) |
|
||||
| [MarigoldNormalsPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/marigold/pipeline_marigold_normals.py) | [Surface normals](https://en.wikipedia.org/wiki/Normal_mapping) | [Fast Demo (LCM)](https://huggingface.co/spaces/prs-eth/marigold-normals-lcm) |
|
||||
|
||||
The original checkpoints can be found under the [PRS-ETH](https://huggingface.co/prs-eth/) Hugging Face organization.
|
||||
These checkpoints are meant to work with diffusers pipelines and the [original codebase](https://github.com/prs-eth/marigold).
|
||||
The original code can also be used to train new checkpoints.
|
||||
|
||||
| Checkpoint | Modality | Comment |
|
||||
|-----------------------------------------------------------------------------------------------|----------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
||||
| [prs-eth/marigold-v1-0](https://huggingface.co/prs-eth/marigold-v1-0) | Depth | The first Marigold Depth checkpoint, which predicts *affine-invariant depth* maps. The performance of this checkpoint in benchmarks was studied in the original [paper](https://huggingface.co/papers/2312.02145). Designed to be used with the `DDIMScheduler` at inference, it requires at least 10 steps to get reliable predictions. Affine-invariant depth prediction has a range of values in each pixel between 0 (near plane) and 1 (far plane); both planes are chosen by the model as part of the inference process. See the `MarigoldImageProcessor` reference for visualization utilities. |
|
||||
| [prs-eth/marigold-depth-lcm-v1-0](https://huggingface.co/prs-eth/marigold-depth-lcm-v1-0) | Depth | The fast Marigold Depth checkpoint, fine-tuned from `prs-eth/marigold-v1-0`. Designed to be used with the `LCMScheduler` at inference, it requires as little as 1 step to get reliable predictions. The prediction reliability saturates at 4 steps and declines after that. |
|
||||
| [prs-eth/marigold-normals-v0-1](https://huggingface.co/prs-eth/marigold-normals-v0-1) | Normals | A preview checkpoint for the Marigold Normals pipeline. Designed to be used with the `DDIMScheduler` at inference, it requires at least 10 steps to get reliable predictions. The surface normals predictions are unit-length 3D vectors with values in the range from -1 to 1. *This checkpoint will be phased out after the release of `v1-0` version.* |
|
||||
| [prs-eth/marigold-normals-lcm-v0-1](https://huggingface.co/prs-eth/marigold-normals-lcm-v0-1) | Normals | The fast Marigold Normals checkpoint, fine-tuned from `prs-eth/marigold-normals-v0-1`. Designed to be used with the `LCMScheduler` at inference, it requires as little as 1 step to get reliable predictions. The prediction reliability saturates at 4 steps and declines after that. *This checkpoint will be phased out after the release of `v1-0` version.* |
|
||||
The examples below are mostly given for depth prediction, but they can be universally applied with other supported modalities.
|
||||
We showcase the predictions using the same input image of Albert Einstein generated by Midjourney.
|
||||
This makes it easier to compare visualizations of the predictions across various modalities and checkpoints.
|
||||
|
||||
<div class="flex gap-4" style="justify-content: center; width: 100%;">
|
||||
<div style="flex: 1 1 50%; max-width: 50%;">
|
||||
<img class="rounded-xl" src="https://marigoldmonodepth.github.io/images/einstein.jpg"/>
|
||||
<figcaption class="mt-1 text-center text-sm text-gray-500">
|
||||
Example input image for all Marigold pipelines
|
||||
</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
### Depth Prediction Quick Start
|
||||
|
||||
To get the first depth prediction, load `prs-eth/marigold-depth-lcm-v1-0` checkpoint into `MarigoldDepthPipeline` pipeline, put the image through the pipeline, and save the predictions:
|
||||
|
||||
```python
|
||||
import diffusers
|
||||
import torch
|
||||
|
||||
pipe = diffusers.MarigoldDepthPipeline.from_pretrained(
|
||||
"prs-eth/marigold-depth-lcm-v1-0", variant="fp16", torch_dtype=torch.float16
|
||||
).to("cuda")
|
||||
|
||||
image = diffusers.utils.load_image("https://marigoldmonodepth.github.io/images/einstein.jpg")
|
||||
depth = pipe(image)
|
||||
|
||||
vis = pipe.image_processor.visualize_depth(depth.prediction)
|
||||
vis[0].save("einstein_depth.png")
|
||||
|
||||
depth_16bit = pipe.image_processor.export_depth_to_16bit_png(depth.prediction)
|
||||
depth_16bit[0].save("einstein_depth_16bit.png")
|
||||
```
|
||||
|
||||
The visualization function for depth [`~pipelines.marigold.marigold_image_processing.MarigoldImageProcessor.visualize_depth`] applies one of [matplotlib's colormaps](https://matplotlib.org/stable/users/explain/colors/colormaps.html) (`Spectral` by default) to map the predicted pixel values from a single-channel `[0, 1]` depth range into an RGB image.
|
||||
With the `Spectral` colormap, pixels with near depth are painted red, and far pixels are assigned blue color.
|
||||
The 16-bit PNG file stores the single channel values mapped linearly from the `[0, 1]` range into `[0, 65535]`.
|
||||
Below are the raw and the visualized predictions; as can be seen, dark areas (mustache) are easier to distinguish in the visualization:
|
||||
|
||||
<div class="flex gap-4">
|
||||
<div style="flex: 1 1 50%; max-width: 50%;">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/marigold/marigold_einstein_lcm_depth_16bit.png"/>
|
||||
<figcaption class="mt-1 text-center text-sm text-gray-500">
|
||||
Predicted depth (16-bit PNG)
|
||||
</figcaption>
|
||||
</div>
|
||||
<div style="flex: 1 1 50%; max-width: 50%;">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/marigold/marigold_einstein_lcm_depth.png"/>
|
||||
<figcaption class="mt-1 text-center text-sm text-gray-500">
|
||||
Predicted depth visualization (Spectral)
|
||||
</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
### Surface Normals Prediction Quick Start
|
||||
|
||||
Load `prs-eth/marigold-normals-lcm-v0-1` checkpoint into `MarigoldNormalsPipeline` pipeline, put the image through the pipeline, and save the predictions:
|
||||
|
||||
```python
|
||||
import diffusers
|
||||
import torch
|
||||
|
||||
pipe = diffusers.MarigoldNormalsPipeline.from_pretrained(
|
||||
"prs-eth/marigold-normals-lcm-v0-1", variant="fp16", torch_dtype=torch.float16
|
||||
).to("cuda")
|
||||
|
||||
image = diffusers.utils.load_image("https://marigoldmonodepth.github.io/images/einstein.jpg")
|
||||
normals = pipe(image)
|
||||
|
||||
vis = pipe.image_processor.visualize_normals(normals.prediction)
|
||||
vis[0].save("einstein_normals.png")
|
||||
```
|
||||
|
||||
The visualization function for normals [`~pipelines.marigold.marigold_image_processing.MarigoldImageProcessor.visualize_normals`] maps the three-dimensional prediction with pixel values in the range `[-1, 1]` into an RGB image.
|
||||
The visualization function supports flipping surface normals axes to make the visualization compatible with other choices of the frame of reference.
|
||||
Conceptually, each pixel is painted according to the surface normal vector in the frame of reference, where `X` axis points right, `Y` axis points up, and `Z` axis points at the viewer.
|
||||
Below is the visualized prediction:
|
||||
|
||||
<div class="flex gap-4" style="justify-content: center; width: 100%;">
|
||||
<div style="flex: 1 1 50%; max-width: 50%;">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/marigold/marigold_einstein_lcm_normals.png"/>
|
||||
<figcaption class="mt-1 text-center text-sm text-gray-500">
|
||||
Predicted surface normals visualization
|
||||
</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
In this example, the nose tip almost certainly has a point on the surface, in which the surface normal vector points straight at the viewer, meaning that its coordinates are `[0, 0, 1]`.
|
||||
This vector maps to the RGB `[128, 128, 255]`, which corresponds to the violet-blue color.
|
||||
Similarly, a surface normal on the cheek in the right part of the image has a large `X` component, which increases the red hue.
|
||||
Points on the shoulders pointing up with a large `Y` promote green color.
|
||||
|
||||
### Speeding up inference
|
||||
|
||||
The above quick start snippets are already optimized for speed: they load the LCM checkpoint, use the `fp16` variant of weights and computation, and perform just one denoising diffusion step.
|
||||
The `pipe(image)` call completes in 280ms on RTX 3090 GPU.
|
||||
Internally, the input image is encoded with the Stable Diffusion VAE encoder, then the U-Net performs one denoising step, and finally, the prediction latent is decoded with the VAE decoder into pixel space.
|
||||
In this case, two out of three module calls are dedicated to converting between pixel and latent space of LDM.
|
||||
Because Marigold's latent space is compatible with the base Stable Diffusion, it is possible to speed up the pipeline call by more than 3x (85ms on RTX 3090) by using a [lightweight replacement of the SD VAE](../api/models/autoencoder_tiny):
|
||||
|
||||
```diff
|
||||
import diffusers
|
||||
import torch
|
||||
|
||||
pipe = diffusers.MarigoldDepthPipeline.from_pretrained(
|
||||
"prs-eth/marigold-depth-lcm-v1-0", variant="fp16", torch_dtype=torch.float16
|
||||
).to("cuda")
|
||||
|
||||
+ pipe.vae = diffusers.AutoencoderTiny.from_pretrained(
|
||||
+ "madebyollin/taesd", torch_dtype=torch.float16
|
||||
+ ).cuda()
|
||||
|
||||
image = diffusers.utils.load_image("https://marigoldmonodepth.github.io/images/einstein.jpg")
|
||||
depth = pipe(image)
|
||||
```
|
||||
|
||||
As suggested in [Optimizations](../optimization/torch2.0#torch.compile), adding `torch.compile` may squeeze extra performance depending on the target hardware:
|
||||
|
||||
```diff
|
||||
import diffusers
|
||||
import torch
|
||||
|
||||
pipe = diffusers.MarigoldDepthPipeline.from_pretrained(
|
||||
"prs-eth/marigold-depth-lcm-v1-0", variant="fp16", torch_dtype=torch.float16
|
||||
).to("cuda")
|
||||
|
||||
+ pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
|
||||
|
||||
image = diffusers.utils.load_image("https://marigoldmonodepth.github.io/images/einstein.jpg")
|
||||
depth = pipe(image)
|
||||
```
|
||||
|
||||
## Qualitative Comparison with Depth Anything
|
||||
|
||||
With the above speed optimizations, Marigold delivers predictions with more details and faster than [Depth Anything](https://huggingface.co/docs/transformers/main/en/model_doc/depth_anything) with the largest checkpoint [LiheYoung/depth-anything-large-hf](https://huggingface.co/LiheYoung/depth-anything-large-hf):
|
||||
|
||||
<div class="flex gap-4">
|
||||
<div style="flex: 1 1 50%; max-width: 50%;">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/marigold/marigold_einstein_lcm_depth.png"/>
|
||||
<figcaption class="mt-1 text-center text-sm text-gray-500">
|
||||
Marigold LCM fp16 with Tiny AutoEncoder
|
||||
</figcaption>
|
||||
</div>
|
||||
<div style="flex: 1 1 50%; max-width: 50%;">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/marigold/einstein_depthanything_large.png"/>
|
||||
<figcaption class="mt-1 text-center text-sm text-gray-500">
|
||||
Depth Anything Large
|
||||
</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
## Maximizing Precision and Ensembling
|
||||
|
||||
Marigold pipelines have a built-in ensembling mechanism combining multiple predictions from different random latents.
|
||||
This is a brute-force way of improving the precision of predictions, capitalizing on the generative nature of diffusion.
|
||||
The ensembling path is activated automatically when the `ensemble_size` argument is set greater than `1`.
|
||||
When aiming for maximum precision, it makes sense to adjust `num_inference_steps` simultaneously with `ensemble_size`.
|
||||
The recommended values vary across checkpoints but primarily depend on the scheduler type.
|
||||
The effect of ensembling is particularly well-seen with surface normals:
|
||||
|
||||
```python
|
||||
import diffusers
|
||||
|
||||
model_path = "prs-eth/marigold-normals-v1-0"
|
||||
|
||||
model_paper_kwargs = {
|
||||
diffusers.schedulers.DDIMScheduler: {
|
||||
"num_inference_steps": 10,
|
||||
"ensemble_size": 10,
|
||||
},
|
||||
diffusers.schedulers.LCMScheduler: {
|
||||
"num_inference_steps": 4,
|
||||
"ensemble_size": 5,
|
||||
},
|
||||
}
|
||||
|
||||
image = diffusers.utils.load_image("https://marigoldmonodepth.github.io/images/einstein.jpg")
|
||||
|
||||
pipe = diffusers.MarigoldNormalsPipeline.from_pretrained(model_path).to("cuda")
|
||||
pipe_kwargs = model_paper_kwargs[type(pipe.scheduler)]
|
||||
|
||||
depth = pipe(image, **pipe_kwargs)
|
||||
|
||||
vis = pipe.image_processor.visualize_normals(depth.prediction)
|
||||
vis[0].save("einstein_normals.png")
|
||||
```
|
||||
|
||||
<div class="flex gap-4">
|
||||
<div style="flex: 1 1 50%; max-width: 50%;">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/marigold/marigold_einstein_lcm_normals.png"/>
|
||||
<figcaption class="mt-1 text-center text-sm text-gray-500">
|
||||
Surface normals, no ensembling
|
||||
</figcaption>
|
||||
</div>
|
||||
<div style="flex: 1 1 50%; max-width: 50%;">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/marigold/marigold_einstein_normals.png"/>
|
||||
<figcaption class="mt-1 text-center text-sm text-gray-500">
|
||||
Surface normals, with ensembling
|
||||
</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
As can be seen, all areas with fine-grained structurers, such as hair, got more conservative and on average more correct predictions.
|
||||
Such a result is more suitable for precision-sensitive downstream tasks, such as 3D reconstruction.
|
||||
|
||||
## Quantitative Evaluation
|
||||
|
||||
To evaluate Marigold quantitatively in standard leaderboards and benchmarks (such as NYU, KITTI, and other datasets), follow the evaluation protocol outlined in the paper: load the full precision fp32 model and use appropriate values for `num_inference_steps` and `ensemble_size`.
|
||||
Optionally seed randomness to ensure reproducibility. Maximizing `batch_size` will deliver maximum device utilization.
|
||||
|
||||
```python
|
||||
import diffusers
|
||||
import torch
|
||||
|
||||
device = "cuda"
|
||||
seed = 2024
|
||||
model_path = "prs-eth/marigold-v1-0"
|
||||
|
||||
model_paper_kwargs = {
|
||||
diffusers.schedulers.DDIMScheduler: {
|
||||
"num_inference_steps": 50,
|
||||
"ensemble_size": 10,
|
||||
},
|
||||
diffusers.schedulers.LCMScheduler: {
|
||||
"num_inference_steps": 4,
|
||||
"ensemble_size": 10,
|
||||
},
|
||||
}
|
||||
|
||||
image = diffusers.utils.load_image("https://marigoldmonodepth.github.io/images/einstein.jpg")
|
||||
|
||||
generator = torch.Generator(device=device).manual_seed(seed)
|
||||
pipe = diffusers.MarigoldDepthPipeline.from_pretrained(model_path).to(device)
|
||||
pipe_kwargs = model_paper_kwargs[type(pipe.scheduler)]
|
||||
|
||||
depth = pipe(image, generator=generator, **pipe_kwargs)
|
||||
|
||||
# evaluate metrics
|
||||
```
|
||||
|
||||
## Using Predictive Uncertainty
|
||||
|
||||
The ensembling mechanism built into Marigold pipelines combines multiple predictions obtained from different random latents.
|
||||
As a side effect, it can be used to quantify epistemic (model) uncertainty; simply specify `ensemble_size` greater than 1 and set `output_uncertainty=True`.
|
||||
The resulting uncertainty will be available in the `uncertainty` field of the output.
|
||||
It can be visualized as follows:
|
||||
|
||||
```python
|
||||
import diffusers
|
||||
import torch
|
||||
|
||||
pipe = diffusers.MarigoldDepthPipeline.from_pretrained(
|
||||
"prs-eth/marigold-depth-lcm-v1-0", variant="fp16", torch_dtype=torch.float16
|
||||
).to("cuda")
|
||||
|
||||
image = diffusers.utils.load_image("https://marigoldmonodepth.github.io/images/einstein.jpg")
|
||||
depth = pipe(
|
||||
image,
|
||||
ensemble_size=10, # any number greater than 1; higher values yield higher precision
|
||||
output_uncertainty=True,
|
||||
)
|
||||
|
||||
uncertainty = pipe.image_processor.visualize_uncertainty(depth.uncertainty)
|
||||
uncertainty[0].save("einstein_depth_uncertainty.png")
|
||||
```
|
||||
|
||||
<div class="flex gap-4">
|
||||
<div style="flex: 1 1 50%; max-width: 50%;">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/marigold/marigold_einstein_depth_uncertainty.png"/>
|
||||
<figcaption class="mt-1 text-center text-sm text-gray-500">
|
||||
Depth uncertainty
|
||||
</figcaption>
|
||||
</div>
|
||||
<div style="flex: 1 1 50%; max-width: 50%;">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/marigold/marigold_einstein_normals_uncertainty.png"/>
|
||||
<figcaption class="mt-1 text-center text-sm text-gray-500">
|
||||
Surface normals uncertainty
|
||||
</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
The interpretation of uncertainty is easy: higher values (white) correspond to pixels, where the model struggles to make consistent predictions.
|
||||
Evidently, the depth model is the least confident around edges with discontinuity, where the object depth changes drastically.
|
||||
The surface normals model is the least confident in fine-grained structures, such as hair, and dark areas, such as the collar.
|
||||
|
||||
## Frame-by-frame Video Processing with Temporal Consistency
|
||||
|
||||
Due to Marigold's generative nature, each prediction is unique and defined by the random noise sampled for the latent initialization.
|
||||
This becomes an obvious drawback compared to traditional end-to-end dense regression networks, as exemplified in the following videos:
|
||||
|
||||
<div class="flex gap-4">
|
||||
<div style="flex: 1 1 50%; max-width: 50%;">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/marigold/marigold_obama.gif"/>
|
||||
<figcaption class="mt-1 text-center text-sm text-gray-500">Input video</figcaption>
|
||||
</div>
|
||||
<div style="flex: 1 1 50%; max-width: 50%;">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/marigold/marigold_obama_depth_independent.gif"/>
|
||||
<figcaption class="mt-1 text-center text-sm text-gray-500">Marigold Depth applied to input video frames independently</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
To address this issue, it is possible to pass `latents` argument to the pipelines, which defines the starting point of diffusion.
|
||||
Empirically, we found that a convex combination of the very same starting point noise latent and the latent corresponding to the previous frame prediction give sufficiently smooth results, as implemented in the snippet below:
|
||||
|
||||
```python
|
||||
import imageio
|
||||
from PIL import Image
|
||||
from tqdm import tqdm
|
||||
import diffusers
|
||||
import torch
|
||||
|
||||
device = "cuda"
|
||||
path_in = "obama.mp4"
|
||||
path_out = "obama_depth.gif"
|
||||
|
||||
pipe = diffusers.MarigoldDepthPipeline.from_pretrained(
|
||||
"prs-eth/marigold-depth-lcm-v1-0", variant="fp16", torch_dtype=torch.float16
|
||||
).to(device)
|
||||
pipe.vae = diffusers.AutoencoderTiny.from_pretrained(
|
||||
"madebyollin/taesd", torch_dtype=torch.float16
|
||||
).to(device)
|
||||
pipe.set_progress_bar_config(disable=True)
|
||||
|
||||
with imageio.get_reader(path_in) as reader:
|
||||
size = reader.get_meta_data()['size']
|
||||
last_frame_latent = None
|
||||
latent_common = torch.randn(
|
||||
(1, 4, 768 * size[1] // (8 * max(size)), 768 * size[0] // (8 * max(size)))
|
||||
).to(device=device, dtype=torch.float16)
|
||||
|
||||
out = []
|
||||
for frame_id, frame in tqdm(enumerate(reader), desc="Processing Video"):
|
||||
frame = Image.fromarray(frame)
|
||||
latents = latent_common
|
||||
if last_frame_latent is not None:
|
||||
latents = 0.9 * latents + 0.1 * last_frame_latent
|
||||
|
||||
depth = pipe(
|
||||
frame, match_input_resolution=False, latents=latents, output_latent=True
|
||||
)
|
||||
last_frame_latent = depth.latent
|
||||
out.append(pipe.image_processor.visualize_depth(depth.prediction)[0])
|
||||
|
||||
diffusers.utils.export_to_gif(out, path_out, fps=reader.get_meta_data()['fps'])
|
||||
```
|
||||
|
||||
Here, the diffusion process starts from the given computed latent.
|
||||
The pipeline sets `output_latent=True` to access `out.latent` and computes its contribution to the next frame's latent initialization.
|
||||
The result is much more stable now:
|
||||
|
||||
<div class="flex gap-4">
|
||||
<div style="flex: 1 1 50%; max-width: 50%;">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/marigold/marigold_obama_depth_independent.gif"/>
|
||||
<figcaption class="mt-1 text-center text-sm text-gray-500">Marigold Depth applied to input video frames independently</figcaption>
|
||||
</div>
|
||||
<div style="flex: 1 1 50%; max-width: 50%;">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/marigold/marigold_obama_depth_consistent.gif"/>
|
||||
<figcaption class="mt-1 text-center text-sm text-gray-500">Marigold Depth with forced latents initialization</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
## Marigold for ControlNet
|
||||
|
||||
A very common application for depth prediction with diffusion models comes in conjunction with ControlNet.
|
||||
Depth crispness plays a crucial role in obtaining high-quality results from ControlNet.
|
||||
As seen in comparisons with other methods above, Marigold excels at that task.
|
||||
The snippet below demonstrates how to load an image, compute depth, and pass it into ControlNet in a compatible format:
|
||||
|
||||
```python
|
||||
import torch
|
||||
import diffusers
|
||||
|
||||
device = "cuda"
|
||||
generator = torch.Generator(device=device).manual_seed(2024)
|
||||
image = diffusers.utils.load_image(
|
||||
"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/controlnet_depth_source.png"
|
||||
)
|
||||
|
||||
pipe = diffusers.MarigoldDepthPipeline.from_pretrained(
|
||||
"prs-eth/marigold-depth-lcm-v1-0", torch_dtype=torch.float16, variant="fp16"
|
||||
).to(device)
|
||||
|
||||
depth_image = pipe(image, generator=generator).prediction
|
||||
depth_image = pipe.image_processor.visualize_depth(depth_image, color_map="binary")
|
||||
depth_image[0].save("motorcycle_controlnet_depth.png")
|
||||
|
||||
controlnet = diffusers.ControlNetModel.from_pretrained(
|
||||
"diffusers/controlnet-depth-sdxl-1.0", torch_dtype=torch.float16, variant="fp16"
|
||||
).to(device)
|
||||
pipe = diffusers.StableDiffusionXLControlNetPipeline.from_pretrained(
|
||||
"SG161222/RealVisXL_V4.0", torch_dtype=torch.float16, variant="fp16", controlnet=controlnet
|
||||
).to(device)
|
||||
pipe.scheduler = diffusers.DPMSolverMultistepScheduler.from_config(pipe.scheduler.config, use_karras_sigmas=True)
|
||||
|
||||
controlnet_out = pipe(
|
||||
prompt="high quality photo of a sports bike, city",
|
||||
negative_prompt="",
|
||||
guidance_scale=6.5,
|
||||
num_inference_steps=25,
|
||||
image=depth_image,
|
||||
controlnet_conditioning_scale=0.7,
|
||||
control_guidance_end=0.7,
|
||||
generator=generator,
|
||||
).images
|
||||
controlnet_out[0].save("motorcycle_controlnet_out.png")
|
||||
```
|
||||
|
||||
<div class="flex gap-4">
|
||||
<div style="flex: 1 1 33%; max-width: 33%;">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/controlnet_depth_source.png"/>
|
||||
<figcaption class="mt-1 text-center text-sm text-gray-500">
|
||||
Input image
|
||||
</figcaption>
|
||||
</div>
|
||||
<div style="flex: 1 1 33%; max-width: 33%;">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/marigold/motorcycle_controlnet_depth.png"/>
|
||||
<figcaption class="mt-1 text-center text-sm text-gray-500">
|
||||
Depth in the format compatible with ControlNet
|
||||
</figcaption>
|
||||
</div>
|
||||
<div style="flex: 1 1 33%; max-width: 33%;">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/marigold/motorcycle_controlnet_out.png"/>
|
||||
<figcaption class="mt-1 text-center text-sm text-gray-500">
|
||||
ControlNet generation, conditioned on depth and prompt: "high quality photo of a sports bike, city"
|
||||
</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
Hopefully, you will find Marigold useful for solving your downstream tasks, be it a part of a more broad generative workflow, or a perception task, such as 3D reconstruction.
|
||||
@@ -10,156 +10,86 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Load different Stable Diffusion formats
|
||||
# Model files and layouts
|
||||
|
||||
[[open-in-colab]]
|
||||
|
||||
Stable Diffusion models are available in different formats depending on the framework they're trained and saved with, and where you download them from. Converting these formats for use in 🤗 Diffusers allows you to use all the features supported by the library, such as [using different schedulers](schedulers) for inference, [building your custom pipeline](write_own_pipeline), and a variety of techniques and methods for [optimizing inference speed](../optimization/opt_overview).
|
||||
Diffusion models are saved in various file types and organized in different layouts. Diffusers stores model weights as safetensors files in *Diffusers-multifolder* layout and it also supports loading files (like safetensors and ckpt files) from a *single-file* layout which is commonly used in the diffusion ecosystem.
|
||||
|
||||
<Tip>
|
||||
Each layout has its own benefits and use cases, and this guide will show you how to load the different files and layouts, and how to convert them.
|
||||
|
||||
We highly recommend using the `.safetensors` format because it is more secure than traditional pickled files which are vulnerable and can be exploited to execute any code on your machine (learn more in the [Load safetensors](using_safetensors) guide).
|
||||
## Files
|
||||
|
||||
</Tip>
|
||||
PyTorch model weights are typically saved with Python's [pickle](https://docs.python.org/3/library/pickle.html) utility as ckpt or bin files. However, pickle is not secure and pickled files may contain malicious code that can be executed. This vulnerability is a serious concern given the popularity of model sharing. To address this security issue, the [Safetensors](https://hf.co/docs/safetensors) library was developed as a secure alternative to pickle, which saves models as safetensors files.
|
||||
|
||||
This guide will show you how to convert other Stable Diffusion formats to be compatible with 🤗 Diffusers.
|
||||
### safetensors
|
||||
|
||||
## PyTorch .ckpt
|
||||
> [!TIP]
|
||||
> Learn more about the design decisions and why safetensor files are preferred for saving and loading model weights in the [Safetensors audited as really safe and becoming the default](https://blog.eleuther.ai/safetensors-security-audit/) blog post.
|
||||
|
||||
The checkpoint - or `.ckpt` - format is commonly used to store and save models. The `.ckpt` file contains the entire model and is typically several GBs in size. While you can load and use a `.ckpt` file directly with the [`~StableDiffusionPipeline.from_single_file`] method, it is generally better to convert the `.ckpt` file to 🤗 Diffusers so both formats are available.
|
||||
[Safetensors](https://hf.co/docs/safetensors) is a safe and fast file format for securely storing and loading tensors. Safetensors restricts the header size to limit certain types of attacks, supports lazy loading (useful for distributed setups), and has generally faster loading speeds.
|
||||
|
||||
There are two options for converting a `.ckpt` file: use a Space to convert the checkpoint or convert the `.ckpt` file with a script.
|
||||
Make sure you have the [Safetensors](https://hf.co/docs/safetensors) library installed.
|
||||
|
||||
### Convert with a Space
|
||||
|
||||
The easiest and most convenient way to convert a `.ckpt` file is to use the [SD to Diffusers](https://huggingface.co/spaces/diffusers/sd-to-diffusers) Space. You can follow the instructions on the Space to convert the `.ckpt` file.
|
||||
|
||||
This approach works well for basic models, but it may struggle with more customized models. You'll know the Space failed if it returns an empty pull request or error. In this case, you can try converting the `.ckpt` file with a script.
|
||||
|
||||
### Convert with a script
|
||||
|
||||
🤗 Diffusers provides a [conversion script](https://github.com/huggingface/diffusers/blob/main/scripts/convert_original_stable_diffusion_to_diffusers.py) for converting `.ckpt` files. This approach is more reliable than the Space above.
|
||||
|
||||
Before you start, make sure you have a local clone of 🤗 Diffusers to run the script and log in to your Hugging Face account so you can open pull requests and push your converted model to the Hub.
|
||||
|
||||
```bash
|
||||
huggingface-cli login
|
||||
```py
|
||||
!pip install safetensors
|
||||
```
|
||||
|
||||
To use the script:
|
||||
Safetensors stores weights in a safetensors file. Diffusers loads safetensors files by default if they're available and the Safetensors library is installed. There are two ways safetensors files can be organized:
|
||||
|
||||
1. Git clone the repository containing the `.ckpt` file you want to convert. For this example, let's convert this [TemporalNet](https://huggingface.co/CiaraRowles/TemporalNet) `.ckpt` file:
|
||||
1. Diffusers-multifolder layout: there may be several separate safetensors files, one for each pipeline component (text encoder, UNet, VAE), organized in subfolders (check out the [runwayml/stable-diffusion-v1-5](https://hf.co/runwayml/stable-diffusion-v1-5/tree/main) repository as an example)
|
||||
2. single-file layout: all the model weights may be saved in a single file (check out the [WarriorMama777/OrangeMixs](https://hf.co/WarriorMama777/OrangeMixs/tree/main/Models/AbyssOrangeMix) repository as an example)
|
||||
|
||||
```bash
|
||||
git lfs install
|
||||
git clone https://huggingface.co/CiaraRowles/TemporalNet
|
||||
```
|
||||
<hfoptions id="safetensors">
|
||||
<hfoption id="multifolder">
|
||||
|
||||
2. Open a pull request on the repository where you're converting the checkpoint from:
|
||||
|
||||
```bash
|
||||
cd TemporalNet && git fetch origin refs/pr/13:pr/13
|
||||
git checkout pr/13
|
||||
```
|
||||
|
||||
3. There are several input arguments to configure in the conversion script, but the most important ones are:
|
||||
|
||||
- `checkpoint_path`: the path to the `.ckpt` file to convert.
|
||||
- `original_config_file`: a YAML file defining the configuration of the original architecture. If you can't find this file, try searching for the YAML file in the GitHub repository where you found the `.ckpt` file.
|
||||
- `dump_path`: the path to the converted model.
|
||||
|
||||
For example, you can take the `cldm_v15.yaml` file from the [ControlNet](https://github.com/lllyasviel/ControlNet/tree/main/models) repository because the TemporalNet model is a Stable Diffusion v1.5 and ControlNet model.
|
||||
|
||||
4. Now you can run the script to convert the `.ckpt` file:
|
||||
|
||||
```bash
|
||||
python ../diffusers/scripts/convert_original_stable_diffusion_to_diffusers.py --checkpoint_path temporalnetv3.ckpt --original_config_file cldm_v15.yaml --dump_path ./ --controlnet
|
||||
```
|
||||
|
||||
5. Once the conversion is done, upload your converted model and test out the resulting [pull request](https://huggingface.co/CiaraRowles/TemporalNet/discussions/13)!
|
||||
|
||||
```bash
|
||||
git push origin pr/13:refs/pr/13
|
||||
```
|
||||
|
||||
## Keras .pb or .h5
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
🧪 This is an experimental feature. Only Stable Diffusion v1 checkpoints are supported by the Convert KerasCV Space at the moment.
|
||||
|
||||
</Tip>
|
||||
|
||||
[KerasCV](https://keras.io/keras_cv/) supports training for [Stable Diffusion](https://github.com/keras-team/keras-cv/blob/master/keras_cv/models/stable_diffusion) v1 and v2. However, it offers limited support for experimenting with Stable Diffusion models for inference and deployment whereas 🤗 Diffusers has a more complete set of features for this purpose, such as different [noise schedulers](https://huggingface.co/docs/diffusers/using-diffusers/schedulers), [flash attention](https://huggingface.co/docs/diffusers/optimization/xformers), and [other
|
||||
optimization techniques](https://huggingface.co/docs/diffusers/optimization/fp16).
|
||||
|
||||
The [Convert KerasCV](https://huggingface.co/spaces/sayakpaul/convert-kerascv-sd-diffusers) Space converts `.pb` or `.h5` files to PyTorch, and then wraps them in a [`StableDiffusionPipeline`] so it is ready for inference. The converted checkpoint is stored in a repository on the Hugging Face Hub.
|
||||
|
||||
For this example, let's convert the [`sayakpaul/textual-inversion-kerasio`](https://huggingface.co/sayakpaul/textual-inversion-kerasio/tree/main) checkpoint which was trained with Textual Inversion. It uses the special token `<my-funny-cat>` to personalize images with cats.
|
||||
|
||||
The Convert KerasCV Space allows you to input the following:
|
||||
|
||||
* Your Hugging Face token.
|
||||
* Paths to download the UNet and text encoder weights from. Depending on how the model was trained, you don't necessarily need to provide the paths to both the UNet and text encoder. For example, Textual Inversion only requires the embeddings from the text encoder and a text-to-image model only requires the UNet weights.
|
||||
* Placeholder token is only applicable for textual inversion models.
|
||||
* The `output_repo_prefix` is the name of the repository where the converted model is stored.
|
||||
|
||||
Click the **Submit** button to automatically convert the KerasCV checkpoint! Once the checkpoint is successfully converted, you'll see a link to the new repository containing the converted checkpoint. Follow the link to the new repository, and you'll see the Convert KerasCV Space generated a model card with an inference widget to try out the converted model.
|
||||
|
||||
If you prefer to run inference with code, click on the **Use in Diffusers** button in the upper right corner of the model card to copy and paste the code snippet:
|
||||
Use the [`~DiffusionPipeline.from_pretrained`] method to load a model with safetensors files stored in multiple folders.
|
||||
|
||||
```py
|
||||
from diffusers import DiffusionPipeline
|
||||
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
"sayakpaul/textual-inversion-cat-kerascv_sd_diffusers_pipeline", use_safetensors=True
|
||||
"runwayml/stable-diffusion-v1-5",
|
||||
use_safetensors=True
|
||||
)
|
||||
```
|
||||
|
||||
Then, you can generate an image like:
|
||||
</hfoption>
|
||||
<hfoption id="single file">
|
||||
|
||||
Use the [`~loaders.FromSingleFileMixin.from_single_file`] method to load a model with all the weights stored in a single safetensors file.
|
||||
|
||||
```py
|
||||
from diffusers import DiffusionPipeline
|
||||
from diffusers import StableDiffusionPipeline
|
||||
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
"sayakpaul/textual-inversion-cat-kerascv_sd_diffusers_pipeline", use_safetensors=True
|
||||
pipeline = StableDiffusionPipeline.from_single_file(
|
||||
"https://huggingface.co/WarriorMama777/OrangeMixs/blob/main/Models/AbyssOrangeMix/AbyssOrangeMix.safetensors"
|
||||
)
|
||||
pipeline.to("cuda")
|
||||
|
||||
placeholder_token = "<my-funny-cat-token>"
|
||||
prompt = f"two {placeholder_token} getting married, photorealistic, high quality"
|
||||
image = pipeline(prompt, num_inference_steps=50).images[0]
|
||||
```
|
||||
|
||||
## A1111 LoRA files
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
[Automatic1111](https://github.com/AUTOMATIC1111/stable-diffusion-webui) (A1111) is a popular web UI for Stable Diffusion that supports model sharing platforms like [Civitai](https://civitai.com/). Models trained with the Low-Rank Adaptation (LoRA) technique are especially popular because they're fast to train and have a much smaller file size than a fully finetuned model. 🤗 Diffusers supports loading A1111 LoRA checkpoints with [`~loaders.LoraLoaderMixin.load_lora_weights`]:
|
||||
#### LoRA files
|
||||
|
||||
[LoRA](https://hf.co/docs/peft/conceptual_guides/adapter#low-rank-adaptation-lora) is a lightweight adapter that is fast and easy to train, making them especially popular for generating images in a certain way or style. These adapters are commonly stored in a safetensors file, and are widely popular on model sharing platforms like [civitai](https://civitai.com/).
|
||||
|
||||
LoRAs are loaded into a base model with the [`~loaders.LoraLoaderMixin.load_lora_weights`] method.
|
||||
|
||||
```py
|
||||
from diffusers import StableDiffusionXLPipeline
|
||||
import torch
|
||||
|
||||
# base model
|
||||
pipeline = StableDiffusionXLPipeline.from_pretrained(
|
||||
"Lykon/dreamshaper-xl-1-0", torch_dtype=torch.float16, variant="fp16"
|
||||
).to("cuda")
|
||||
```
|
||||
|
||||
Download a LoRA checkpoint from Civitai; this example uses the [Blueprintify SD XL 1.0](https://civitai.com/models/150986/blueprintify-sd-xl-10) checkpoint, but feel free to try out any LoRA checkpoint!
|
||||
# download LoRA weights
|
||||
!wget https://civitai.com/api/download/models/168776 -O blueprintify.safetensors
|
||||
|
||||
```py
|
||||
# uncomment to download the safetensor weights
|
||||
#!wget https://civitai.com/api/download/models/168776 -O blueprintify.safetensors
|
||||
```
|
||||
|
||||
Load the LoRA checkpoint into the pipeline with the [`~loaders.LoraLoaderMixin.load_lora_weights`] method:
|
||||
|
||||
```py
|
||||
# load LoRA weights
|
||||
pipeline.load_lora_weights(".", weight_name="blueprintify.safetensors")
|
||||
```
|
||||
|
||||
Now you can use the pipeline to generate images:
|
||||
|
||||
```py
|
||||
prompt = "bl3uprint, a highly detailed blueprint of the empire state building, explaining how to build all parts, many txt, blueprint grid backdrop"
|
||||
negative_prompt = "lowres, cropped, worst quality, low quality, normal quality, artifacts, signature, watermark, username, blurry, more than one bridge, bad architecture"
|
||||
|
||||
@@ -174,3 +104,379 @@ image
|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/blueprint-lora.png"/>
|
||||
</div>
|
||||
|
||||
### ckpt
|
||||
|
||||
> [!WARNING]
|
||||
> Pickled files may be unsafe because they can be exploited to execute malicious code. It is recommended to use safetensors files instead where possible, or convert the weights to safetensors files.
|
||||
|
||||
PyTorch's [torch.save](https://pytorch.org/docs/stable/generated/torch.save.html) function uses Python's [pickle](https://docs.python.org/3/library/pickle.html) utility to serialize and save models. These files are saved as a ckpt file and they contain the entire model's weights.
|
||||
|
||||
Use the [`~loaders.FromSingleFileMixin.from_single_file`] method to directly load a ckpt file.
|
||||
|
||||
```py
|
||||
from diffusers import StableDiffusionPipeline
|
||||
|
||||
pipeline = StableDiffusionPipeline.from_single_file(
|
||||
"https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned.ckpt"
|
||||
)
|
||||
```
|
||||
|
||||
## Storage layout
|
||||
|
||||
There are two ways model files are organized, either in a Diffusers-multifolder layout or in a single-file layout. The Diffusers-multifolder layout is the default, and each component file (text encoder, UNet, VAE) is stored in a separate subfolder. Diffusers also supports loading models from a single-file layout where all the components are bundled together.
|
||||
|
||||
### Diffusers-multifolder
|
||||
|
||||
The Diffusers-multifolder layout is the default storage layout for Diffusers. Each component's (text encoder, UNet, VAE) weights are stored in a separate subfolder. The weights can be stored as safetensors or ckpt files.
|
||||
|
||||
<div class="flex flex-row gap-4">
|
||||
<div class="flex-1">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/multifolder-layout.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">multifolder layout</figcaption>
|
||||
</div>
|
||||
<div class="flex-1">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/multifolder-unet.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">UNet subfolder</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
To load from Diffusers-multifolder layout, use the [`~DiffusionPipeline.from_pretrained`] method.
|
||||
|
||||
```py
|
||||
from diffusers import DiffusionPipeline
|
||||
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0",
|
||||
torch_dtype=torch.float16,
|
||||
variant="fp16",
|
||||
use_safetensors=True,
|
||||
).to("cuda")
|
||||
```
|
||||
|
||||
Benefits of using the Diffusers-multifolder layout include:
|
||||
|
||||
1. Faster to load each component file individually or in parallel.
|
||||
2. Reduced memory usage because you only load the components you need. For example, models like [SDXL Turbo](https://hf.co/stabilityai/sdxl-turbo), [SDXL Lightning](https://hf.co/ByteDance/SDXL-Lightning), and [Hyper-SD](https://hf.co/ByteDance/Hyper-SD) have the same components except for the UNet. You can reuse their shared components with the [`~DiffusionPipeline.from_pipe`] method without consuming any additional memory (take a look at the [Reuse a pipeline](./loading#reuse-a-pipeline) guide) and only load the UNet. This way, you don't need to download redundant components and unnecessarily use more memory.
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import StableDiffusionXLPipeline, UNet2DConditionModel, EulerDiscreteScheduler
|
||||
|
||||
# download one model
|
||||
sdxl_pipeline = StableDiffusionXLPipeline.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0",
|
||||
torch_dtype=torch.float16,
|
||||
variant="fp16",
|
||||
use_safetensors=True,
|
||||
).to("cuda")
|
||||
|
||||
# switch UNet for another model
|
||||
unet = UNet2DConditionModel.from_pretrained(
|
||||
"stabilityai/sdxl-turbo",
|
||||
subfolder="unet",
|
||||
torch_dtype=torch.float16,
|
||||
variant="fp16",
|
||||
use_safetensors=True
|
||||
)
|
||||
# reuse all the same components in new model except for the UNet
|
||||
turbo_pipeline = StableDiffusionXLPipeline.from_pipe(
|
||||
sdxl_pipeline, unet=unet,
|
||||
).to("cuda")
|
||||
turbo_pipeline.scheduler = EulerDiscreteScheduler.from_config(
|
||||
turbo_pipeline.scheduler.config,
|
||||
timestep+spacing="trailing"
|
||||
)
|
||||
image = turbo_pipeline(
|
||||
"an astronaut riding a unicorn on mars",
|
||||
num_inference_steps=1,
|
||||
guidance_scale=0.0,
|
||||
).images[0]
|
||||
image
|
||||
```
|
||||
|
||||
3. Reduced storage requirements because if a component, such as the SDXL [VAE](https://hf.co/madebyollin/sdxl-vae-fp16-fix), is shared across multiple models, you only need to download and store a single copy of it instead of downloading and storing it multiple times. For 10 SDXL models, this can save ~3.5GB of storage. The storage savings is even greater for newer models like PixArt Sigma, where the [text encoder](https://hf.co/PixArt-alpha/PixArt-Sigma-XL-2-1024-MS/tree/main/text_encoder) alone is ~19GB!
|
||||
4. Flexibility to replace a component in the model with a newer or better version.
|
||||
|
||||
```py
|
||||
from diffusers import DiffusionPipeline, AutoencoderKL
|
||||
|
||||
vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16, use_safetensors=True)
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0",
|
||||
vae=vae,
|
||||
torch_dtype=torch.float16,
|
||||
variant="fp16",
|
||||
use_safetensors=True,
|
||||
).to("cuda")
|
||||
```
|
||||
|
||||
5. More visibility and information about a model's components, which are stored in a [config.json](https://hf.co/stabilityai/stable-diffusion-xl-base-1.0/blob/main/unet/config.json) file in each component subfolder.
|
||||
|
||||
### Single-file
|
||||
|
||||
The single-file layout stores all the model weights in a single file. All the model components (text encoder, UNet, VAE) weights are kept together instead of separately in subfolders. This can be a safetensors or ckpt file.
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/single-file-layout.png"/>
|
||||
</div>
|
||||
|
||||
To load from a single-file layout, use the [`~loaders.FromSingleFileMixin.from_single_file`] method.
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import StableDiffusionXLPipeline
|
||||
|
||||
pipeline = StableDiffusionXLPipeline.from_single_file(
|
||||
"https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/blob/main/sd_xl_base_1.0.safetensors",
|
||||
torch_dtype=torch.float16,
|
||||
variant="fp16",
|
||||
use_safetensors=True,
|
||||
).to("cuda")
|
||||
```
|
||||
|
||||
Benefits of using a single-file layout include:
|
||||
|
||||
1. Easy compatibility with diffusion interfaces such as [ComfyUI](https://github.com/comfyanonymous/ComfyUI) or [Automatic1111](https://github.com/AUTOMATIC1111/stable-diffusion-webui) which commonly use a single-file layout.
|
||||
2. Easier to manage (download and share) a single file.
|
||||
|
||||
## Convert layout and files
|
||||
|
||||
Diffusers provides many scripts and methods to convert storage layouts and file formats to enable broader support across the diffusion ecosystem.
|
||||
|
||||
Take a look at the [diffusers/scripts](https://github.com/huggingface/diffusers/tree/main/scripts) collection to find a script that fits your conversion needs.
|
||||
|
||||
> [!TIP]
|
||||
> Scripts that have "`to_diffusers`" appended at the end mean they convert a model to the Diffusers-multifolder layout. Each script has their own specific set of arguments for configuring the conversion, so make sure you check what arguments are available!
|
||||
|
||||
For example, to convert a Stable Diffusion XL model stored in Diffusers-multifolder layout to a single-file layout, run the [convert_diffusers_to_original_sdxl.py](https://github.com/huggingface/diffusers/blob/main/scripts/convert_diffusers_to_original_sdxl.py) script. Provide the path to the model to convert, and the path to save the converted model to. You can optionally specify whether you want to save the model as a safetensors file and whether to save the model in half-precision.
|
||||
|
||||
```bash
|
||||
python convert_diffusers_to_original_sdxl.py --model_path path/to/model/to/convert --checkpoint_path path/to/save/model/to --use_safetensors
|
||||
```
|
||||
|
||||
You can also save a model to Diffusers-multifolder layout with the [`~DiffusionPipeline.save_pretrained`] method. This creates a directory for you if it doesn't already exist, and it also saves the files as a safetensors file by default.
|
||||
|
||||
```py
|
||||
from diffusers import StableDiffusionXLPipeline
|
||||
|
||||
pipeline = StableDiffusionXLPipeline.from_single_file(
|
||||
"https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/blob/main/sd_xl_base_1.0.safetensors",
|
||||
)
|
||||
pipeline.save_pretrained()
|
||||
```
|
||||
|
||||
Lastly, there are also Spaces, such as [SD To Diffusers](https://hf.co/spaces/diffusers/sd-to-diffusers) and [SD-XL To Diffusers](https://hf.co/spaces/diffusers/sdxl-to-diffusers), that provide a more user-friendly interface for converting models to Diffusers-multifolder layout. This is the easiest and most convenient option for converting layouts, and it'll open a PR on your model repository with the converted files. However, this option is not as reliable as running a script, and the Space may fail for more complicated models.
|
||||
|
||||
## Single-file layout usage
|
||||
|
||||
Now that you're familiar with the differences between the Diffusers-multifolder and single-file layout, this section shows you how to load models and pipeline components, customize configuration options for loading, and load local files with the [`~loaders.FromSingleFileMixin.from_single_file`] method.
|
||||
|
||||
### Load a pipeline or model
|
||||
|
||||
Pass the file path of the pipeline or model to the [`~loaders.FromSingleFileMixin.from_single_file`] method to load it.
|
||||
|
||||
<hfoptions id="pipeline-model">
|
||||
<hfoption id="pipeline">
|
||||
|
||||
```py
|
||||
from diffusers import StableDiffusionXLPipeline
|
||||
|
||||
ckpt_path = "https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/blob/main/sd_xl_base_1.0_0.9vae.safetensors"
|
||||
pipeline = StableDiffusionXLPipeline.from_single_file(ckpt_path)
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="model">
|
||||
|
||||
```py
|
||||
from diffusers import StableCascadeUNet
|
||||
|
||||
ckpt_path = "https://huggingface.co/stabilityai/stable-cascade/blob/main/stage_b_lite.safetensors"
|
||||
model = StableCascadeUNet.from_single_file(ckpt_path)
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
Customize components in the pipeline by passing them directly to the [`~loaders.FromSingleFileMixin.from_single_file`] method. For example, you can use a different scheduler in a pipeline.
|
||||
|
||||
```py
|
||||
from diffusers import StableDiffusionXLPipeline, DDIMScheduler
|
||||
|
||||
ckpt_path = "https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/blob/main/sd_xl_base_1.0_0.9vae.safetensors"
|
||||
scheduler = DDIMScheduler()
|
||||
pipeline = StableDiffusionXLPipeline.from_single_file(ckpt_path, scheduler=scheduler)
|
||||
```
|
||||
|
||||
Or you could use a ControlNet model in the pipeline.
|
||||
|
||||
```py
|
||||
from diffusers import StableDiffusionControlNetPipeline, ControlNetModel
|
||||
|
||||
ckpt_path = "https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.safetensors"
|
||||
controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11p_sd15_canny")
|
||||
pipeline = StableDiffusionControlNetPipeline.from_single_file(ckpt_path, controlnet=controlnet)
|
||||
```
|
||||
|
||||
### Customize configuration options
|
||||
|
||||
Models have a configuration file that define their attributes like the number of inputs in a UNet. Pipelines configuration options are available in the pipeline's class. For example, if you look at the [`StableDiffusionXLInstructPix2PixPipeline`] class, there is an option to scale the image latents with the `is_cosxl_edit` parameter.
|
||||
|
||||
These configuration files can be found in the models Hub repository or another location from which the configuration file originated (for example, a GitHub repository or locally on your device).
|
||||
|
||||
<hfoptions id="config-file">
|
||||
<hfoption id="Hub configuration file">
|
||||
|
||||
> [!TIP]
|
||||
> The [`~loaders.FromSingleFileMixin.from_single_file`] method automatically maps the checkpoint to the appropriate model repository, but there are cases where it is useful to use the `config` parameter. For example, if the model components in the checkpoint are different from the original checkpoint or if a checkpoint doesn't have the necessary metadata to correctly determine the configuration to use for the pipeline.
|
||||
|
||||
The [`~loaders.FromSingleFileMixin.from_single_file`] method automatically determines the configuration to use from the configuration file in the model repository. You could also explicitly specify the configuration to use by providing the repository id to the `config` parameter.
|
||||
|
||||
```py
|
||||
from diffusers import StableDiffusionXLPipeline
|
||||
|
||||
ckpt_path = "https://huggingface.co/segmind/SSD-1B/blob/main/SSD-1B.safetensors"
|
||||
repo_id = "segmind/SSD-1B"
|
||||
|
||||
pipeline = StableDiffusionXLPipeline.from_single_file(ckpt_path, config=repo_id)
|
||||
```
|
||||
|
||||
The model loads the configuration file for the [UNet](https://huggingface.co/segmind/SSD-1B/blob/main/unet/config.json), [VAE](https://huggingface.co/segmind/SSD-1B/blob/main/vae/config.json), and [text encoder](https://huggingface.co/segmind/SSD-1B/blob/main/text_encoder/config.json) from their respective subfolders in the repository.
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="original configuration file">
|
||||
|
||||
The [`~loaders.FromSingleFileMixin.from_single_file`] method can also load the original configuration file of a pipeline that is stored elsewhere. Pass a local path or URL of the original configuration file to the `original_config` parameter.
|
||||
|
||||
```py
|
||||
from diffusers import StableDiffusionXLPipeline
|
||||
|
||||
ckpt_path = "https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/blob/main/sd_xl_base_1.0_0.9vae.safetensors"
|
||||
original_config = "https://raw.githubusercontent.com/Stability-AI/generative-models/main/configs/inference/sd_xl_base.yaml"
|
||||
|
||||
pipeline = StableDiffusionXLPipeline.from_single_file(ckpt_path, original_config=original_config)
|
||||
```
|
||||
|
||||
> [!TIP]
|
||||
> Diffusers attempts to infer the pipeline components based on the type signatures of the pipeline class when you use `original_config` with `local_files_only=True`, instead of fetching the configuration files from the model repository on the Hub. This prevents backward breaking changes in code that can't connect to the internet to fetch the necessary configuration files.
|
||||
>
|
||||
> This is not as reliable as providing a path to a local model repository with the `config` parameter, and might lead to errors during pipeline configuration. To avoid errors, run the pipeline with `local_files_only=False` once to download the appropriate pipeline configuration files to the local cache.
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
While the configuration files specify the pipeline or models default parameters, you can override them by providing the parameters directly to the [`~loaders.FromSingleFileMixin.from_single_file`] method. Any parameter supported by the model or pipeline class can be configured in this way.
|
||||
|
||||
<hfoptions id="override">
|
||||
<hfoption id="pipeline">
|
||||
|
||||
For example, to scale the image latents in [`StableDiffusionXLInstructPix2PixPipeline`] pass the `is_cosxl_edit` parameter.
|
||||
|
||||
```python
|
||||
from diffusers import StableDiffusionXLInstructPix2PixPipeline
|
||||
|
||||
ckpt_path = "https://huggingface.co/stabilityai/cosxl/blob/main/cosxl_edit.safetensors"
|
||||
pipeline = StableDiffusionXLInstructPix2PixPipeline.from_single_file(ckpt_path, config="diffusers/sdxl-instructpix2pix-768", is_cosxl_edit=True)
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="model">
|
||||
|
||||
For example, to upcast the attention dimensions in a [`UNet2DConditionModel`] pass the `upcast_attention` parameter.
|
||||
|
||||
```python
|
||||
from diffusers import UNet2DConditionModel
|
||||
|
||||
ckpt_path = "https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/blob/main/sd_xl_base_1.0_0.9vae.safetensors"
|
||||
model = UNet2DConditionModel.from_single_file(ckpt_path, upcast_attention=True)
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
### Local files
|
||||
|
||||
In Diffusers>=v0.28.0, the [`~loaders.FromSingleFileMixin.from_single_file`] method attempts to configure a pipeline or model by inferring the model type from the keys in the checkpoint file. The inferred model type is used to determine the appropriate model repository on the Hugging Face Hub to configure the model or pipeline.
|
||||
|
||||
For example, any single file checkpoint based on the Stable Diffusion XL base model will use the [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) model repository to configure the pipeline.
|
||||
|
||||
But if you're working in an environment with restricted internet access, you should download the configuration files with the [`~huggingface_hub.snapshot_download`] function, and the model checkpoint with the [`~huggingface_hub.hf_hub_download`] function. By default, these files are downloaded to the Hugging Face Hub [cache directory](https://huggingface.co/docs/huggingface_hub/en/guides/manage-cache), but you can specify a preferred directory to download the files to with the `local_dir` parameter.
|
||||
|
||||
Pass the configuration and checkpoint paths to the [`~loaders.FromSingleFileMixin.from_single_file`] method to load locally.
|
||||
|
||||
<hfoptions id="local">
|
||||
<hfoption id="Hub cache directory">
|
||||
|
||||
```python
|
||||
from huggingface_hub import hf_hub_download, snapshot_download
|
||||
|
||||
my_local_checkpoint_path = hf_hub_download(
|
||||
repo_id="segmind/SSD-1B",
|
||||
filename="SSD-1B.safetensors"
|
||||
)
|
||||
|
||||
my_local_config_path = snapshot_download(
|
||||
repo_id="segmind/SSD-1B",
|
||||
allowed_patterns=["*.json", "**/*.json", "*.txt", "**/*.txt"]
|
||||
)
|
||||
|
||||
pipeline = StableDiffusionXLPipeline.from_single_file(my_local_checkpoint_path, config=my_local_config_path, local_files_only=True)
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="specific local directory">
|
||||
|
||||
```python
|
||||
from huggingface_hub import hf_hub_download, snapshot_download
|
||||
|
||||
my_local_checkpoint_path = hf_hub_download(
|
||||
repo_id="segmind/SSD-1B",
|
||||
filename="SSD-1B.safetensors"
|
||||
local_dir="my_local_checkpoints"
|
||||
)
|
||||
|
||||
my_local_config_path = snapshot_download(
|
||||
repo_id="segmind/SSD-1B",
|
||||
allowed_patterns=["*.json", "**/*.json", "*.txt", "**/*.txt"]
|
||||
local_dir="my_local_config"
|
||||
)
|
||||
|
||||
pipeline = StableDiffusionXLPipeline.from_single_file(my_local_checkpoint_path, config=my_local_config_path, local_files_only=True)
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
#### Local files without symlink
|
||||
|
||||
> [!TIP]
|
||||
> In huggingface_hub>=v0.23.0, the `local_dir_use_symlinks` argument isn't necessary for the [`~huggingface_hub.hf_hub_download`] and [`~huggingface_hub.snapshot_download`] functions.
|
||||
|
||||
The [`~loaders.FromSingleFileMixin.from_single_file`] method relies on the [huggingface_hub](https://hf.co/docs/huggingface_hub/index) caching mechanism to fetch and store checkpoints and configuration files for models and pipelines. If you're working with a file system that does not support symlinking, you should download the checkpoint file to a local directory first, and disable symlinking with the `local_dir_use_symlink=False` parameter in the [`~huggingface_hub.hf_hub_download`] function and [`~huggingface_hub.snapshot_download`] functions.
|
||||
|
||||
```python
|
||||
from huggingface_hub import hf_hub_download, snapshot_download
|
||||
|
||||
my_local_checkpoint_path = hf_hub_download(
|
||||
repo_id="segmind/SSD-1B",
|
||||
filename="SSD-1B.safetensors"
|
||||
local_dir="my_local_checkpoints",
|
||||
local_dir_use_symlinks=False
|
||||
)
|
||||
print("My local checkpoint: ", my_local_checkpoint_path)
|
||||
|
||||
my_local_config_path = snapshot_download(
|
||||
repo_id="segmind/SSD-1B",
|
||||
allowed_patterns=["*.json", "**/*.json", "*.txt", "**/*.txt"]
|
||||
local_dir_use_symlinks=False,
|
||||
)
|
||||
print("My local config: ", my_local_config_path)
|
||||
|
||||
```
|
||||
|
||||
Then you can pass the local paths to the `pretrained_model_link_or_path` and `config` parameters.
|
||||
|
||||
```python
|
||||
pipeline = StableDiffusionXLPipeline.from_single_file(my_local_checkpoint_path, config=my_local_config_path, local_files_only=True)
|
||||
```
|
||||
|
||||
235
docs/source/en/using-diffusers/scheduler_features.md
Normal file
235
docs/source/en/using-diffusers/scheduler_features.md
Normal file
@@ -0,0 +1,235 @@
|
||||
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Scheduler features
|
||||
|
||||
The scheduler is an important component of any diffusion model because it controls the entire denoising (or sampling) process. There are many types of schedulers, some are optimized for speed and some for quality. With Diffusers, you can modify the scheduler configuration to use custom noise schedules, sigmas, and rescale the noise schedule. Changing these parameters can have profound effects on inference quality and speed.
|
||||
|
||||
This guide will demonstrate how to use these features to improve inference quality.
|
||||
|
||||
> [!TIP]
|
||||
> Diffusers currently only supports the `timesteps` and `sigmas` parameters for a select list of schedulers and pipelines. Feel free to open a [feature request](https://github.com/huggingface/diffusers/issues/new/choose) if you want to extend these parameters to a scheduler and pipeline that does not currently support it!
|
||||
|
||||
## Timestep schedules
|
||||
|
||||
The timestep or noise schedule determines the amount of noise at each sampling step. The scheduler uses this to generate an image with the corresponding amount of noise at each step. The timestep schedule is generated from the scheduler's default configuration, but you can customize the scheduler to use new and optimized sampling schedules that aren't in Diffusers yet.
|
||||
|
||||
For example, [Align Your Steps (AYS)](https://research.nvidia.com/labs/toronto-ai/AlignYourSteps/) is a method for optimizing a sampling schedule to generate a high-quality image in as little as 10 steps. The optimal [10-step schedule](https://github.com/huggingface/diffusers/blob/a7bf77fc284810483f1e60afe34d1d27ad91ce2e/src/diffusers/schedulers/scheduling_utils.py#L51) for Stable Diffusion XL is:
|
||||
|
||||
```py
|
||||
from diffusers.schedulers import AysSchedules
|
||||
|
||||
sampling_schedule = AysSchedules["StableDiffusionXLTimesteps"]
|
||||
print(sampling_schedule)
|
||||
"[999, 845, 730, 587, 443, 310, 193, 116, 53, 13]"
|
||||
```
|
||||
|
||||
You can use the AYS sampling schedule in a pipeline by passing it to the `timesteps` parameter.
|
||||
|
||||
```py
|
||||
pipeline = StableDiffusionXLPipeline.from_pretrained(
|
||||
"SG161222/RealVisXL_V4.0",
|
||||
torch_dtype=torch.float16,
|
||||
variant="fp16",
|
||||
).to("cuda")
|
||||
pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config, algorithm_type="sde-dpmsolver++")
|
||||
|
||||
prompt = "A cinematic shot of a cute little rabbit wearing a jacket and doing a thumbs up"
|
||||
generator = torch.Generator(device="cpu").manual_seed(2487854446)
|
||||
image = pipeline(
|
||||
prompt=prompt,
|
||||
negative_prompt="",
|
||||
generator=generator,
|
||||
timesteps=sampling_schedule,
|
||||
).images[0]
|
||||
```
|
||||
|
||||
<div class="flex gap-4">
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/ays.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">AYS timestep schedule 10 steps</figcaption>
|
||||
</div>
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/10.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">Linearly-spaced timestep schedule 10 steps</figcaption>
|
||||
</div>
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/25.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">Linearly-spaced timestep schedule 25 steps</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
## Timestep spacing
|
||||
|
||||
The way sample steps are selected in the schedule can affect the quality of the generated image, especially with respect to [rescaling the noise schedule](#rescale-noise-schedule), which can enable a model to generate much brighter or darker images. Diffusers provides three timestep spacing methods:
|
||||
|
||||
- `leading` creates evenly spaced steps
|
||||
- `linspace` includes the first and last steps and evenly selects the remaining intermediate steps
|
||||
- `trailing` only includes the last step and evenly selects the remaining intermediate steps starting from the end
|
||||
|
||||
It is recommended to use the `trailing` spacing method because it generates higher quality images with more details when there are fewer sample steps. But the difference in quality is not as obvious for more standard sample step values.
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import StableDiffusionXLPipeline, DPMSolverMultistepScheduler
|
||||
|
||||
pipeline = StableDiffusionXLPipeline.from_pretrained(
|
||||
"SG161222/RealVisXL_V4.0",
|
||||
torch_dtype=torch.float16,
|
||||
variant="fp16",
|
||||
).to("cuda")
|
||||
pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config, timestep_spacing="trailing")
|
||||
|
||||
prompt = "A cinematic shot of a cute little black cat sitting on a pumpkin at night"
|
||||
generator = torch.Generator(device="cpu").manual_seed(2487854446)
|
||||
image = pipeline(
|
||||
prompt=prompt,
|
||||
negative_prompt="",
|
||||
generator=generator,
|
||||
num_inference_steps=5,
|
||||
).images[0]
|
||||
image
|
||||
```
|
||||
|
||||
<div class="flex gap-4">
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/stevhliu/testing-images/resolve/main/trailing_spacing.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">trailing spacing after 5 steps</figcaption>
|
||||
</div>
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/stevhliu/testing-images/resolve/main/leading_spacing.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">leading spacing after 5 steps</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
## Sigmas
|
||||
|
||||
The `sigmas` parameter is the amount of noise added at each timestep according to the timestep schedule. Like the `timesteps` parameter, you can customize the `sigmas` parameter to control how much noise is added at each step. When you use a custom `sigmas` value, the `timesteps` are calculated from the custom `sigmas` value and the default scheduler configuration is ignored.
|
||||
|
||||
For example, you can manually pass the [sigmas](https://github.com/huggingface/diffusers/blob/6529ee67ec02fcf58d2fd9242164ea002b351d75/src/diffusers/schedulers/scheduling_utils.py#L55) for something like the 10-step AYS schedule from before to the pipeline.
|
||||
|
||||
```py
|
||||
import torch
|
||||
|
||||
from diffusers import DiffusionPipeline, EulerDiscreteScheduler
|
||||
|
||||
model_id = "stabilityai/stable-diffusion-xl-base-1.0"
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0",
|
||||
torch_dtype=torch.float16,
|
||||
variant="fp16",
|
||||
).to("cuda")
|
||||
pipeline.scheduler = EulerDiscreteScheduler.from_config(pipeline.scheduler.config)
|
||||
|
||||
sigmas = [14.615, 6.315, 3.771, 2.181, 1.342, 0.862, 0.555, 0.380, 0.234, 0.113, 0.0]
|
||||
prompt = "anthropomorphic capybara wearing a suit and working with a computer"
|
||||
generator = torch.Generator(device='cuda').manual_seed(123)
|
||||
image = pipeline(
|
||||
prompt=prompt,
|
||||
num_inference_steps=10,
|
||||
sigmas=sigmas,
|
||||
generator=generator
|
||||
).images[0]
|
||||
```
|
||||
|
||||
When you take a look at the scheduler's `timesteps` parameter, you'll see that it is the same as the AYS timestep schedule because the `timestep` schedule is calculated from the `sigmas`.
|
||||
|
||||
```py
|
||||
print(f" timesteps: {pipe.scheduler.timesteps}")
|
||||
"timesteps: tensor([999., 845., 730., 587., 443., 310., 193., 116., 53., 13.], device='cuda:0')"
|
||||
```
|
||||
|
||||
### Karras sigmas
|
||||
|
||||
> [!TIP]
|
||||
> Refer to the scheduler API [overview](../api/schedulers/overview) for a list of schedulers that support Karras sigmas.
|
||||
>
|
||||
> Karras sigmas should not be used for models that weren't trained with them. For example, the base Stable Diffusion XL model shouldn't use Karras sigmas but the [DreamShaperXL](https://hf.co/Lykon/dreamshaper-xl-1-0) model can since they are trained with Karras sigmas.
|
||||
|
||||
Karras scheduler's use the timestep schedule and sigmas from the [Elucidating the Design Space of Diffusion-Based Generative Models](https://hf.co/papers/2206.00364) paper. This scheduler variant applies a smaller amount of noise per step as it approaches the end of the sampling process compared to other schedulers, and can increase the level of details in the generated image.
|
||||
|
||||
Enable Karras sigmas by setting `use_karras_sigmas=True` in the scheduler.
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import StableDiffusionXLPipeline, DPMSolverMultistepScheduler
|
||||
|
||||
pipeline = StableDiffusionXLPipeline.from_pretrained(
|
||||
"SG161222/RealVisXL_V4.0",
|
||||
torch_dtype=torch.float16,
|
||||
variant="fp16",
|
||||
).to("cuda")
|
||||
pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config, algorithm_type="sde-dpmsolver++", use_karras_sigmas=True)
|
||||
|
||||
prompt = "A cinematic shot of a cute little rabbit wearing a jacket and doing a thumbs up"
|
||||
generator = torch.Generator(device="cpu").manual_seed(2487854446)
|
||||
image = pipeline(
|
||||
prompt=prompt,
|
||||
negative_prompt="",
|
||||
generator=generator,
|
||||
).images[0]
|
||||
```
|
||||
|
||||
<div class="flex gap-4">
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/stevhliu/testing-images/resolve/main/karras_sigmas_true.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">Karras sigmas enabled</figcaption>
|
||||
</div>
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/stevhliu/testing-images/resolve/main/karras_sigmas_false.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">Karras sigmas disabled</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
## Rescale noise schedule
|
||||
|
||||
In the [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://hf.co/papers/2305.08891) paper, the authors discovered that common noise schedules allowed some signal to leak into the last timestep. This signal leakage at inference can cause models to only generate images with medium brightness. By enforcing a zero signal-to-noise ratio (SNR) for the timstep schedule and sampling from the last timestep, the model can be improved to generate very bright or dark images.
|
||||
|
||||
> [!TIP]
|
||||
> For inference, you need a model that has been trained with *v_prediction*. To train your own model with *v_prediction*, add the following flag to the [train_text_to_image.py](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image.py) or [train_text_to_image_lora.py](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image_lora.py) scripts.
|
||||
>
|
||||
> ```bash
|
||||
> --prediction_type="v_prediction"
|
||||
> ```
|
||||
|
||||
For example, load the [ptx0/pseudo-journey-v2](https://hf.co/ptx0/pseudo-journey-v2) checkpoint which was trained with `v_prediction` and the [`DDIMScheduler`]. Configure the following parameters in the [`DDIMScheduler`]:
|
||||
|
||||
* `rescale_betas_zero_snr=True` to rescale the noise schedule to zero SNR
|
||||
* `timestep_spacing="trailing"` to start sampling from the last timestep
|
||||
|
||||
Set `guidance_rescale` in the pipeline to prevent over-exposure. A lower value increases brightness but some of the details may appear washed out.
|
||||
|
||||
```py
|
||||
from diffusers import DiffusionPipeline, DDIMScheduler
|
||||
|
||||
pipeline = DiffusionPipeline.from_pretrained("ptx0/pseudo-journey-v2", use_safetensors=True)
|
||||
|
||||
pipeline.scheduler = DDIMScheduler.from_config(
|
||||
pipeline.scheduler.config, rescale_betas_zero_snr=True, timestep_spacing="trailing"
|
||||
)
|
||||
pipeline.to("cuda")
|
||||
prompt = "cinematic photo of a snowy mountain at night with the northern lights aurora borealis overhead, 35mm photograph, film, professional, 4k, highly detailed"
|
||||
generator = torch.Generator(device="cpu").manual_seed(23)
|
||||
image = pipeline(prompt, guidance_rescale=0.7, generator=generator).images[0]
|
||||
image
|
||||
```
|
||||
|
||||
<div class="flex gap-4">
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/no-zero-snr.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">default Stable Diffusion v2-1 image</figcaption>
|
||||
</div>
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/zero-snr.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">image with zero SNR and trailing timestep spacing enabled</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
@@ -212,62 +212,6 @@ images = pipeline(prompt_ids, params, prng_seed, num_inference_steps, jit=True).
|
||||
images = pipeline.numpy_to_pil(np.asarray(images.reshape((num_samples,) + images.shape[-3:])))
|
||||
```
|
||||
|
||||
## Custom Timestep Schedules
|
||||
|
||||
With all our schedulers, you can choose one of the popular timestep schedules using configurations such as `timestep_spacing`, `interpolation_type`, and `use_karras_sigmas`. Some schedulers also provide the flexibility to use a custom timestep schedule. You can use any list of arbitrary timesteps, we will use the AYS timestep schedule here as example. It is a set of 10-step optimized timestep schedules released by researchers from Nvidia that can achieve significantly better quality compared to the preset timestep schedules. You can read more about their research [here](https://research.nvidia.com/labs/toronto-ai/AlignYourSteps/).
|
||||
|
||||
```python
|
||||
from diffusers.schedulers import AysSchedules
|
||||
sampling_schedule = AysSchedules["StableDiffusionXLTimesteps"]
|
||||
print(sampling_schedule)
|
||||
```
|
||||
```
|
||||
[999, 845, 730, 587, 443, 310, 193, 116, 53, 13]
|
||||
```
|
||||
|
||||
You can then create a pipeline and pass this custom timestep schedule to it as `timesteps`.
|
||||
|
||||
```python
|
||||
pipe = StableDiffusionXLPipeline.from_pretrained(
|
||||
"SG161222/RealVisXL_V4.0",
|
||||
torch_dtype=torch.float16,
|
||||
variant="fp16",
|
||||
).to("cuda")
|
||||
|
||||
pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config, algorithm_type="sde-dpmsolver++")
|
||||
|
||||
prompt = "A cinematic shot of a cute little rabbit wearing a jacket and doing a thumbs up"
|
||||
|
||||
generator = torch.Generator(device="cpu").manual_seed(2487854446)
|
||||
|
||||
image = pipe(
|
||||
prompt=prompt,
|
||||
negative_prompt="",
|
||||
generator=generator,
|
||||
timesteps=sampling_schedule,
|
||||
).images[0]
|
||||
```
|
||||
The generated image has better quality than the default linear timestep schedule for the same number of steps, and it is similar to the default timestep scheduler when running for 25 steps.
|
||||
|
||||
<div class="flex gap-4">
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/ays.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">AYS timestep schedule 10 steps</figcaption>
|
||||
</div>
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/10.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">Linearly-spaced timestep schedule 10 steps</figcaption>
|
||||
</div>
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/25.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">Linearly-spaced timestep schedule 25 steps</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
> [!TIP]
|
||||
> 🤗 Diffusers currently only supports `timesteps` and `sigmas` for a selected list of schedulers and pipelines, but feel free to open a [feature request](https://github.com/huggingface/diffusers/issues/new/choose) if you want to extend feature to a scheduler and pipeline that does not currently support it!
|
||||
|
||||
|
||||
## Models
|
||||
|
||||
Models are loaded from the [`ModelMixin.from_pretrained`] method, which downloads and caches the latest version of the model weights and configurations. If the latest files are available in the local cache, [`~ModelMixin.from_pretrained`] reuses files in the cache instead of re-downloading them.
|
||||
|
||||
@@ -1,84 +0,0 @@
|
||||
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Load safetensors
|
||||
|
||||
[[open-in-colab]]
|
||||
|
||||
[safetensors](https://github.com/huggingface/safetensors) is a safe and fast file format for storing and loading tensors. Typically, PyTorch model weights are saved or *pickled* into a `.bin` file with Python's [`pickle`](https://docs.python.org/3/library/pickle.html) utility. However, `pickle` is not secure and pickled files may contain malicious code that can be executed. safetensors is a secure alternative to `pickle`, making it ideal for sharing model weights.
|
||||
|
||||
This guide will show you how you load `.safetensor` files, and how to convert Stable Diffusion model weights stored in other formats to `.safetensor`. Before you start, make sure you have safetensors installed:
|
||||
|
||||
```py
|
||||
# uncomment to install the necessary libraries in Colab
|
||||
#!pip install safetensors
|
||||
```
|
||||
|
||||
If you look at the [`runwayml/stable-diffusion-v1-5`](https://huggingface.co/runwayml/stable-diffusion-v1-5/tree/main) repository, you'll see weights inside the `text_encoder`, `unet` and `vae` subfolders are stored in the `.safetensors` format. By default, 🤗 Diffusers automatically loads these `.safetensors` files from their subfolders if they're available in the model repository.
|
||||
|
||||
For more explicit control, you can optionally set `use_safetensors=True` (if `safetensors` is not installed, you'll get an error message asking you to install it):
|
||||
|
||||
```py
|
||||
from diffusers import DiffusionPipeline
|
||||
|
||||
pipeline = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", use_safetensors=True)
|
||||
```
|
||||
|
||||
However, model weights are not necessarily stored in separate subfolders like in the example above. Sometimes, all the weights are stored in a single `.safetensors` file. In this case, if the weights are Stable Diffusion weights, you can load the file directly with the [`~diffusers.loaders.FromSingleFileMixin.from_single_file`] method:
|
||||
|
||||
```py
|
||||
from diffusers import StableDiffusionPipeline
|
||||
|
||||
pipeline = StableDiffusionPipeline.from_single_file(
|
||||
"https://huggingface.co/WarriorMama777/OrangeMixs/blob/main/Models/AbyssOrangeMix/AbyssOrangeMix.safetensors"
|
||||
)
|
||||
```
|
||||
|
||||
## Convert to safetensors
|
||||
|
||||
Not all weights on the Hub are available in the `.safetensors` format, and you may encounter weights stored as `.bin`. In this case, use the [Convert Space](https://huggingface.co/spaces/diffusers/convert) to convert the weights to `.safetensors`. The Convert Space downloads the pickled weights, converts them, and opens a Pull Request to upload the newly converted `.safetensors` file on the Hub. This way, if there is any malicious code contained in the pickled files, they're uploaded to the Hub - which has a [security scanner](https://huggingface.co/docs/hub/security-pickle#hubs-security-scanner) to detect unsafe files and suspicious pickle imports - instead of your computer.
|
||||
|
||||
You can use the model with the new `.safetensors` weights by specifying the reference to the Pull Request in the `revision` parameter (you can also test it in this [Check PR](https://huggingface.co/spaces/diffusers/check_pr) Space on the Hub), for example `refs/pr/22`:
|
||||
|
||||
```py
|
||||
from diffusers import DiffusionPipeline
|
||||
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
"stabilityai/stable-diffusion-2-1", revision="refs/pr/22", use_safetensors=True
|
||||
)
|
||||
```
|
||||
|
||||
## Why use safetensors?
|
||||
|
||||
There are several reasons for using safetensors:
|
||||
|
||||
- Safety is the number one reason for using safetensors. As open-source and model distribution grows, it is important to be able to trust the model weights you downloaded don't contain any malicious code. The current size of the header in safetensors prevents parsing extremely large JSON files.
|
||||
- Loading speed between switching models is another reason to use safetensors, which performs zero-copy of the tensors. It is especially fast compared to `pickle` if you're loading the weights to CPU (the default case), and just as fast if not faster when directly loading the weights to GPU. You'll only notice the performance difference if the model is already loaded, and not if you're downloading the weights or loading the model for the first time.
|
||||
|
||||
The time it takes to load the entire pipeline:
|
||||
|
||||
```py
|
||||
from diffusers import StableDiffusionPipeline
|
||||
|
||||
pipeline = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1", use_safetensors=True)
|
||||
"Loaded in safetensors 0:00:02.033658"
|
||||
"Loaded in PyTorch 0:00:02.663379"
|
||||
```
|
||||
|
||||
But the actual time it takes to load 500MB of the model weights is only:
|
||||
|
||||
```bash
|
||||
safetensors: 3.4873ms
|
||||
PyTorch: 172.7537ms
|
||||
```
|
||||
|
||||
- Lazy loading is also supported in safetensors, which is useful in distributed settings to only load some of the tensors. This format allowed the [BLOOM](https://huggingface.co/bigscience/bloom) model to be loaded in 45 seconds on 8 GPUs instead of 10 minutes with regular PyTorch weights.
|
||||
@@ -117,9 +117,15 @@
|
||||
title: Token Merging
|
||||
title: 최적화/특수 하드웨어
|
||||
- sections:
|
||||
- local: conceptual/philosophy
|
||||
title: 철학
|
||||
- local: using-diffusers/controlling_generation
|
||||
title: 제어된 생성
|
||||
- local: in_translation
|
||||
- local: conceptual/contribution
|
||||
title: 어떻게 기여하나요?
|
||||
- local: conceptual/ethical_guidelines
|
||||
title: Diffusers의 윤리적 가이드라인
|
||||
- local: conceptual/evaluation
|
||||
title: Diffusion Models 평가하기
|
||||
title: 개념 가이드
|
||||
- sections:
|
||||
|
||||
@@ -34,7 +34,7 @@ Stable Diffusion XL은 Dustin Podell, Zion English, Kyle Lacey, Andreas Blattman
|
||||
SDXL을 사용하기 전에 `transformers`, `accelerate`, `safetensors` 와 `invisible_watermark`를 설치하세요.
|
||||
다음과 같이 라이브러리를 설치할 수 있습니다:
|
||||
|
||||
```
|
||||
```sh
|
||||
pip install transformers
|
||||
pip install accelerate
|
||||
pip install safetensors
|
||||
@@ -46,7 +46,7 @@ pip install invisible-watermark>=0.2.0
|
||||
Stable Diffusion XL로 이미지를 생성할 때 워터마크가 보이지 않도록 추가하는 것을 권장하는데, 이는 다운스트림(downstream) 어플리케이션에서 기계에 합성되었는지를 식별하는데 도움을 줄 수 있습니다. 그렇게 하려면 [invisible_watermark 라이브러리](https://pypi.org/project/invisible-watermark/)를 통해 설치해주세요:
|
||||
|
||||
|
||||
```
|
||||
```sh
|
||||
pip install invisible-watermark>=0.2.0
|
||||
```
|
||||
|
||||
@@ -75,11 +75,11 @@ prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
|
||||
image = pipe(prompt=prompt).images[0]
|
||||
```
|
||||
|
||||
### Image-to-image
|
||||
### Image-to-image
|
||||
|
||||
*image-to-image*를 위해 다음과 같이 SDXL을 사용할 수 있습니다:
|
||||
|
||||
```py
|
||||
```py
|
||||
import torch
|
||||
from diffusers import StableDiffusionXLImg2ImgPipeline
|
||||
from diffusers.utils import load_image
|
||||
@@ -99,7 +99,7 @@ image = pipe(prompt, image=init_image).images[0]
|
||||
|
||||
*inpainting*를 위해 다음과 같이 SDXL을 사용할 수 있습니다:
|
||||
|
||||
```py
|
||||
```py
|
||||
import torch
|
||||
from diffusers import StableDiffusionXLInpaintPipeline
|
||||
from diffusers.utils import load_image
|
||||
@@ -352,7 +352,7 @@ out-of-memory 에러가 난다면, [`StableDiffusionXLPipeline.enable_model_cpu_
|
||||
|
||||
**참고** Stable Diffusion XL을 `torch`가 2.0 버전 미만에서 실행시키고 싶을 때, xformers 어텐션을 사용해주세요:
|
||||
|
||||
```
|
||||
```sh
|
||||
pip install xformers
|
||||
```
|
||||
|
||||
|
||||
512
docs/source/ko/conceptual/contribution.md
Normal file
512
docs/source/ko/conceptual/contribution.md
Normal file
@@ -0,0 +1,512 @@
|
||||
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Diffusers에 기여하는 방법 🧨
|
||||
|
||||
오픈 소스 커뮤니티에서의 기여를 환영합니다! 누구나 참여할 수 있으며, 코드뿐만 아니라 질문에 답변하거나 문서를 개선하는 등 모든 유형의 참여가 가치 있고 감사히 여겨집니다. 질문에 답변하고 다른 사람들을 도와주며 소통하고 문서를 개선하는 것은 모두 커뮤니티에게 큰 도움이 됩니다. 따라서 관심이 있다면 두려워하지 말고 참여해보세요!
|
||||
|
||||
누구나 우리의 공개 Discord 채널에서 👋 인사하며 시작할 수 있도록 장려합니다. 우리는 diffusion 모델의 최신 동향을 논의하고 질문을 하며 개인 프로젝트를 자랑하고 기여에 대해 서로 도와주거나 그냥 어울리기 위해 모이는 곳입니다☕. <a href="https://Discord.gg/G7tWnz98XR"><img alt="Join us on Discord" src="https://img.shields.io/discord/823813159592001537?color=5865F2&logo=discord&logoColor=white"></a>
|
||||
|
||||
어떤 방식으로든 기여하려는 경우, 우리는 개방적이고 환영하며 친근한 커뮤니티의 일부가 되기 위해 노력하고 있습니다. 우리의 [행동 강령](https://github.com/huggingface/diffusers/blob/main/CODE_OF_CONDUCT.md)을 읽고 상호 작용 중에 이를 존중하도록 주의해주시기 바랍니다. 또한 프로젝트를 안내하는 [윤리 지침](https://huggingface.co/docs/diffusers/conceptual/ethical_guidelines)에 익숙해지고 동일한 투명성과 책임성의 원칙을 준수해주시기를 부탁드립니다.
|
||||
|
||||
우리는 커뮤니티로부터의 피드백을 매우 중요하게 생각하므로, 라이브러리를 개선하는 데 도움이 될 가치 있는 피드백이 있다고 생각되면 망설이지 말고 의견을 제시해주세요 - 모든 메시지, 댓글, 이슈, 풀 리퀘스트(PR)는 읽히고 고려됩니다.
|
||||
|
||||
## 개요
|
||||
|
||||
이슈에 있는 질문에 답변하는 것에서부터 코어 라이브러리에 새로운 diffusion 모델을 추가하는 것까지 다양한 방법으로 기여를 할 수 있습니다.
|
||||
|
||||
이어지는 부분에서 우리는 다양한 방법의 기여에 대한 개요를 난이도에 따라 오름차순으로 정리하였습니다. 모든 기여는 커뮤니티에게 가치가 있습니다.
|
||||
|
||||
1. [Diffusers 토론 포럼](https://discuss.huggingface.co/c/discussion-related-to-httpsgithubcomhuggingfacediffusers)이나 [Discord](https://discord.gg/G7tWnz98XR)에서 질문에 대답하거나 질문을 할 수 있습니다.
|
||||
2. [GitHub Issues 탭](https://github.com/huggingface/diffusers/issues/new/choose)에서 새로운 이슈를 열 수 있습니다.
|
||||
3. [GitHub Issues 탭](https://github.com/huggingface/diffusers/issues)에서 이슈에 대답할 수 있습니다.
|
||||
4. "Good first issue" 라벨이 지정된 간단한 이슈를 수정할 수 있습니다. [여기](https://github.com/huggingface/diffusers/issues?q=is%3Aopen+is%3Aissue+label%3A%22good+first+issue%22)를 참조하세요.
|
||||
5. [문서](https://github.com/huggingface/diffusers/tree/main/docs/source)에 기여할 수 있습니다.
|
||||
6. [Community Pipeline](https://github.com/huggingface/diffusers/issues?q=is%3Aopen+is%3Aissue+label%3Acommunity-examples)에 기여할 수 있습니다.
|
||||
7. [예제](https://github.com/huggingface/diffusers/tree/main/examples)에 기여할 수 있습니다.
|
||||
8. "Good second issue" 라벨이 지정된 어려운 이슈를 수정할 수 있습니다. [여기](https://github.com/huggingface/diffusers/issues?q=is%3Aopen+is%3Aissue+label%3A%22Good+second+issue%22)를 참조하세요.
|
||||
9. 새로운 파이프라인, 모델 또는 스케줄러를 추가할 수 있습니다. ["새로운 파이프라인/모델"](https://github.com/huggingface/diffusers/issues?q=is%3Aopen+is%3Aissue+label%3A%22New+pipeline%2Fmodel%22) 및 ["새로운 스케줄러"](https://github.com/huggingface/diffusers/issues?q=is%3Aopen+is%3Aissue+label%3A%22New+scheduler%22) 이슈를 참조하세요. 이 기여에 대해서는 [디자인 철학](https://github.com/huggingface/diffusers/blob/main/PHILOSOPHY.md)을 확인해주세요.
|
||||
|
||||
앞서 말한 대로, **모든 기여는 커뮤니티에게 가치가 있습니다**. 이어지는 부분에서 각 기여에 대해 조금 더 자세히 설명하겠습니다.
|
||||
|
||||
4부터 9까지의 모든 기여에는 PR을 열어야 합니다. [PR을 열기](#how-to-open-a-pr)에서 자세히 설명되어 있습니다.
|
||||
|
||||
### 1. Diffusers 토론 포럼이나 Diffusers Discord에서 질문하고 답변하기
|
||||
|
||||
Diffusers 라이브러리와 관련된 모든 질문이나 의견은 [토론 포럼](https://discuss.huggingface.co/c/discussion-related-to-httpsgithubcomhuggingfacediffusers/63)이나 [Discord](https://discord.gg/G7tWnz98XR)에서 할 수 있습니다. 이러한 질문과 의견에는 다음과 같은 내용이 포함됩니다(하지만 이에 국한되지는 않습니다):
|
||||
- 지식을 공유하기 위해서 훈련 또는 추론 실험에 대한 결과 보고
|
||||
- 개인 프로젝트 소개
|
||||
- 비공식 훈련 예제에 대한 질문
|
||||
- 프로젝트 제안
|
||||
- 일반적인 피드백
|
||||
- 논문 요약
|
||||
- Diffusers 라이브러리를 기반으로 하는 개인 프로젝트에 대한 도움 요청
|
||||
- 일반적인 질문
|
||||
- Diffusion 모델에 대한 윤리적 질문
|
||||
- ...
|
||||
|
||||
포럼이나 Discord에서 질문을 하면 커뮤니티가 지식을 공개적으로 공유하도록 장려되며, 미래에 동일한 질문을 가진 초보자에게도 도움이 될 수 있습니다. 따라서 궁금한 질문은 언제든지 하시기 바랍니다.
|
||||
또한, 이러한 질문에 답변하는 것은 커뮤니티에게 매우 큰 도움이 됩니다. 왜냐하면 이렇게 하면 모두가 학습할 수 있는 공개적인 지식을 문서화하기 때문입니다.
|
||||
|
||||
**주의**하십시오. 질문이나 답변에 투자하는 노력이 많을수록 공개적으로 문서화된 지식의 품질이 높아집니다. 마찬가지로, 잘 정의되고 잘 답변된 질문은 모두에게 접근 가능한 고품질 지식 데이터베이스를 만들어줍니다. 반면에 잘못된 질문이나 답변은 공개 지식 데이터베이스의 전반적인 품질을 낮출 수 있습니다.
|
||||
간단히 말해서, 고품질의 질문이나 답변은 *명확하고 간결하며 관련성이 있으며 이해하기 쉽고 접근 가능하며 잘 형식화되어 있어야* 합니다. 자세한 내용은 [좋은 이슈 작성 방법](#how-to-write-a-good-issue) 섹션을 참조하십시오.
|
||||
|
||||
**채널에 대한 참고사항**:
|
||||
[*포럼*](https://discuss.huggingface.co/c/discussion-related-to-httpsgithubcomhuggingfacediffusers/63)은 구글과 같은 검색 엔진에서 더 잘 색인화됩니다. 게시물은 인기에 따라 순위가 매겨지며, 시간순으로 정렬되지 않습니다. 따라서 이전에 게시한 질문과 답변을 쉽게 찾을 수 있습니다.
|
||||
또한, 포럼에 게시된 질문과 답변은 쉽게 링크할 수 있습니다.
|
||||
반면 *Discord*는 채팅 형식으로 되어 있어 빠른 대화를 유도합니다.
|
||||
질문에 대한 답변을 빠르게 받을 수는 있겠지만, 시간이 지나면 질문이 더 이상 보이지 않습니다. 또한, Discord에서 이전에 게시된 정보를 찾는 것은 훨씬 어렵습니다. 따라서 포럼을 사용하여 고품질의 질문과 답변을 하여 커뮤니티를 위한 오래 지속되는 지식을 만들기를 권장합니다. Discord에서의 토론이 매우 흥미로운 답변과 결론을 이끌어내는 경우, 해당 정보를 포럼에 게시하여 미래 독자들에게 더 쉽게 액세스할 수 있도록 권장합니다.
|
||||
|
||||
### 2. GitHub 이슈 탭에서 새로운 이슈 열기
|
||||
|
||||
🧨 Diffusers 라이브러리는 사용자들이 마주치는 문제를 알려주는 덕분에 견고하고 신뢰할 수 있습니다. 따라서 이슈를 보고해주셔서 감사합니다.
|
||||
|
||||
기억해주세요, GitHub 이슈는 Diffusers 라이브러리와 직접적으로 관련된 기술적인 질문, 버그 리포트, 기능 요청 또는 라이브러리 디자인에 대한 피드백에 사용됩니다.
|
||||
|
||||
간단히 말해서, Diffusers 라이브러리의 **코드와 관련되지 않은** 모든 것(문서 포함)은 GitHub가 아닌 [포럼](https://discuss.huggingface.co/c/discussion-related-to-httpsgithubcomhuggingfacediffusers/63)이나 [Discord](https://discord.gg/G7tWnz98XR)에서 질문해야 합니다.
|
||||
|
||||
**새로운 이슈를 열 때 다음 가이드라인을 고려해주세요**:
|
||||
- 이미 같은 이슈가 있는지 검색했는지 확인해주세요(GitHub의 이슈 탭에서 검색 기능을 사용하세요).
|
||||
- 다른(관련된) 이슈에 새로운 이슈를 보고하지 말아주세요. 다른 이슈와 관련이 높다면, 새로운 이슈를 열고 관련 이슈에 링크를 걸어주세요.
|
||||
- 이슈를 영어로 작성해주세요. 영어에 익숙하지 않다면, [DeepL](https://www.deepl.com/translator)과 같은 뛰어난 무료 온라인 번역 서비스를 사용하여 모국어에서 영어로 번역해주세요.
|
||||
- 이슈가 최신 Diffusers 버전으로 업데이트하면 해결될 수 있는지 확인해주세요. 이슈를 게시하기 전에 `python -c "import diffusers; print(diffusers.__version__)"` 명령을 실행하여 현재 사용 중인 Diffusers 버전이 최신 버전과 일치하거나 더 높은지 확인해주세요.
|
||||
- 새로운 이슈를 열 때 투자하는 노력이 많을수록 답변의 품질이 높아지고 Diffusers 이슈 전체의 품질도 향상됩니다.
|
||||
|
||||
#### 2.1 재현가능하고 최소한인 버그 리포트
|
||||
|
||||
새로운 이슈는 일반적으로 다음과 같은 내용을 포함합니다.
|
||||
|
||||
버그 보고서는 항상 재현 가능한 코드 조각을 포함하고 가능한 한 최소한이어야 하며 간결해야 합니다.
|
||||
자세히 말하면:
|
||||
- 버그를 가능한 한 좁혀야 합니다. **전체 코드 파일을 그냥 던지지 마세요**.
|
||||
- 코드의 서식을 지정해야 합니다.
|
||||
- Diffusers가 의존하는 외부 라이브러리를 제외한 다른 외부 라이브러리는 포함하지 마십시오.
|
||||
- **반드시** 환경에 대한 모든 필요한 정보를 제공해야 합니다. 이를 위해 쉘에서 `diffusers-cli env`를 실행하고 표시된 정보를 이슈에 복사하여 붙여넣을 수 있습니다.
|
||||
- 이슈를 설명해야 합니다. 독자가 문제가 무엇이며 왜 문제인지 모르면 해결할 수 없습니다.
|
||||
- **항상** 독자가 가능한 한 적은 노력으로 문제를 재현할 수 있도록 해야 합니다. 코드 조각이 라이브러리가 없거나 정의되지 않은 변수 때문에 실행되지 않는 경우 독자가 도움을 줄 수 없습니다. 재현 가능한 코드 조각이 가능한 한 최소화되고 간단한 Python 셸에 복사하여 붙여넣을 수 있도록 해야 합니다.
|
||||
- 문제를 재현하기 위해 모델과/또는 데이터셋이 필요한 경우 독자가 해당 모델이나 데이터셋에 접근할 수 있도록 해야 합니다. 모델이나 데이터셋을 [Hub](https://huggingface.co)에 업로드하여 쉽게 다운로드할 수 있도록 할 수 있습니다. 문제 재현을 가능한 한 쉽게하기 위해 모델과 데이터셋을 가능한 한 작게 유지하려고 노력하세요.
|
||||
|
||||
자세한 내용은 [좋은 이슈 작성 방법](#how-to-write-a-good-issue) 섹션을 참조하세요.
|
||||
|
||||
버그 보고서를 열려면 [여기](https://github.com/huggingface/diffusers/issues/new?assignees=&labels=bug&projects=&template=bug-report.yml)를 클릭하세요.
|
||||
|
||||
|
||||
#### 2.2. 기능 요청
|
||||
|
||||
세계적인 기능 요청은 다음 사항을 다룹니다:
|
||||
|
||||
1. 먼저 동기부여:
|
||||
* 라이브러리와 관련된 문제/불만이 있는가요? 그렇다면 왜 그런지 설명해주세요. 문제를 보여주는 코드 조각을 제공하는 것이 가장 좋습니다.
|
||||
* 프로젝트에 필요한 기능인가요? 우리는 그에 대해 듣고 싶습니다!
|
||||
* 커뮤니티에 도움이 될 수 있는 것을 작업했고 그것에 대해 생각하고 있는가요? 멋지네요! 어떤 문제를 해결했는지 알려주세요.
|
||||
2. 기능을 *상세히 설명하는* 문단을 작성해주세요;
|
||||
3. 미래 사용을 보여주는 **코드 조각**을 제공해주세요;
|
||||
4. 이것이 논문과 관련된 경우 링크를 첨부해주세요;
|
||||
5. 도움이 될 수 있는 추가 정보(그림, 스크린샷 등)를 첨부해주세요.
|
||||
|
||||
기능 요청은 [여기](https://github.com/huggingface/diffusers/issues/new?assignees=&labels=&template=feature_request.md&title=)에서 열 수 있습니다.
|
||||
|
||||
#### 2.3 피드백
|
||||
|
||||
라이브러리 디자인과 그것이 왜 좋은지 또는 나쁜지에 대한 이유에 대한 피드백은 핵심 메인테이너가 사용자 친화적인 라이브러리를 만드는 데 엄청난 도움이 됩니다. 현재 디자인 철학을 이해하려면 [여기](https://huggingface.co/docs/diffusers/conceptual/philosophy)를 참조해 주세요. 특정 디자인 선택이 현재 디자인 철학과 맞지 않는다고 생각되면, 그 이유와 어떻게 변경되어야 하는지 설명해 주세요. 반대로 특정 디자인 선택이 디자인 철학을 너무 따르기 때문에 사용 사례를 제한한다고 생각되면, 그 이유와 어떻게 변경되어야 하는지 설명해 주세요. 특정 디자인 선택이 매우 유용하다고 생각되면, 미래의 디자인 결정에 큰 도움이 되므로 이에 대한 의견을 남겨 주세요.
|
||||
|
||||
피드백에 관한 이슈는 [여기](https://github.com/huggingface/diffusers/issues/new?assignees=&labels=&template=feedback.md&title=)에서 열 수 있습니다.
|
||||
|
||||
#### 2.4 기술적인 질문
|
||||
|
||||
기술적인 질문은 주로 라이브러리의 특정 코드가 왜 특정 방식으로 작성되었는지 또는 코드의 특정 부분이 무엇을 하는지에 대한 질문입니다. 질문하신 코드 부분에 대한 링크를 제공하고 해당 코드 부분이 이해하기 어려운 이유에 대한 자세한 설명을 해주시기 바랍니다.
|
||||
|
||||
기술적인 질문에 관한 이슈를 [여기](https://github.com/huggingface/diffusers/issues/new?assignees=&labels=bug&template=bug-report.yml)에서 열 수 있습니다.
|
||||
|
||||
#### 2.5 새로운 모델, 스케줄러 또는 파이프라인 추가 제안
|
||||
|
||||
만약 diffusion 모델 커뮤니티에서 Diffusers 라이브러리에 추가하고 싶은 새로운 모델, 파이프라인 또는 스케줄러가 있다면, 다음 정보를 제공해주세요:
|
||||
|
||||
* Diffusion 파이프라인, 모델 또는 스케줄러에 대한 간단한 설명과 논문 또는 공개된 버전의 링크
|
||||
* 해당 모델의 오픈 소스 구현에 대한 링크
|
||||
* 모델 가중치가 있는 경우, 가중치의 링크
|
||||
|
||||
모델에 직접 기여하고자 하는 경우, 최선의 안내를 위해 우리에게 알려주세요. 또한, 가능하다면 구성 요소(모델, 스케줄러, 파이프라인 등)의 원래 저자를 GitHub 핸들로 태그하는 것을 잊지 마세요.
|
||||
|
||||
모델/파이프라인/스케줄러에 대한 요청을 [여기](https://github.com/huggingface/diffusers/issues/new?assignees=&labels=New+model%2Fpipeline%2Fscheduler&template=new-model-addition.yml)에서 열 수 있습니다.
|
||||
|
||||
### 3. GitHub 이슈 탭에서 문제에 대한 답변하기
|
||||
|
||||
GitHub에서 이슈에 대한 답변을 하기 위해서는 Diffusers에 대한 기술적인 지식이 필요할 수 있지만, 정확한 답변이 아니더라도 모두가 시도해기를 권장합니다. 이슈에 대한 고품질 답변을 제공하기 위한 몇 가지 팁:
|
||||
- 가능한 한 간결하고 최소한으로 유지합니다.
|
||||
- 주제에 집중합니다. 이슈에 대한 답변은 해당 이슈에 관련된 내용에만 집중해야 합니다.
|
||||
- 코드, 논문 또는 다른 소스를 제공하여 답변을 증명하거나 지지합니다.
|
||||
- 코드로 답변합니다. 간단한 코드 조각이 이슈에 대한 답변이거나 이슈를 해결하는 방법을 보여준다면, 완전히 재현 가능한 코드 조각을 제공해주세요.
|
||||
|
||||
또한, 많은 이슈들은 단순히 주제와 무관하거나 다른 이슈의 중복이거나 관련이 없는 경우가 많습니다. 이러한 이슈들에 대한 답변을 제공하고, 이슈 작성자에게 더 정확한 정보를 제공하거나, 중복된 이슈에 대한 링크를 제공하거나, [포럼](https://discuss.huggingface.co/c/discussion-related-to-httpsgithubcomhuggingfacediffusers/63) 이나 [Discord](https://discord.gg/G7tWnz98XR)로 리디렉션하는 것은 메인테이너에게 큰 도움이 됩니다.
|
||||
|
||||
이슈가 올바른 버그 보고서이고 소스 코드에서 수정이 필요하다고 확인한 경우, 다음 섹션을 살펴보세요.
|
||||
|
||||
다음 모든 기여에 대해서는 PR을 열여야 합니다. [PR 열기](#how-to-open-a-pr) 섹션에서 자세히 설명되어 있습니다.
|
||||
|
||||
### 4. "Good first issue" 고치기
|
||||
|
||||
*Good first issues*는 [Good first issue](https://github.com/huggingface/diffusers/issues?q=is%3Aopen+is%3Aissue+label%3A%22good+first+issue%22) 라벨로 표시됩니다. 일반적으로, 이슈는 이미 잠재적인 해결책이 어떻게 보이는지 설명하고 있어서 수정하기 쉽습니다.
|
||||
만약 이슈가 아직 닫히지 않았고 이 문제를 해결해보고 싶다면, "이 이슈를 해결해보고 싶습니다."라는 메시지를 남기면 됩니다. 일반적으로 세 가지 시나리오가 있습니다:
|
||||
- a.) 이슈 설명이 이미 해결책을 제안합니다. 이 경우, 해결책이 이해되고 합리적으로 보인다면, PR 또는 드래프트 PR을 열어서 수정할 수 있습니다.
|
||||
- b.) 이슈 설명이 해결책을 제안하지 않습니다. 이 경우, 어떤 해결책이 가능할지 물어볼 수 있고, Diffusers 팀의 누군가가 곧 답변해줄 것입니다. 만약 어떻게 수정할지 좋은 아이디어가 있다면, 직접 PR을 열어도 됩니다.
|
||||
- c.) 이미 이 문제를 해결하기 위해 열린 PR이 있지만, 이슈가 아직 닫히지 않았습니다. PR이 더 이상 진행되지 않았다면, 새로운 PR을 열고 이전 PR에 링크를 걸면 됩니다. PR은 종종 원래 기여자가 갑자기 시간을 내지 못해 더 이상 진행하지 못하는 경우에 더 이상 진행되지 않게 됩니다. 이는 오픈 소스에서 자주 발생하는 일이며 매우 정상적인 상황입니다. 이 경우, 커뮤니티는 새로 시도하고 기존 PR의 지식을 활용해주면 매우 기쁠 것입니다. 이미 PR이 있고 활성화되어 있다면, 제안을 해주거나 PR을 검토하거나 PR에 기여할 수 있는지 물어보는 등 작성자를 도와줄 수 있습니다.
|
||||
|
||||
|
||||
### 5. 문서에 기여하기
|
||||
|
||||
좋은 라이브러리는 항상 좋은 문서를 갖고 있습니다! 공식 문서는 라이브러리를 처음 사용하는 사용자들에게 첫 번째 접점 중 하나이며, 따라서 문서에 기여하는 것은 매우 가치 있는 기여입니다.
|
||||
|
||||
라이브러리에 기여하는 방법은 다양합니다:
|
||||
|
||||
- 맞춤법이나 문법 오류를 수정합니다.
|
||||
- 공식 문서가 이상하게 표시되거나 링크가 깨진 경우, 올바르게 수정하는 데 시간을 내주시면 매우 기쁠 것입니다.
|
||||
- 문서의 입력 또는 출력 텐서의 모양이나 차원을 수정합니다.
|
||||
- 이해하기 어렵거나 잘못된 문서를 명확하게 합니다.
|
||||
- 오래된 코드 예제를 업데이트합니다.
|
||||
- 문서를 다른 언어로 번역합니다.
|
||||
|
||||
[공식 Diffusers 문서 페이지](https://huggingface.co/docs/diffusers/index)에 표시된 모든 내용은 공식 문서의 일부이며, 해당 [문서 소스](https://github.com/huggingface/diffusers/tree/main/docs/source)에서 수정할 수 있습니다.
|
||||
|
||||
문서에 대한 변경 사항을 로컬에서 확인하는 방법은 [이 페이지](https://github.com/huggingface/diffusers/tree/main/docs)를 참조해주세요.
|
||||
|
||||
|
||||
### 6. 커뮤니티 파이프라인에 기여하기
|
||||
|
||||
> [!TIP]
|
||||
> 커뮤니티 파이프라인에 대해 자세히 알아보려면 [커뮤니티 파이프라인](../using-diffusers/custom_pipeline_overview#community-pipelines) 가이드를 읽어보세요. 커뮤니티 파이프라인이 왜 필요한지 궁금하다면 GitHub 이슈 [#841](https://github.com/huggingface/diffusers/issues/841)를 확인해보세요 (기본적으로, 우리는 diffusion 모델이 추론에 사용될 수 있는 모든 방법을 유지할 수 없지만 커뮤니티가 이를 구축하는 것을 방해하고 싶지 않습니다).
|
||||
|
||||
커뮤니티 파이프라인에 기여하는 것은 창의성과 작업을 커뮤니티와 공유하는 좋은 방법입니다. [`DiffusionPipeline`]을 기반으로 빌드하여 `custom_pipeline` 매개변수를 설정함으로써 누구나 로드하고 사용할 수 있도록 할 수 있습니다. 이 섹션에서는 UNet이 단일 순방향 패스만 수행하고 스케줄러를 한 번 호출하는 간단한 파이프라인 (단계별 파이프라인)을 만드는 방법을 안내합니다.
|
||||
|
||||
1. 커뮤니티 파이프라인을 위한 one_step_unet.py 파일을 생성하세요. 이 파일은 사용자에 의해 설치되는 패키지를 포함할 수 있지만, [`DiffusionPipeline`]에서 모델 가중치와 스케줄러 구성을 로드하기 위해 하나의 파이프라인 클래스만 있어야 합니다. `__init__` 함수에 UNet과 스케줄러를 추가하세요.
|
||||
|
||||
또한 [`~DiffusionPipeline.save_pretrained`]를 사용하여 파이프라인과 그 구성 요소를 저장할 수 있도록 `register_modules` 함수를 추가해야 합니다.
|
||||
|
||||
```py
|
||||
from diffusers import DiffusionPipeline
|
||||
import torch
|
||||
|
||||
class UnetSchedulerOneForwardPipeline(DiffusionPipeline):
|
||||
def __init__(self, unet, scheduler):
|
||||
super().__init__()
|
||||
|
||||
self.register_modules(unet=unet, scheduler=scheduler)
|
||||
```
|
||||
|
||||
1. forward 패스에서 (`__call__`로 정의하는 것을 추천합니다), 원하는 어떤 기능이든 추가할 수 있습니다. "one-step" 파이프라인의 경우, 무작위 이미지를 생성하고 `timestep=1`로 설정하여 UNet과 스케줄러를 한 번 호출합니다.
|
||||
|
||||
```py
|
||||
from diffusers import DiffusionPipeline
|
||||
import torch
|
||||
|
||||
class UnetSchedulerOneForwardPipeline(DiffusionPipeline):
|
||||
def __init__(self, unet, scheduler):
|
||||
super().__init__()
|
||||
|
||||
self.register_modules(unet=unet, scheduler=scheduler)
|
||||
|
||||
def __call__(self):
|
||||
image = torch.randn(
|
||||
(1, self.unet.config.in_channels, self.unet.config.sample_size, self.unet.config.sample_size),
|
||||
)
|
||||
timestep = 1
|
||||
|
||||
model_output = self.unet(image, timestep).sample
|
||||
scheduler_output = self.scheduler.step(model_output, timestep, image).prev_sample
|
||||
|
||||
return scheduler_output
|
||||
```
|
||||
|
||||
이제 UNet과 스케줄러를 전달하여 파이프라인을 실행하거나, 파이프라인 구조가 동일한 경우 사전 학습된 가중치를 로드할 수 있습니다.
|
||||
|
||||
```py
|
||||
from diffusers import DDPMScheduler, UNet2DModel
|
||||
|
||||
scheduler = DDPMScheduler()
|
||||
unet = UNet2DModel()
|
||||
|
||||
pipeline = UnetSchedulerOneForwardPipeline(unet=unet, scheduler=scheduler)
|
||||
output = pipeline()
|
||||
# load pretrained weights
|
||||
pipeline = UnetSchedulerOneForwardPipeline.from_pretrained("google/ddpm-cifar10-32", use_safetensors=True)
|
||||
output = pipeline()
|
||||
```
|
||||
|
||||
파이프라인을 GitHub 커뮤니티 파이프라인 또는 Hub 커뮤니티 파이프라인으로 공유할 수 있습니다.
|
||||
|
||||
<hfoptions id="pipeline type">
|
||||
<hfoption id="GitHub pipeline">
|
||||
|
||||
GitHub 파이프라인을 공유하려면 Diffusers [저장소](https://github.com/huggingface/diffusers)에서 PR을 열고 one_step_unet.py 파일을 [examples/community](https://github.com/huggingface/diffusers/tree/main/examples/community) 하위 폴더에 추가하세요.
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="Hub pipeline">
|
||||
|
||||
Hub 파이프라인을 공유하려면, 허브에 모델 저장소를 생성하고 one_step_unet.py 파일을 업로드하세요.
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
### 7. 훈련 예제에 기여하기
|
||||
|
||||
Diffusers 예제는 [examples](https://github.com/huggingface/diffusers/tree/main/examples) 폴더에 있는 훈련 스크립트의 모음입니다.
|
||||
|
||||
두 가지 유형의 훈련 예제를 지원합니다:
|
||||
|
||||
- 공식 훈련 예제
|
||||
- 연구용 훈련 예제
|
||||
|
||||
연구용 훈련 예제는 [examples/research_projects](https://github.com/huggingface/diffusers/tree/main/examples/research_projects)에 위치하며, 공식 훈련 예제는 `research_projects` 및 `community` 폴더를 제외한 [examples](https://github.com/huggingface/diffusers/tree/main/examples)의 모든 폴더를 포함합니다.
|
||||
공식 훈련 예제는 Diffusers의 핵심 메인테이너가 유지 관리하며, 연구용 훈련 예제는 커뮤니티가 유지 관리합니다.
|
||||
이는 공식 파이프라인 vs 커뮤니티 파이프라인에 대한 [6. 커뮤니티 파이프라인 기여하기](#6-contribute-a-community-pipeline)에서 제시한 이유와 동일합니다: 핵심 메인테이너가 diffusion 모델의 모든 가능한 훈련 방법을 유지 관리하는 것은 현실적으로 불가능합니다.
|
||||
Diffusers 핵심 메인테잉너와 커뮤니티가 특정 훈련 패러다임을 너무 실험적이거나 충분히 인기 없는 것으로 간주하는 경우, 해당 훈련 코드는 `research_projects` 폴더에 넣고 작성자가 유지 관리해야 합니다.
|
||||
|
||||
공식 훈련 및 연구 예제는 하나 이상의 훈련 스크립트, requirements.txt 파일 및 README.md 파일을 포함하는 디렉토리로 구성됩니다. 사용자가 훈련 예제를 사용하려면 리포지토리를 복제해야 합니다:
|
||||
|
||||
```bash
|
||||
git clone https://github.com/huggingface/diffusers
|
||||
```
|
||||
|
||||
그리고 훈련에 필요한 모든 추가적인 의존성도 설치해야 합니다:
|
||||
|
||||
```bash
|
||||
pip install -r /examples/<your-example-folder>/requirements.txt
|
||||
```
|
||||
|
||||
따라서 예제를 추가할 때, `requirements.txt` 파일은 훈련 예제에 필요한 모든 pip 종속성을 정의해야 합니다. 이렇게 설치된 모든 종속성을 사용하여 사용자가 예제의 훈련 스크립트를 실행할 수 있어야 합니다. 예를 들어, [DreamBooth `requirements.txt` 파일](https://github.com/huggingface/diffusers/blob/main/examples/dreambooth/requirements.txt)을 참조하세요.
|
||||
|
||||
Diffusers 라이브러리의 훈련 예제는 다음 철학을 따라야 합니다:
|
||||
- 예제를 실행하는 데 필요한 모든 코드는 하나의 Python 파일에 있어야 합니다.
|
||||
- 사용자는 명령 줄에서 `python <your-example>.py --args`와 같이 예제를 실행할 수 있어야 합니다.
|
||||
- 예제는 간단하게 유지되어야 하며, Diffusers를 사용한 훈련 방법을 보여주는 **예시**로 사용되어야 합니다. 예제 스크립트의 목적은 최첨단 diffusion 모델을 만드는 것이 아니라, 너무 많은 사용자 정의 로직을 추가하지 않고 이미 알려진 훈련 방법을 재현하는 것입니다. 이 점의 부산물로서, 예제는 좋은 교육 자료로써의 역할을 하기 위해 노력합니다.
|
||||
|
||||
예제에 기여하기 위해서는, 이미 존재하는 예제인 [dreambooth](https://github.com/huggingface/diffusers/blob/main/examples/dreambooth/train_dreambooth.py)와 같은 예제를 참고하여 어떻게 보여야 하는지에 대한 아이디어를 얻는 것이 매우 권장됩니다.
|
||||
Diffusers와 긴밀하게 통합되어 있기 때문에, 기여자들이 [Accelerate 라이브러리](https://github.com/huggingface/accelerate)를 사용하는 것을 강력히 권장합니다.
|
||||
예제 스크립트가 작동하는 경우, 반드시 예제를 정확하게 사용하는 방법을 설명하는 포괄적인 `README.md`를 추가해야 합니다. 이 README에는 다음이 포함되어야 합니다:
|
||||
- [여기](https://github.com/huggingface/diffusers/tree/main/examples/dreambooth#running-locally-with-pytorch)에 표시된 예제 스크립트를 실행하는 방법에 대한 예제 명령어.
|
||||
- [여기](https://api.wandb.ai/report/patrickvonplaten/xm6cd5q5)에 표시된 훈련 결과 (로그, 모델 등)에 대한 링크로 사용자가 기대할 수 있는 내용을 보여줍니다.
|
||||
- 비공식/연구용 훈련 예제를 추가하는 경우, **반드시** git 핸들을 포함하여 이 훈련 예제를 유지 관리할 것임을 명시하는 문장을 추가해야 합니다. [여기](https://github.com/huggingface/diffusers/tree/main/examples/research_projects/intel_opts#diffusers-examples-with-intel-optimizations)에 표시된 것과 같습니다.
|
||||
|
||||
만약 공식 훈련 예제에 기여하는 경우, [examples/test_examples.py](https://github.com/huggingface/diffusers/blob/main/examples/test_examples.py)에 테스트를 추가하는 것도 확인해주세요. 비공식 훈련 예제에는 이 작업이 필요하지 않습니다.
|
||||
|
||||
### 8. "Good second issue" 고치기
|
||||
|
||||
"Good second issue"는 [Good second issue](https://github.com/huggingface/diffusers/issues?q=is%3Aopen+is%3Aissue+label%3A%22Good+second+issue%22) 라벨로 표시됩니다. Good second issue는 [Good first issues](https://github.com/huggingface/diffusers/issues?q=is%3Aopen+is%3Aissue+label%3A%22good+first+issue%22)보다 해결하기가 더 복잡합니다.
|
||||
이슈 설명은 일반적으로 이슈를 해결하는 방법에 대해 덜 구체적이며, 관심 있는 기여자는 라이브러리에 대한 꽤 깊은 이해가 필요합니다.
|
||||
Good second issue를 해결하고자 하는 경우, 해당 이슈를 해결하기 위해 PR을 열고 PR을 이슈에 링크하세요. 이미 해당 이슈에 대한 PR이 열려있지만 병합되지 않은 경우, 왜 병합되지 않았는지 이해하기 위해 살펴보고 개선된 PR을 열어보세요.
|
||||
Good second issue는 일반적으로 Good first issue 이슈보다 병합하기가 더 어려우므로, 핵심 메인테이너에게 도움을 요청하는 것이 좋습니다. PR이 거의 완료된 경우, 핵심 메인테이너는 PR에 참여하여 커밋하고 병합을 진행할 수 있습니다.
|
||||
|
||||
### 9. 파이프라인, 모델, 스케줄러 추가하기
|
||||
|
||||
파이프라인, 모델, 스케줄러는 Diffusers 라이브러리에서 가장 중요한 부분입니다.
|
||||
이들은 최첨단 diffusion 기술에 쉽게 접근하도록 하며, 따라서 커뮤니티가 강력한 생성형 AI 애플리케이션을 만들 수 있도록 합니다.
|
||||
|
||||
새로운 모델, 파이프라인 또는 스케줄러를 추가함으로써, 사용자 인터페이스에 새로운 강력한 사용 사례를 활성화할 수 있으며, 이는 전체 생성형 AI 생태계에 매우 중요한 가치를 제공할 수 있습니다.
|
||||
|
||||
Diffusers에는 세 가지 구성 요소에 대한 여러 개발 요청이 있습니다. 특정 구성 요소를 아직 정확히 어떤 것을 추가하고 싶은지 모르는 경우, 다음 링크를 참조하세요:
|
||||
- [모델 또는 파이프라인](https://github.com/huggingface/diffusers/issues?q=is%3Aopen+is%3Aissue+label%3A%22New+pipeline%2Fmodel%22)
|
||||
- [스케줄러](https://github.com/huggingface/diffusers/issues?q=is%3Aopen+is%3Aissue+label%3A%22New+scheduler%22)
|
||||
|
||||
|
||||
세 가지 구성 요소를 추가하기 전에, [철학 가이드](philosophy)를 읽어보는 것을 강력히 권장합니다. 세 가지 구성 요소 중 어느 것을 추가하든, 디자인 철학과 관련된 API 일관성을 유지하기 위해 우리의 디자인 철학과 크게 다른 구성 요소는 병합할 수 없습니다. 디자인 선택에 근본적으로 동의하지 않는 경우, [피드백 이슈](https://github.com/huggingface/diffusers/issues/new?assignees=&labels=&template=feedback.md&title=)를 열어 해당 디자인 패턴/선택이 라이브러리 전체에서 변경되어야 하는지, 디자인 철학을 업데이트해야 하는지에 대해 논의할 수 있습니다. 라이브러리 전체의 일관성은 우리에게 매우 중요합니다.
|
||||
|
||||
PR에 원본 코드베이스/논문 링크를 추가하고, 가능하면 PR에서 원래 작성자에게 직접 알림을 보내어 진행 상황을 따라갈 수 있도록 해주세요.
|
||||
|
||||
PR에서 막힌 경우나 도움이 필요한 경우, 첫 번째 리뷰나 도움을 요청하는 메시지를 남기는 것을 주저하지 마세요.
|
||||
|
||||
#### Copied from mechanism
|
||||
|
||||
`# Copied from mechanism` 은 파이프라인, 모델 또는 스케줄러 코드를 추가할 때 이해해야 할 독특하고 중요한 기능입니다. Diffusers 코드베이스 전체에서 이를 자주 볼 수 있는데, 이를 사용하는 이유는 코드베이스를 이해하기 쉽고 유지 관리하기 쉽게 유지하기 위함입니다. `# Copied from mechanism` 으로 표시된 코드는 복사한 코드와 정확히 동일하도록 강제됩니다. 이를 통해 `make fix-copies`를 실행할 때 많은 파일에 걸쳐 변경 사항을 쉽게 업데이트하고 전파할 수 있습니다.
|
||||
|
||||
예를 들어, 아래 코드 예제에서 [`~diffusers.pipelines.stable_diffusion.StableDiffusionPipelineOutput`]은 원래 코드이며, `AltDiffusionPipelineOutput`은 `# Copied from mechanism`을 사용하여 복사합니다. 유일한 차이점은 클래스 접두사를 `Stable`에서 `Alt`로 변경한 것입니다.
|
||||
|
||||
```py
|
||||
# Copied from diffusers.pipelines.stable_diffusion.pipeline_output.StableDiffusionPipelineOutput with Stable->Alt
|
||||
class AltDiffusionPipelineOutput(BaseOutput):
|
||||
"""
|
||||
Output class for Alt Diffusion pipelines.
|
||||
|
||||
Args:
|
||||
images (`List[PIL.Image.Image]` or `np.ndarray`)
|
||||
List of denoised PIL images of length `batch_size` or NumPy array of shape `(batch_size, height, width,
|
||||
num_channels)`.
|
||||
nsfw_content_detected (`List[bool]`)
|
||||
List indicating whether the corresponding generated image contains "not-safe-for-work" (nsfw) content or
|
||||
`None` if safety checking could not be performed.
|
||||
"""
|
||||
```
|
||||
|
||||
더 자세히 알고 싶다면 [~Don't~ Repeat Yourself*](https://huggingface.co/blog/transformers-design-philosophy#4-machine-learning-models-are-static) 블로그 포스트의 이 섹션을 읽어보세요.
|
||||
|
||||
## 좋은 이슈 작성 방법
|
||||
|
||||
**이슈를 잘 작성할수록 빠르게 해결될 가능성이 높아집니다.**
|
||||
|
||||
1. 이슈에 적절한 템플릿을 사용했는지 확인하세요. [새 이슈를 열 때](https://github.com/huggingface/diffusers/issues/new/choose) 올바른 템플릿을 선택해야 합니다. *버그 보고서*, *기능 요청*, *API 디자인에 대한 피드백*, *새로운 모델/파이프라인/스케줄러 추가*, *포럼*, 또는 빈 이슈 중에서 선택하세요. 이슈를 열 때 올바른 템플릿을 선택하는 것이 중요합니다.
|
||||
2. **명확성**: 이슈에 적합한 제목을 지정하세요. 이슈 설명을 가능한 간단하게 작성하세요. 이슈를 이해하고 해결하는 데 걸리는 시간을 줄이기 위해 가능한 한 명확하게 작성하세요. 하나의 이슈에 대해 여러 문제를 포함하지 않도록 주의하세요. 여러 문제를 발견한 경우, 각각의 이슈를 개별적으로 열어주세요. 버그인 경우, 어떤 버그인지 가능한 한 정확하게 설명해야 합니다. "diffusers에서 오류"와 같이 간단히 작성하지 마세요.
|
||||
3. **재현 가능성**: 재현 가능한 코드 조각이 없으면 해결할 수 없습니다. 버그를 발견한 경우, 유지 관리자는 그 버그를 재현할 수 있어야 합니다. 이슈에 재현 가능한 코드 조각을 포함해야 합니다. 코드 조각은 Python 인터프리터에 복사하여 붙여넣을 수 있는 형태여야 합니다. 코드 조각이 작동해야 합니다. 즉, 누락된 import나 이미지에 대한 링크가 없어야 합니다. 이슈에는 오류 메시지와 정확히 동일한 오류 메시지를 재현하기 위해 수정하지 않고 복사하여 붙여넣을 수 있는 코드 조각이 포함되어야 합니다. 이슈에 사용자의 로컬 모델 가중치나 로컬 데이터를 사용하는 경우, 독자가 액세스할 수 없는 경우 이슈를 해결할 수 없습니다. 데이터나 모델을 공유할 수 없는 경우, 더미 모델이나 더미 데이터를 만들어 사용해보세요.
|
||||
4. **간결성**: 가능한 한 간결하게 유지하여 독자가 문제를 빠르게 이해할 수 있도록 도와주세요. 문제와 관련이 없는 코드나 정보는 모두 제거해주세요. 버그를 발견한 경우, 문제를 설명하는 가장 간단한 코드 예제를 만들어보세요. 버그를 발견한 후에는 작업 흐름 전체를 문제에 던지는 것이 아니라, 에러가 발생하는 훈련 코드의 어느 부분이 문제인지 먼저 이해하고 몇 줄로 재현해보세요. 전체 데이터셋 대신 더미 데이터를 사용해보세요.
|
||||
5. 링크 추가하기. 특정한 이름, 메서드, 또는 모델을 참조하는 경우, 독자가 더 잘 이해할 수 있도록 링크를 제공해주세요. 특정 PR이나 이슈를 참조하는 경우, 해당 이슈에 링크를 걸어주세요. 독자가 무엇을 말하는지 알고 있다고 가정하지 마세요. 이슈에 링크를 추가할수록 좋습니다.
|
||||
6. 포맷팅. 파이썬 코드 구문으로 코드를 포맷팅하고, 일반 코드 구문으로 에러 메시지를 포맷팅해주세요. 자세한 내용은 [공식 GitHub 포맷팅 문서](https://docs.github.com/en/get-started/writing-on-github/getting-started-with-writing-and-formatting-on-github/basic-writing-and-formatting-syntax)를 참조하세요.
|
||||
7. 이슈를 해결해야 하는 티켓이 아니라, 잘 작성된 백과사전 항목으로 생각해보세요. 추가된 이슈는 공개적으로 사용 가능한 지식에 기여하는 것입니다. 잘 작성된 이슈를 추가함으로써 메인테이너가 문제를 해결하는 데 도움을 주는 것뿐만 아니라, 전체 커뮤니티가 라이브러리의 특정 측면을 더 잘 이해할 수 있도록 도움을 주는 것입니다.
|
||||
|
||||
## 좋은 PR 작성 방법
|
||||
|
||||
1. 카멜레온이 되세요. 기존의 디자인 패턴과 구문을 이해하고, 코드 추가가 기존 코드베이스에 매끄럽게 흐르도록 해야 합니다. 기존 디자인 패턴이나 사용자 인터페이스와 크게 다른 PR은 병합되지 않습니다.
|
||||
2. 초점을 맞추세요. 하나의 문제만 해결하는 PR을 작성해야 합니다. "추가하면서 다른 문제도 해결하기"에 빠지지 않도록 주의하세요. 여러 개의 관련 없는 문제를 해결하는 PR을 작성하는 것은 리뷰하기가 훨씬 어렵습니다.
|
||||
3. 도움이 되는 경우, 추가한 내용이 어떻게 사용되는지 예제 코드 조각을 추가해보세요.
|
||||
4. PR의 제목은 기여 내용을 요약해야 합니다.
|
||||
5. PR이 이슈를 해결하는 경우, PR 설명에 이슈 번호를 언급하여 연결되도록 해주세요 (이슈를 참조하는 사람들이 작업 중임을 알 수 있도록).
|
||||
6. 진행 중인 작업을 나타내려면 제목에 `[WIP]`를 접두사로 붙여주세요. 이는 중복 작업을 피하고, 병합 준비가 된 PR과 구분할 수 있도록 도움이 됩니다.
|
||||
7. [좋은 이슈를 작성하는 방법](#how-to-write-a-good-issue)에 설명된 대로 텍스트를 구성하고 형식을 지정해보세요.
|
||||
8. 기존 테스트가 통과하는지 확인하세요
|
||||
9. 높은 커버리지를 가진 테스트를 추가하세요. 품질 테스트가 없으면 병합할 수 없습니다.
|
||||
- 새로운 `@slow` 테스트를 추가하는 경우, 다음 명령을 사용하여 통과하는지 확인하세요.
|
||||
`RUN_SLOW=1 python -m pytest tests/test_my_new_model.py`.
|
||||
CircleCI는 느린 테스트를 실행하지 않지만, GitHub Actions는 매일 실행합니다!
|
||||
10. 모든 공개 메서드는 마크다운과 잘 작동하는 정보성 docstring을 가져야 합니다. 예시로 [`pipeline_latent_diffusion.py`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py)를 참조하세요.
|
||||
11. 레포지토리가 빠르게 성장하고 있기 때문에, 레포지토리에 큰 부담을 주는 파일이 추가되지 않도록 주의해야 합니다. 이미지, 비디오 및 기타 텍스트가 아닌 파일을 포함합니다. 이러한 파일을 배치하기 위해 hf.co 호스팅 `dataset`인 [`hf-internal-testing`](https://huggingface.co/hf-internal-testing) 또는 [huggingface/documentation-images](https://huggingface.co/datasets/huggingface/documentation-images)를 활용하는 것이 우선입니다.
|
||||
외부 기여인 경우, 이미지를 PR에 추가하고 Hugging Face 구성원에게 이미지를 이 데이터셋으로 이동하도록 요청하세요.
|
||||
|
||||
## PR을 열기 위한 방법
|
||||
|
||||
코드를 작성하기 전에, 이미 누군가가 같은 작업을 하고 있는지 확인하기 위해 기존의 PR이나 이슈를 검색하는 것이 좋습니다. 확실하지 않은 경우, 피드백을 받기 위해 이슈를 열어보는 것이 항상 좋은 아이디어입니다.
|
||||
|
||||
🧨 Diffusers에 기여하기 위해서는 기본적인 `git` 사용법을 알아야 합니다. `git`은 가장 쉬운 도구는 아니지만, 가장 훌륭한 매뉴얼을 가지고 있습니다. 셸에서 `git --help`을 입력하고 즐기세요. 책을 선호하는 경우, [Pro Git](https://git-scm.com/book/en/v2)은 매우 좋은 참고 자료입니다.
|
||||
|
||||
다음 단계를 따라 기여를 시작하세요 ([지원되는 Python 버전](https://github.com/huggingface/diffusers/blob/main/setup.py#L244)):
|
||||
|
||||
1. 저장소 페이지에서 'Fork' 버튼을 클릭하여 [저장소](https://github.com/huggingface/diffusers)를 포크합니다. 이렇게 하면 코드의 사본이 GitHub 사용자 계정에 생성됩니다.
|
||||
|
||||
2. 포크한 저장소를 로컬 디스크에 클론하고, 기본 저장소를 원격으로 추가하세요:
|
||||
|
||||
```bash
|
||||
$ git clone git@github.com:<your GitHub handle>/diffusers.git
|
||||
$ cd diffusers
|
||||
$ git remote add upstream https://github.com/huggingface/diffusers.git
|
||||
```
|
||||
|
||||
3. 개발 변경 사항을 보관할 새로운 브랜치를 생성하세요:
|
||||
|
||||
```bash
|
||||
$ git checkout -b a-descriptive-name-for-my-changes
|
||||
```
|
||||
|
||||
`main` 브랜치 위에서 **절대** 작업하지 마세요.
|
||||
|
||||
1. 가상 환경에서 다음 명령을 실행하여 개발 환경을 설정하세요:
|
||||
|
||||
```bash
|
||||
$ pip install -e ".[dev]"
|
||||
```
|
||||
|
||||
만약 저장소를 이미 클론한 경우, 가장 최신 변경 사항을 가져오기 위해 `git pull`을 실행해야 할 수도 있습니다.
|
||||
|
||||
5. 기능을 브랜치에서 개발하세요.
|
||||
|
||||
기능을 작업하는 동안 테스트 스위트가 통과되는지 확인해야 합니다. 다음과 같이 변경 사항에 영향을 받는 테스트를 실행해야 합니다:
|
||||
|
||||
```bash
|
||||
$ pytest tests/<TEST_TO_RUN>.py
|
||||
```
|
||||
|
||||
테스트를 실행하기 전에 테스트를 위해 필요한 의존성들을 설치하였는지 확인하세요. 다음의 커맨드를 통해서 확인할 수 있습니다:
|
||||
|
||||
```bash
|
||||
$ pip install -e ".[test]"
|
||||
```
|
||||
|
||||
다음 명령어로 전체 테스트 묶음 실행할 수도 있지만, Diffusers가 많이 성장하였기 때문에 결과를 적당한 시간 내에 생성하기 위해서는 강력한 컴퓨터가 필요합니다. 다음은 해당 명령어입니다:
|
||||
|
||||
```bash
|
||||
$ make test
|
||||
```
|
||||
|
||||
🧨 Diffusers는 소스 코드를 일관되게 포맷팅하기 위해 `black`과 `isort`를 사용합니다. 변경 사항을 적용한 후에는 다음과 같이 자동 스타일 수정 및 코드 검증을 적용할 수 있습니다:
|
||||
|
||||
|
||||
```bash
|
||||
$ make style
|
||||
```
|
||||
|
||||
🧨 Diffusers `ruff`와 몇개의 커스텀 스크립트를 이용하여 코딩 실수를 확인합니다. 품질 제어는 CI에서 작동하지만, 동일한 검사를 다음을 통해서도 할 수 있습니다:
|
||||
|
||||
```bash
|
||||
$ make quality
|
||||
```
|
||||
|
||||
변경사항에 대해 만족한다면 `git add`를 사용하여 변경된 파일을 추가하고 `git commit`을 사용하여 변경사항에 대해 로컬상으로 저장한다:
|
||||
|
||||
```bash
|
||||
$ git add modified_file.py
|
||||
$ git commit -m "A descriptive message about your changes."
|
||||
```
|
||||
|
||||
코드를 정기적으로 원본 저장소와 동기화하는 것은 좋은 아이디어입니다. 이렇게 하면 변경 사항을 빠르게 반영할 수 있습니다:
|
||||
|
||||
```bash
|
||||
$ git pull upstream main
|
||||
```
|
||||
|
||||
변경 사항을 계정에 푸시하려면 다음을 사용하세요:
|
||||
|
||||
```bash
|
||||
$ git push -u origin a-descriptive-name-for-my-changes
|
||||
```
|
||||
|
||||
6. 만족하셨다면, GitHub에서 포크한 웹페이지로 이동하여 'Pull request'를 클릭하여 변경사항을 프로젝트 메인테이너에게 검토를 요청합니다.
|
||||
|
||||
7. 메인테이너가 변경 사항을 요청하는 것은 괜찮습니다. 핵심 기여자들에게도 일어나는 일입니다! 따라서 변경 사항을 Pull request에서 볼 수 있도록 로컬 브랜치에서 작업하고 변경 사항을 포크에 푸시하면 자동으로 Pull request에 나타납니다.
|
||||
|
||||
### 테스트
|
||||
|
||||
라이브러리 동작과 여러 예제를 테스트하기 위해 포괄적인 테스트 묶음이 포함되어 있습니다. 라이브러리 테스트는 [tests 폴더](https://github.com/huggingface/diffusers/tree/main/tests)에서 찾을 수 있습니다.
|
||||
|
||||
`pytest`와 `pytest-xdist`를 선호하는 이유는 더 빠르기 때문입니다. 루트 디렉토리에서 라이브러리를 위해 `pytest`로 테스트를 실행하는 방법은 다음과 같습니다:
|
||||
|
||||
```bash
|
||||
$ python -m pytest -n auto --dist=loadfile -s -v ./tests/
|
||||
```
|
||||
|
||||
사실, `make test`는 이렇게 구현되어 있습니다!
|
||||
|
||||
작업 중인 기능만 테스트하기 위해 더 작은 테스트 세트를 지정할 수 있습니다.
|
||||
|
||||
기본적으로 느린 테스트는 건너뜁니다. `RUN_SLOW` 환경 변수를 `yes`로 설정하여 실행할 수 있습니다. 이는 많은 기가바이트의 모델을 다운로드합니다. 충분한 디스크 공간과 좋은 인터넷 연결 또는 많은 인내심이 필요합니다!
|
||||
|
||||
```bash
|
||||
$ RUN_SLOW=yes python -m pytest -n auto --dist=loadfile -s -v ./tests/
|
||||
```
|
||||
|
||||
`unittest`는 완전히 지원됩니다. 다음은 `unittest`를 사용하여 테스트를 실행하는 방법입니다:
|
||||
|
||||
```bash
|
||||
$ python -m unittest discover -s tests -t . -v
|
||||
$ python -m unittest discover -s examples -t examples -v
|
||||
```
|
||||
|
||||
### upstream(main)과 forked main 동기화하기
|
||||
|
||||
upstream 저장소에 불필요한 참조 노트를 추가하고 관련 개발자에게 알림을 보내는 것을 피하기 위해,
|
||||
forked 저장소의 main 브랜치를 동기화할 때 다음 단계를 따르세요:
|
||||
1. 가능한 경우, forked 저장소에서 브랜치와 PR을 사용하여 upstream과 동기화하는 것을 피하세요. 대신 forked main으로 직접 병합하세요.
|
||||
2. PR이 절대적으로 필요한 경우, 브랜치를 체크아웃한 후 다음 단계를 사용하세요:
|
||||
```bash
|
||||
$ git checkout -b your-branch-for-syncing
|
||||
$ git pull --squash --no-commit upstream main
|
||||
$ git commit -m '<your message without GitHub references>'
|
||||
$ git push --set-upstream origin your-branch-for-syncing
|
||||
```
|
||||
|
||||
### 스타일 가이드
|
||||
|
||||
Documentation string에 대해서는, 🧨 Diffusers는 [Google 스타일](https://google.github.io/styleguide/pyguide.html)을 따릅니다.
|
||||
63
docs/source/ko/conceptual/ethical_guidelines.md
Normal file
63
docs/source/ko/conceptual/ethical_guidelines.md
Normal file
@@ -0,0 +1,63 @@
|
||||
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# 🧨 Diffusers의 윤리 지침
|
||||
|
||||
## 서문
|
||||
|
||||
[Diffusers](https://huggingface.co/docs/diffusers/index)는 사전 훈련된 diffusion 모델을 제공하며 추론 및 훈련을 위한 모듈식 툴박스로 사용됩니다.
|
||||
|
||||
이 기술의 실제 적용과 사회에 미칠 수 있는 부정적인 영향을 고려하여 Diffusers 라이브러리의 개발, 사용자 기여 및 사용에 윤리 지침을 제공하는 것이 중요하다고 생각합니다.
|
||||
|
||||
이 기술을 사용하는 데 연관된 위험은 아직 조사 중이지만, 몇 가지 예를 들면: 예술가들에 대한 저작권 문제; 딥 페이크의 악용; 부적절한 맥락에서의 성적 콘텐츠 생성; 동의 없는 impersonation; 사회적인 편견으로 인해 억압되는 그룹들에 대한 해로운 영향입니다.
|
||||
우리는 위험을 지속적으로 추적하고 커뮤니티의 응답과 소중한 피드백에 따라 다음 지침을 조정할 것입니다.
|
||||
|
||||
|
||||
## 범위
|
||||
|
||||
Diffusers 커뮤니티는 프로젝트의 개발에 다음과 같은 윤리 지침을 적용하며, 특히 윤리적 문제와 관련된 민감한 주제에 대한 커뮤니티의 기여를 조정하는 데 도움을 줄 것입니다.
|
||||
|
||||
|
||||
## 윤리 지침
|
||||
|
||||
다음 윤리 지침은 일반적으로 적용되지만, 기술적 선택을 할 때 윤리적으로 민감한 문제를 다룰 때 주로 적용할 것입니다. 또한, 해당 기술의 최신 동향과 관련된 신규 위험에 따라 시간이 지남에 따라 이러한 윤리 원칙을 조정할 것을 약속합니다.
|
||||
|
||||
- **투명성**: 우리는 PR을 관리하고, 사용자에게 우리의 선택을 설명하며, 기술적 의사결정을 내릴 때 투명성을 유지할 것을 약속합니다.
|
||||
|
||||
- **일관성**: 우리는 프로젝트 관리에서 사용자들에게 동일한 수준의 관심을 보장하고 기술적으로 안정되고 일관된 상태를 유지할 것을 약속합니다.
|
||||
|
||||
- **간결성**: Diffusers 라이브러리를 사용하고 활용하기 쉽게 만들기 위해, 프로젝트의 목표를 간결하고 일관성 있게 유지할 것을 약속합니다.
|
||||
|
||||
- **접근성**: Diffusers 프로젝트는 기술적 전문 지식 없어도 프로젝트 운영에 참여할 수 있는 기여자의 진입장벽을 낮춥니다. 이를 통해 연구 결과물이 커뮤니티에 더 잘 접근할 수 있게 됩니다.
|
||||
|
||||
- **재현성**: 우리는 Diffusers 라이브러리를 통해 제공되는 업스트림(upstream) 코드, 모델 및 데이터셋의 재현성에 대해 투명하게 공개할 것을 목표로 합니다.
|
||||
|
||||
- **책임**: 우리는 커뮤니티와 팀워크를 통해, 이 기술의 잠재적인 위험과 위험을 예측하고 완화하는 데 대한 공동 책임을 가지고 있습니다.
|
||||
|
||||
|
||||
## 구현 사례: 안전 기능과 메커니즘
|
||||
|
||||
팀은 diffusion 기술과 관련된 잠재적인 윤리 및 사회적 위험에 대처하기 위한 기술적 및 비기술적 도구를 제공하고자 하고 있습니다. 또한, 커뮤니티의 참여는 이러한 기능의 구현하고 우리와 함께 인식을 높이는 데 매우 중요합니다.
|
||||
|
||||
- [**커뮤니티 탭**](https://huggingface.co/docs/hub/repositories-pull-requests-discussions): 이를 통해 커뮤니티는 프로젝트에 대해 토론하고 더 나은 협력을 할 수 있습니다.
|
||||
|
||||
- **편향 탐색 및 평가**: Hugging Face 팀은 Stable Diffusion 모델의 편향성을 대화형으로 보여주는 [space](https://huggingface.co/spaces/society-ethics/DiffusionBiasExplorer)을 제공합니다. 이런 의미에서, 우리는 편향 탐색 및 평가를 지원하고 장려합니다.
|
||||
|
||||
- **배포에서의 안전 유도**
|
||||
|
||||
- [**안전한 Stable Diffusion**](https://huggingface.co/docs/diffusers/main/en/api/pipelines/stable_diffusion/stable_diffusion_safe): 이는 필터되지 않은 웹 크롤링 데이터셋으로 훈련된 Stable Diffusion과 같은 모델이 부적절한 변질에 취약한 문제를 완화합니다. 관련 논문: [Safe Latent Diffusion: Mitigating Inappropriate Degeneration in Diffusion Models](https://arxiv.org/abs/2211.05105).
|
||||
|
||||
- [**안전 검사기**](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/safety_checker.py): 이미지가 생성된 후에 이미자가 임베딩 공간에서 일련의 하드코딩된 유해 개념의 클래스일 확률을 확인하고 비교합니다. 유해 개념은 역공학을 방지하기 위해 의도적으로 숨겨져 있습니다.
|
||||
|
||||
- **Hub에서의 단계적인 배포**: 특히 민감한 상황에서는 일부 리포지토리에 대한 접근을 제한해야 합니다. 이 단계적인 배포는 중간 단계로, 리포지토리 작성자가 사용에 대한 더 많은 통제력을 갖게 합니다.
|
||||
|
||||
- **라이선싱**: [OpenRAILs](https://huggingface.co/blog/open_rail)와 같은 새로운 유형의 라이선싱을 통해 자유로운 접근을 보장하면서도 더 책임 있는 사용을 위한 일련의 제한을 둘 수 있습니다.
|
||||
554
docs/source/ko/conceptual/evaluation.md
Normal file
554
docs/source/ko/conceptual/evaluation.md
Normal file
@@ -0,0 +1,554 @@
|
||||
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Diffusion 모델 평가하기[[evaluating-diffusion-models]]
|
||||
|
||||
<a target="_blank" href="https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/evaluation.ipynb">
|
||||
<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
|
||||
</a>
|
||||
|
||||
[Stable Diffusion](https://huggingface.co/docs/diffusers/stable_diffusion)와 같은 생성 모델의 평가는 주관적인 성격을 가지고 있습니다. 그러나 실무자와 연구자로서 우리는 종종 다양한 가능성 중에서 신중한 선택을 해야 합니다. 그래서 다양한 생성 모델 (GAN, Diffusion 등)을 사용할 때 어떻게 선택해야 할까요?
|
||||
|
||||
정성적인 평가는 모델의 이미지 품질에 대한 주관적인 평가이므로 오류가 발생할 수 있고 결정에 잘못된 영향을 미칠 수 있습니다. 반면, 정량적인 평가는 이미지 품질과 직접적인 상관관계를 갖지 않을 수 있습니다. 따라서 일반적으로 정성적 평가와 정량적 평가를 모두 고려하는 것이 더 강력한 신호를 제공하여 모델 선택에 도움이 됩니다.
|
||||
|
||||
이 문서에서는 Diffusion 모델을 평가하기 위한 정성적 및 정량적 방법에 대해 상세히 설명합니다. 정량적 방법에 대해서는 특히 `diffusers`와 함께 구현하는 방법에 초점을 맞추었습니다.
|
||||
|
||||
이 문서에서 보여진 방법들은 기반 생성 모델을 고정시키고 다양한 [노이즈 스케줄러](https://huggingface.co/docs/diffusers/main/en/api/schedulers/overview)를 평가하는 데에도 사용할 수 있습니다.
|
||||
|
||||
## 시나리오[[scenarios]]
|
||||
다음과 같은 파이프라인을 사용하여 Diffusion 모델을 다룹니다:
|
||||
|
||||
- 텍스트로 안내된 이미지 생성 (예: [`StableDiffusionPipeline`](https://huggingface.co/docs/diffusers/main/en/api/pipelines/stable_diffusion/text2img)).
|
||||
- 입력 이미지에 추가로 조건을 건 텍스트로 안내된 이미지 생성 (예: [`StableDiffusionImg2ImgPipeline`](https://huggingface.co/docs/diffusers/main/en/api/pipelines/stable_diffusion/img2img) 및 [`StableDiffusionInstructPix2PixPipeline`](https://huggingface.co/docs/diffusers/main/en/api/pipelines/pix2pix)).
|
||||
- 클래스 조건화된 이미지 생성 모델 (예: [`DiTPipeline`](https://huggingface.co/docs/diffusers/main/en/api/pipelines/dit)).
|
||||
|
||||
## 정성적 평가[[qualitative-evaluation]]
|
||||
|
||||
정성적 평가는 일반적으로 생성된 이미지의 인간 평가를 포함합니다. 품질은 구성성, 이미지-텍스트 일치, 공간 관계 등과 같은 측면에서 측정됩니다. 일반적인 프롬프트는 주관적인 지표에 대한 일정한 기준을 제공합니다.
|
||||
DrawBench와 PartiPrompts는 정성적인 벤치마킹에 사용되는 프롬프트 데이터셋입니다. DrawBench와 PartiPrompts는 각각 [Imagen](https://imagen.research.google/)과 [Parti](https://parti.research.google/)에서 소개되었습니다.
|
||||
|
||||
[Parti 공식 웹사이트](https://parti.research.google/)에서 다음과 같이 설명하고 있습니다:
|
||||
|
||||
> PartiPrompts (P2)는 이 작업의 일부로 공개되는 영어로 된 1600개 이상의 다양한 프롬프트 세트입니다. P2는 다양한 범주와 도전 측면에서 모델의 능력을 측정하는 데 사용할 수 있습니다.
|
||||
|
||||

|
||||
|
||||
PartiPrompts는 다음과 같은 열을 가지고 있습니다:
|
||||
|
||||
- 프롬프트 (Prompt)
|
||||
- 프롬프트의 카테고리 (예: "Abstract", "World Knowledge" 등)
|
||||
- 난이도를 반영한 챌린지 (예: "Basic", "Complex", "Writing & Symbols" 등)
|
||||
|
||||
이러한 벤치마크는 서로 다른 이미지 생성 모델을 인간 평가로 비교할 수 있도록 합니다.
|
||||
|
||||
이를 위해 🧨 Diffusers 팀은 **Open Parti Prompts**를 구축했습니다. 이는 Parti Prompts를 기반으로 한 커뮤니티 기반의 질적 벤치마크로, 최첨단 오픈 소스 확산 모델을 비교하는 데 사용됩니다:
|
||||
- [Open Parti Prompts 게임](https://huggingface.co/spaces/OpenGenAI/open-parti-prompts): 10개의 parti prompt에 대해 4개의 생성된 이미지가 제시되며, 사용자는 프롬프트에 가장 적합한 이미지를 선택합니다.
|
||||
- [Open Parti Prompts 리더보드](https://huggingface.co/spaces/OpenGenAI/parti-prompts-leaderboard): 현재 최고의 오픈 소스 diffusion 모델들을 서로 비교하는 리더보드입니다.
|
||||
|
||||
이미지를 수동으로 비교하려면, `diffusers`를 사용하여 몇가지 PartiPrompts를 어떻게 활용할 수 있는지 알아봅시다.
|
||||
|
||||
다음은 몇 가지 다른 도전에서 샘플링한 프롬프트를 보여줍니다: Basic, Complex, Linguistic Structures, Imagination, Writing & Symbols. 여기서는 PartiPrompts를 [데이터셋](https://huggingface.co/datasets/nateraw/parti-prompts)으로 사용합니다.
|
||||
|
||||
```python
|
||||
from datasets import load_dataset
|
||||
|
||||
# prompts = load_dataset("nateraw/parti-prompts", split="train")
|
||||
# prompts = prompts.shuffle()
|
||||
# sample_prompts = [prompts[i]["Prompt"] for i in range(5)]
|
||||
|
||||
# Fixing these sample prompts in the interest of reproducibility.
|
||||
sample_prompts = [
|
||||
"a corgi",
|
||||
"a hot air balloon with a yin-yang symbol, with the moon visible in the daytime sky",
|
||||
"a car with no windows",
|
||||
"a cube made of porcupine",
|
||||
'The saying "BE EXCELLENT TO EACH OTHER" written on a red brick wall with a graffiti image of a green alien wearing a tuxedo. A yellow fire hydrant is on a sidewalk in the foreground.',
|
||||
]
|
||||
```
|
||||
이제 이런 프롬프트를 사용하여 Stable Diffusion ([v1-4 checkpoint](https://huggingface.co/CompVis/stable-diffusion-v1-4))를 사용한 이미지 생성을 할 수 있습니다 :
|
||||
|
||||
```python
|
||||
import torch
|
||||
|
||||
seed = 0
|
||||
generator = torch.manual_seed(seed)
|
||||
|
||||
images = sd_pipeline(sample_prompts, num_images_per_prompt=1, generator=generator).images
|
||||
```
|
||||
|
||||

|
||||
|
||||
|
||||
`num_images_per_prompt`를 설정하여 동일한 프롬프트에 대해 다른 이미지를 비교할 수도 있습니다. 다른 체크포인트([v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5))로 동일한 파이프라인을 실행하면 다음과 같은 결과가 나옵니다:
|
||||
|
||||

|
||||
|
||||
|
||||
다양한 모델을 사용하여 모든 프롬프트에서 생성된 여러 이미지들이 생성되면 (평가 과정에서) 이러한 결과물들은 사람 평가자들에게 점수를 매기기 위해 제시됩니다. DrawBench와 PartiPrompts 벤치마크에 대한 자세한 내용은 각각의 논문을 참조하십시오.
|
||||
|
||||
<Tip>
|
||||
|
||||
모델이 훈련 중일 때 추론 샘플을 살펴보는 것은 훈련 진행 상황을 측정하는 데 유용합니다. [훈련 스크립트](https://github.com/huggingface/diffusers/tree/main/examples/)에서는 TensorBoard와 Weights & Biases에 대한 추가 지원과 함께 이 유틸리티를 지원합니다.
|
||||
|
||||
</Tip>
|
||||
|
||||
## 정량적 평가[[quantitative-evaluation]]
|
||||
|
||||
이 섹션에서는 세 가지 다른 확산 파이프라인을 평가하는 방법을 안내합니다:
|
||||
|
||||
- CLIP 점수
|
||||
- CLIP 방향성 유사도
|
||||
- FID
|
||||
|
||||
### 텍스트 안내 이미지 생성[[text-guided-image-generation]]
|
||||
|
||||
[CLIP 점수](https://arxiv.org/abs/2104.08718)는 이미지-캡션 쌍의 호환성을 측정합니다. 높은 CLIP 점수는 높은 호환성🔼을 나타냅니다. CLIP 점수는 이미지와 캡션 사이의 의미적 유사성으로 생각할 수도 있습니다. CLIP 점수는 인간 판단과 높은 상관관계를 가지고 있습니다.
|
||||
|
||||
[`StableDiffusionPipeline`]을 일단 로드해봅시다:
|
||||
|
||||
```python
|
||||
from diffusers import StableDiffusionPipeline
|
||||
import torch
|
||||
|
||||
model_ckpt = "CompVis/stable-diffusion-v1-4"
|
||||
sd_pipeline = StableDiffusionPipeline.from_pretrained(model_ckpt, torch_dtype=torch.float16).to("cuda")
|
||||
```
|
||||
|
||||
여러 개의 프롬프트를 사용하여 이미지를 생성합니다:
|
||||
|
||||
```python
|
||||
prompts = [
|
||||
"a photo of an astronaut riding a horse on mars",
|
||||
"A high tech solarpunk utopia in the Amazon rainforest",
|
||||
"A pikachu fine dining with a view to the Eiffel Tower",
|
||||
"A mecha robot in a favela in expressionist style",
|
||||
"an insect robot preparing a delicious meal",
|
||||
"A small cabin on top of a snowy mountain in the style of Disney, artstation",
|
||||
]
|
||||
|
||||
images = sd_pipeline(prompts, num_images_per_prompt=1, output_type="np").images
|
||||
|
||||
print(images.shape)
|
||||
# (6, 512, 512, 3)
|
||||
```
|
||||
|
||||
그러고 나서 CLIP 점수를 계산합니다.
|
||||
|
||||
```python
|
||||
from torchmetrics.functional.multimodal import clip_score
|
||||
from functools import partial
|
||||
|
||||
clip_score_fn = partial(clip_score, model_name_or_path="openai/clip-vit-base-patch16")
|
||||
|
||||
def calculate_clip_score(images, prompts):
|
||||
images_int = (images * 255).astype("uint8")
|
||||
clip_score = clip_score_fn(torch.from_numpy(images_int).permute(0, 3, 1, 2), prompts).detach()
|
||||
return round(float(clip_score), 4)
|
||||
|
||||
sd_clip_score = calculate_clip_score(images, prompts)
|
||||
print(f"CLIP score: {sd_clip_score}")
|
||||
# CLIP score: 35.7038
|
||||
```
|
||||
|
||||
위의 예제에서는 각 프롬프트 당 하나의 이미지를 생성했습니다. 만약 프롬프트 당 여러 이미지를 생성한다면, 프롬프트 당 생성된 이미지의 평균 점수를 사용해야 합니다.
|
||||
|
||||
이제 [`StableDiffusionPipeline`]과 호환되는 두 개의 체크포인트를 비교하려면, 파이프라인을 호출할 때 generator를 전달해야 합니다. 먼저, 고정된 시드로 [v1-4 Stable Diffusion 체크포인트](https://huggingface.co/CompVis/stable-diffusion-v1-4)를 사용하여 이미지를 생성합니다:
|
||||
|
||||
```python
|
||||
seed = 0
|
||||
generator = torch.manual_seed(seed)
|
||||
|
||||
images = sd_pipeline(prompts, num_images_per_prompt=1, generator=generator, output_type="np").images
|
||||
```
|
||||
|
||||
그런 다음 [v1-5 checkpoint](https://huggingface.co/runwayml/stable-diffusion-v1-5)를 로드하여 이미지를 생성합니다:
|
||||
|
||||
```python
|
||||
model_ckpt_1_5 = "runwayml/stable-diffusion-v1-5"
|
||||
sd_pipeline_1_5 = StableDiffusionPipeline.from_pretrained(model_ckpt_1_5, torch_dtype=weight_dtype).to(device)
|
||||
|
||||
images_1_5 = sd_pipeline_1_5(prompts, num_images_per_prompt=1, generator=generator, output_type="np").images
|
||||
```
|
||||
|
||||
그리고 마지막으로 CLIP 점수를 비교합니다:
|
||||
|
||||
```python
|
||||
sd_clip_score_1_4 = calculate_clip_score(images, prompts)
|
||||
print(f"CLIP Score with v-1-4: {sd_clip_score_1_4}")
|
||||
# CLIP Score with v-1-4: 34.9102
|
||||
|
||||
sd_clip_score_1_5 = calculate_clip_score(images_1_5, prompts)
|
||||
print(f"CLIP Score with v-1-5: {sd_clip_score_1_5}")
|
||||
# CLIP Score with v-1-5: 36.2137
|
||||
```
|
||||
|
||||
[v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5) 체크포인트가 이전 버전보다 더 나은 성능을 보이는 것 같습니다. 그러나 CLIP 점수를 계산하기 위해 사용한 프롬프트의 수가 상당히 적습니다. 보다 실용적인 평가를 위해서는 이 수를 훨씬 높게 설정하고, 프롬프트를 다양하게 사용해야 합니다.
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
이 점수에는 몇 가지 제한 사항이 있습니다. 훈련 데이터셋의 캡션은 웹에서 크롤링되어 이미지와 관련된 `alt` 및 유사한 태그에서 추출되었습니다. 이들은 인간이 이미지를 설명하는 데 사용할 수 있는 것과 일치하지 않을 수 있습니다. 따라서 여기서는 몇 가지 프롬프트를 "엔지니어링"해야 했습니다.
|
||||
|
||||
</Tip>
|
||||
|
||||
### 이미지 조건화된 텍스트-이미지 생성[[image-conditioned-text-to-image-generation]]
|
||||
|
||||
이 경우, 생성 파이프라인을 입력 이미지와 텍스트 프롬프트로 조건화합니다. [`StableDiffusionInstructPix2PixPipeline`]을 예로 들어보겠습니다. 이는 편집 지시문을 입력 프롬프트로 사용하고 편집할 입력 이미지를 사용합니다.
|
||||
|
||||
다음은 하나의 예시입니다:
|
||||
|
||||

|
||||
|
||||
모델을 평가하는 한 가지 전략은 두 이미지 캡션 간의 변경과([CLIP-Guided Domain Adaptation of Image Generators](https://arxiv.org/abs/2108.00946)에서 보여줍니다) 함께 두 이미지 사이의 변경의 일관성을 측정하는 것입니다 ([CLIP](https://huggingface.co/docs/transformers/model_doc/clip) 공간에서). 이를 "**CLIP 방향성 유사성**"이라고 합니다.
|
||||
|
||||
- 캡션 1은 편집할 이미지 (이미지 1)에 해당합니다.
|
||||
- 캡션 2는 편집된 이미지 (이미지 2)에 해당합니다. 편집 지시를 반영해야 합니다.
|
||||
|
||||
다음은 그림으로 된 개요입니다:
|
||||
|
||||

|
||||
|
||||
우리는 이 측정 항목을 구현하기 위해 미니 데이터 세트를 준비했습니다. 먼저 데이터 세트를 로드해 보겠습니다.
|
||||
|
||||
```python
|
||||
from datasets import load_dataset
|
||||
|
||||
dataset = load_dataset("sayakpaul/instructpix2pix-demo", split="train")
|
||||
dataset.features
|
||||
```
|
||||
|
||||
```bash
|
||||
{'input': Value(dtype='string', id=None),
|
||||
'edit': Value(dtype='string', id=None),
|
||||
'output': Value(dtype='string', id=None),
|
||||
'image': Image(decode=True, id=None)}
|
||||
```
|
||||
|
||||
여기에는 다음과 같은 항목이 있습니다:
|
||||
|
||||
- `input`은 `image`에 해당하는 캡션입니다.
|
||||
- `edit`은 편집 지시사항을 나타냅니다.
|
||||
- `output`은 `edit` 지시사항을 반영한 수정된 캡션입니다.
|
||||
|
||||
샘플을 살펴보겠습니다.
|
||||
|
||||
```python
|
||||
idx = 0
|
||||
print(f"Original caption: {dataset[idx]['input']}")
|
||||
print(f"Edit instruction: {dataset[idx]['edit']}")
|
||||
print(f"Modified caption: {dataset[idx]['output']}")
|
||||
```
|
||||
|
||||
```bash
|
||||
Original caption: 2. FAROE ISLANDS: An archipelago of 18 mountainous isles in the North Atlantic Ocean between Norway and Iceland, the Faroe Islands has 'everything you could hope for', according to Big 7 Travel. It boasts 'crystal clear waterfalls, rocky cliffs that seem to jut out of nowhere and velvety green hills'
|
||||
Edit instruction: make the isles all white marble
|
||||
Modified caption: 2. WHITE MARBLE ISLANDS: An archipelago of 18 mountainous white marble isles in the North Atlantic Ocean between Norway and Iceland, the White Marble Islands has 'everything you could hope for', according to Big 7 Travel. It boasts 'crystal clear waterfalls, rocky cliffs that seem to jut out of nowhere and velvety green hills'
|
||||
```
|
||||
|
||||
다음은 이미지입니다:
|
||||
|
||||
```python
|
||||
dataset[idx]["image"]
|
||||
```
|
||||
|
||||

|
||||
|
||||
먼저 편집 지시사항을 사용하여 데이터 세트의 이미지를 편집하고 방향 유사도를 계산합니다.
|
||||
|
||||
[`StableDiffusionInstructPix2PixPipeline`]를 먼저 로드합니다:
|
||||
|
||||
```python
|
||||
from diffusers import StableDiffusionInstructPix2PixPipeline
|
||||
|
||||
instruct_pix2pix_pipeline = StableDiffusionInstructPix2PixPipeline.from_pretrained(
|
||||
"timbrooks/instruct-pix2pix", torch_dtype=torch.float16
|
||||
).to(device)
|
||||
```
|
||||
|
||||
이제 편집을 수행합니다:
|
||||
|
||||
```python
|
||||
import numpy as np
|
||||
|
||||
|
||||
def edit_image(input_image, instruction):
|
||||
image = instruct_pix2pix_pipeline(
|
||||
instruction,
|
||||
image=input_image,
|
||||
output_type="np",
|
||||
generator=generator,
|
||||
).images[0]
|
||||
return image
|
||||
|
||||
input_images = []
|
||||
original_captions = []
|
||||
modified_captions = []
|
||||
edited_images = []
|
||||
|
||||
for idx in range(len(dataset)):
|
||||
input_image = dataset[idx]["image"]
|
||||
edit_instruction = dataset[idx]["edit"]
|
||||
edited_image = edit_image(input_image, edit_instruction)
|
||||
|
||||
input_images.append(np.array(input_image))
|
||||
original_captions.append(dataset[idx]["input"])
|
||||
modified_captions.append(dataset[idx]["output"])
|
||||
edited_images.append(edited_image)
|
||||
```
|
||||
방향 유사도를 계산하기 위해서는 먼저 CLIP의 이미지와 텍스트 인코더를 로드합니다:
|
||||
|
||||
```python
|
||||
from transformers import (
|
||||
CLIPTokenizer,
|
||||
CLIPTextModelWithProjection,
|
||||
CLIPVisionModelWithProjection,
|
||||
CLIPImageProcessor,
|
||||
)
|
||||
|
||||
clip_id = "openai/clip-vit-large-patch14"
|
||||
tokenizer = CLIPTokenizer.from_pretrained(clip_id)
|
||||
text_encoder = CLIPTextModelWithProjection.from_pretrained(clip_id).to(device)
|
||||
image_processor = CLIPImageProcessor.from_pretrained(clip_id)
|
||||
image_encoder = CLIPVisionModelWithProjection.from_pretrained(clip_id).to(device)
|
||||
```
|
||||
|
||||
주목할 점은 특정한 CLIP 체크포인트인 `openai/clip-vit-large-patch14`를 사용하고 있다는 것입니다. 이는 Stable Diffusion 사전 훈련이 이 CLIP 변형체와 함께 수행되었기 때문입니다. 자세한 내용은 [문서](https://huggingface.co/docs/transformers/model_doc/clip)를 참조하세요.
|
||||
|
||||
다음으로, 방향성 유사도를 계산하기 위해 PyTorch의 `nn.Module`을 준비합니다:
|
||||
|
||||
```python
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
|
||||
|
||||
class DirectionalSimilarity(nn.Module):
|
||||
def __init__(self, tokenizer, text_encoder, image_processor, image_encoder):
|
||||
super().__init__()
|
||||
self.tokenizer = tokenizer
|
||||
self.text_encoder = text_encoder
|
||||
self.image_processor = image_processor
|
||||
self.image_encoder = image_encoder
|
||||
|
||||
def preprocess_image(self, image):
|
||||
image = self.image_processor(image, return_tensors="pt")["pixel_values"]
|
||||
return {"pixel_values": image.to(device)}
|
||||
|
||||
def tokenize_text(self, text):
|
||||
inputs = self.tokenizer(
|
||||
text,
|
||||
max_length=self.tokenizer.model_max_length,
|
||||
padding="max_length",
|
||||
truncation=True,
|
||||
return_tensors="pt",
|
||||
)
|
||||
return {"input_ids": inputs.input_ids.to(device)}
|
||||
|
||||
def encode_image(self, image):
|
||||
preprocessed_image = self.preprocess_image(image)
|
||||
image_features = self.image_encoder(**preprocessed_image).image_embeds
|
||||
image_features = image_features / image_features.norm(dim=1, keepdim=True)
|
||||
return image_features
|
||||
|
||||
def encode_text(self, text):
|
||||
tokenized_text = self.tokenize_text(text)
|
||||
text_features = self.text_encoder(**tokenized_text).text_embeds
|
||||
text_features = text_features / text_features.norm(dim=1, keepdim=True)
|
||||
return text_features
|
||||
|
||||
def compute_directional_similarity(self, img_feat_one, img_feat_two, text_feat_one, text_feat_two):
|
||||
sim_direction = F.cosine_similarity(img_feat_two - img_feat_one, text_feat_two - text_feat_one)
|
||||
return sim_direction
|
||||
|
||||
def forward(self, image_one, image_two, caption_one, caption_two):
|
||||
img_feat_one = self.encode_image(image_one)
|
||||
img_feat_two = self.encode_image(image_two)
|
||||
text_feat_one = self.encode_text(caption_one)
|
||||
text_feat_two = self.encode_text(caption_two)
|
||||
directional_similarity = self.compute_directional_similarity(
|
||||
img_feat_one, img_feat_two, text_feat_one, text_feat_two
|
||||
)
|
||||
return directional_similarity
|
||||
```
|
||||
|
||||
이제 `DirectionalSimilarity`를 사용해 보겠습니다.
|
||||
|
||||
```python
|
||||
dir_similarity = DirectionalSimilarity(tokenizer, text_encoder, image_processor, image_encoder)
|
||||
scores = []
|
||||
|
||||
for i in range(len(input_images)):
|
||||
original_image = input_images[i]
|
||||
original_caption = original_captions[i]
|
||||
edited_image = edited_images[i]
|
||||
modified_caption = modified_captions[i]
|
||||
|
||||
similarity_score = dir_similarity(original_image, edited_image, original_caption, modified_caption)
|
||||
scores.append(float(similarity_score.detach().cpu()))
|
||||
|
||||
print(f"CLIP directional similarity: {np.mean(scores)}")
|
||||
# CLIP directional similarity: 0.0797976553440094
|
||||
```
|
||||
|
||||
CLIP 점수와 마찬가지로, CLIP 방향 유사성이 높을수록 좋습니다.
|
||||
|
||||
`StableDiffusionInstructPix2PixPipeline`은 `image_guidance_scale`과 `guidance_scale`이라는 두 가지 인자를 노출시킵니다. 이 두 인자를 조정하여 최종 편집된 이미지의 품질을 제어할 수 있습니다. 이 두 인자의 영향을 실험해보고 방향 유사성에 미치는 영향을 확인해보기를 권장합니다.
|
||||
|
||||
이러한 메트릭의 개념을 확장하여 원본 이미지와 편집된 버전의 유사성을 측정할 수 있습니다. 이를 위해 `F.cosine_similarity(img_feat_two, img_feat_one)`을 사용할 수 있습니다. 이러한 종류의 편집에서는 이미지의 주요 의미가 최대한 보존되어야 합니다. 즉, 높은 유사성 점수를 얻어야 합니다.
|
||||
|
||||
[`StableDiffusionPix2PixZeroPipeline`](https://huggingface.co/docs/diffusers/main/en/api/pipelines/pix2pix_zero#diffusers.StableDiffusionPix2PixZeroPipeline)와 같은 유사한 파이프라인에도 이러한 메트릭을 사용할 수 있습니다.
|
||||
|
||||
<Tip>
|
||||
|
||||
CLIP 점수와 CLIP 방향 유사성 모두 CLIP 모델에 의존하기 때문에 평가가 편향될 수 있습니다
|
||||
|
||||
</Tip>
|
||||
|
||||
***IS, FID (나중에 설명할 예정), 또는 KID와 같은 메트릭을 확장하는 것은 어려울 수 있습니다***. 평가 중인 모델이 대규모 이미지 캡셔닝 데이터셋 (예: [LAION-5B 데이터셋](https://laion.ai/blog/laion-5b/))에서 사전 훈련되었을 때 이는 문제가 될 수 있습니다. 왜냐하면 이러한 메트릭의 기반에는 중간 이미지 특징을 추출하기 위해 ImageNet-1k 데이터셋에서 사전 훈련된 InceptionNet이 사용되기 때문입니다. Stable Diffusion의 사전 훈련 데이터셋은 InceptionNet의 사전 훈련 데이터셋과 겹치는 부분이 제한적일 수 있으므로 따라서 여기에는 좋은 후보가 아닙니다.
|
||||
|
||||
***위의 메트릭을 사용하면 클래스 조건이 있는 모델을 평가할 수 있습니다. 예를 들어, [DiT](https://huggingface.co/docs/diffusers/main/en/api/pipelines/dit). 이는 ImageNet-1k 클래스에 조건을 걸고 사전 훈련되었습니다.***
|
||||
|
||||
### 클래스 조건화 이미지 생성[[class-conditioned-image-generation]]
|
||||
|
||||
클래스 조건화 생성 모델은 일반적으로 [ImageNet-1k](https://huggingface.co/datasets/imagenet-1k)와 같은 클래스 레이블이 지정된 데이터셋에서 사전 훈련됩니다. 이러한 모델을 평가하는 인기있는 지표에는 Fréchet Inception Distance (FID), Kernel Inception Distance (KID) 및 Inception Score (IS)가 있습니다. 이 문서에서는 FID ([Heusel et al.](https://arxiv.org/abs/1706.08500))에 초점을 맞추고 있습니다. [`DiTPipeline`](https://huggingface.co/docs/diffusers/api/pipelines/dit)을 사용하여 FID를 계산하는 방법을 보여줍니다. 이는 내부적으로 [DiT 모델](https://arxiv.org/abs/2212.09748)을 사용합니다.
|
||||
|
||||
FID는 두 개의 이미지 데이터셋이 얼마나 유사한지를 측정하는 것을 목표로 합니다. [이 자료](https://mmgeneration.readthedocs.io/en/latest/quick_run.html#fid)에 따르면:
|
||||
|
||||
> Fréchet Inception Distance는 두 개의 이미지 데이터셋 간의 유사성을 측정하는 지표입니다. 시각적 품질에 대한 인간 판단과 잘 상관되는 것으로 나타났으며, 주로 생성적 적대 신경망의 샘플 품질을 평가하는 데 사용됩니다. FID는 Inception 네트워크의 특징 표현에 맞게 적합한 두 개의 가우시안 사이의 Fréchet 거리를 계산하여 구합니다.
|
||||
|
||||
이 두 개의 데이터셋은 실제 이미지 데이터셋과 가짜 이미지 데이터셋(우리의 경우 생성된 이미지)입니다. FID는 일반적으로 두 개의 큰 데이터셋으로 계산됩니다. 그러나 이 문서에서는 두 개의 미니 데이터셋으로 작업할 것입니다.
|
||||
|
||||
먼저 ImageNet-1k 훈련 세트에서 몇 개의 이미지를 다운로드해 봅시다:
|
||||
|
||||
```python
|
||||
from zipfile import ZipFile
|
||||
import requests
|
||||
|
||||
|
||||
def download(url, local_filepath):
|
||||
r = requests.get(url)
|
||||
with open(local_filepath, "wb") as f:
|
||||
f.write(r.content)
|
||||
return local_filepath
|
||||
|
||||
dummy_dataset_url = "https://hf.co/datasets/sayakpaul/sample-datasets/resolve/main/sample-imagenet-images.zip"
|
||||
local_filepath = download(dummy_dataset_url, dummy_dataset_url.split("/")[-1])
|
||||
|
||||
with ZipFile(local_filepath, "r") as zipper:
|
||||
zipper.extractall(".")
|
||||
```
|
||||
|
||||
```python
|
||||
from PIL import Image
|
||||
import os
|
||||
|
||||
dataset_path = "sample-imagenet-images"
|
||||
image_paths = sorted([os.path.join(dataset_path, x) for x in os.listdir(dataset_path)])
|
||||
|
||||
real_images = [np.array(Image.open(path).convert("RGB")) for path in image_paths]
|
||||
```
|
||||
|
||||
다음은 ImageNet-1k classes의 이미지 10개입니다 : "cassette_player", "chain_saw" (x2), "church", "gas_pump" (x3), "parachute" (x2), 그리고 "tench".
|
||||
|
||||
<p align="center">
|
||||
<img src="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/evaluation_diffusion_models/real-images.png" alt="real-images"><br>
|
||||
<em>Real images.</em>
|
||||
</p>
|
||||
|
||||
이제 이미지가 로드되었으므로 이미지에 가벼운 전처리를 적용하여 FID 계산에 사용해 보겠습니다.
|
||||
|
||||
```python
|
||||
from torchvision.transforms import functional as F
|
||||
|
||||
|
||||
def preprocess_image(image):
|
||||
image = torch.tensor(image).unsqueeze(0)
|
||||
image = image.permute(0, 3, 1, 2) / 255.0
|
||||
return F.center_crop(image, (256, 256))
|
||||
|
||||
real_images = torch.cat([preprocess_image(image) for image in real_images])
|
||||
print(real_images.shape)
|
||||
# torch.Size([10, 3, 256, 256])
|
||||
```
|
||||
|
||||
이제 위에서 언급한 클래스에 따라 조건화 된 이미지를 생성하기 위해 [`DiTPipeline`](https://huggingface.co/docs/diffusers/api/pipelines/dit)를 로드합니다.
|
||||
|
||||
```python
|
||||
from diffusers import DiTPipeline, DPMSolverMultistepScheduler
|
||||
|
||||
dit_pipeline = DiTPipeline.from_pretrained("facebook/DiT-XL-2-256", torch_dtype=torch.float16)
|
||||
dit_pipeline.scheduler = DPMSolverMultistepScheduler.from_config(dit_pipeline.scheduler.config)
|
||||
dit_pipeline = dit_pipeline.to("cuda")
|
||||
|
||||
words = [
|
||||
"cassette player",
|
||||
"chainsaw",
|
||||
"chainsaw",
|
||||
"church",
|
||||
"gas pump",
|
||||
"gas pump",
|
||||
"gas pump",
|
||||
"parachute",
|
||||
"parachute",
|
||||
"tench",
|
||||
]
|
||||
|
||||
class_ids = dit_pipeline.get_label_ids(words)
|
||||
output = dit_pipeline(class_labels=class_ids, generator=generator, output_type="np")
|
||||
|
||||
fake_images = output.images
|
||||
fake_images = torch.tensor(fake_images)
|
||||
fake_images = fake_images.permute(0, 3, 1, 2)
|
||||
print(fake_images.shape)
|
||||
# torch.Size([10, 3, 256, 256])
|
||||
```
|
||||
|
||||
이제 [`torchmetrics`](https://torchmetrics.readthedocs.io/)를 사용하여 FID를 계산할 수 있습니다.
|
||||
|
||||
```python
|
||||
from torchmetrics.image.fid import FrechetInceptionDistance
|
||||
|
||||
fid = FrechetInceptionDistance(normalize=True)
|
||||
fid.update(real_images, real=True)
|
||||
fid.update(fake_images, real=False)
|
||||
|
||||
print(f"FID: {float(fid.compute())}")
|
||||
# FID: 177.7147216796875
|
||||
```
|
||||
|
||||
FID는 낮을수록 좋습니다. 여러 가지 요소가 FID에 영향을 줄 수 있습니다:
|
||||
|
||||
- 이미지의 수 (실제 이미지와 가짜 이미지 모두)
|
||||
- diffusion 과정에서 발생하는 무작위성
|
||||
- diffusion 과정에서의 추론 단계 수
|
||||
- diffusion 과정에서 사용되는 스케줄러
|
||||
|
||||
마지막 두 가지 요소에 대해서는, 다른 시드와 추론 단계에서 평가를 실행하고 평균 결과를 보고하는 것은 좋은 실천 방법입니다
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
FID 결과는 많은 요소에 의존하기 때문에 취약할 수 있습니다:
|
||||
|
||||
* 계산 중 사용되는 특정 Inception 모델.
|
||||
* 계산의 구현 정확도.
|
||||
* 이미지 형식 (PNG 또는 JPG에서 시작하는 경우가 다릅니다).
|
||||
|
||||
이러한 사항을 염두에 두면, FID는 유사한 실행을 비교할 때 가장 유용하지만, 저자가 FID 측정 코드를 주의 깊게 공개하지 않는 한 논문 결과를 재현하기는 어렵습니다.
|
||||
|
||||
이러한 사항은 KID 및 IS와 같은 다른 관련 메트릭에도 적용됩니다.
|
||||
|
||||
</Tip>
|
||||
|
||||
마지막 단계로, `fake_images`를 시각적으로 검사해 봅시다.
|
||||
|
||||
<p align="center">
|
||||
<img src="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/evaluation_diffusion_models/fake-images.png" alt="fake-images"><br>
|
||||
<em>Fake images.</em>
|
||||
</p>
|
||||
103
docs/source/ko/conceptual/philosophy.md
Normal file
103
docs/source/ko/conceptual/philosophy.md
Normal file
@@ -0,0 +1,103 @@
|
||||
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# 철학
|
||||
|
||||
🧨 Diffusers는 다양한 모달리티에서 **최신의** 사전 훈련된 diffusion 모델을 제공합니다.
|
||||
그 목적은 추론과 훈련을 위한 **모듈식 툴박스**로 사용되는 것입니다.
|
||||
|
||||
우리는 오랜 시간에 견딜 수 있는 라이브러리를 구축하는 것을 목표로 하고, 따라서 API 설계를 매우 중요시합니다.
|
||||
|
||||
간단히 말해서, Diffusers는 PyTorch의 자연스러운 확장이 되도록 구축되었습니다. 따라서 대부분의 설계 선택은 [PyTorch의 설계 원칙](https://pytorch.org/docs/stable/community/design.html#pytorch-design-philosophy)에 기반합니다. 이제 가장 중요한 것들을 살펴보겠습니다:
|
||||
|
||||
## 성능보다는 사용성을
|
||||
|
||||
- Diffusers는 많은 내장 성능 향상 기능을 갖고 있지만 (자세한 내용은 [메모리와 속도](https://huggingface.co/docs/diffusers/optimization/fp16) 참조), 모델은 항상 가장 높은 정밀도와 최소한의 최적화로 로드됩니다. 따라서 기본적인 diffusion 파이프라인은 따로 정의하지 않는다면 CPU에서 float32 정밀도로 인스턴스화됩니다. 이는 다양한 플랫폼과 가속기에서의 사용성을 보장하며, 라이브러리를 실행하기 위해 복잡한 설치가 필요하지 않음을 의미합니다.
|
||||
- Diffusers는 **가벼운** 패키지를 지향하기 때문에 필수 종속성은 거의 없지만 성능을 향상시킬 수 있는 많은 선택적 종속성이 있습니다 (`accelerate`, `safetensors`, `onnx` 등). 저희는 라이브러리를 가능한 한 가볍게 유지하여 다른 패키지에 대한 종속성 걱정이 없도록 노력하고 있습니다.
|
||||
- Diffusers는 간결하고 이해하기 쉬운 코드를 선호합니다. 이는 람다 함수나 고급 PyTorch 연산자와 같은 압축된 코드 구문을 자주 사용하지 않는 것을 의미합니다.
|
||||
|
||||
## 쉬움보다는 간단함을
|
||||
|
||||
PyTorch에서는 **명시적인 것이 암시적인 것보다 낫다**와 **단순한 것이 복잡한 것보다 낫다**라고 말합니다. 이 설계 철학은 라이브러리의 여러 부분에 반영되어 있습니다:
|
||||
- [`DiffusionPipeline.to`](https://huggingface.co/docs/diffusers/main/en/api/diffusion_pipeline#diffusers.DiffusionPipeline.to)와 같은 메서드를 사용하여 사용자가 장치 관리를 할 수 있도록 PyTorch의 API를 따릅니다.
|
||||
- 잘못된 입력을 조용히 수정하는 대신 간결한 오류 메시지를 발생시키는 것이 우선입니다. Diffusers는 라이브러리를 가능한 한 쉽게 사용할 수 있도록 하는 것보다 사용자를 가르치는 것을 목표로 합니다.
|
||||
- 복잡한 모델과 스케줄러 로직이 내부에서 마법처럼 처리하는 대신 노출됩니다. 스케줄러/샘플러는 서로에게 최소한의 종속성을 가지고 분리되어 있습니다. 이로써 사용자는 언롤된 노이즈 제거 루프를 작성해야 합니다. 그러나 이 분리는 디버깅을 더 쉽게하고 노이즈 제거 과정을 조정하거나 diffusers 모델이나 스케줄러를 교체하는 데 사용자에게 더 많은 제어권을 제공합니다.
|
||||
- diffusers 파이프라인의 따로 훈련된 구성 요소인 text encoder, unet 및 variational autoencoder는 각각 자체 모델 클래스를 갖습니다. 이로써 사용자는 서로 다른 모델의 구성 요소 간의 상호 작용을 처리해야 하며, 직렬화 형식은 모델 구성 요소를 다른 파일로 분리합니다. 그러나 이는 디버깅과 커스터마이징을 더 쉽게합니다. DreamBooth나 Textual Inversion 훈련은 Diffusers의 'diffusion 파이프라인의 단일 구성 요소들을 분리할 수 있는 능력' 덕분에 매우 간단합니다.
|
||||
|
||||
## 추상화보다는 수정 가능하고 기여하기 쉬움을
|
||||
|
||||
라이브러리의 대부분에 대해 Diffusers는 [Transformers 라이브러리](https://github.com/huggingface/transformers)의 중요한 설계 원칙을 채택합니다, 바로 성급한 추상화보다는 copy-pasted 코드를 선호한다는 것입니다. 이 설계 원칙은 [Don't repeat yourself (DRY)](https://en.wikipedia.org/wiki/Don%27t_repeat_yourself)와 같은 인기 있는 설계 원칙과는 대조적으로 매우 의견이 분분한데요.
|
||||
간단히 말해서, Transformers가 모델링 파일에 대해 수행하는 것처럼, Diffusers는 매우 낮은 수준의 추상화와 매우 독립적인 코드를 유지하는 것을 선호합니다. 함수, 긴 코드 블록, 심지어 클래스도 여러 파일에 복사할 수 있으며, 이는 처음에는 라이브러리를 유지할 수 없게 만드는 나쁜, 서투른 설계 선택으로 보일 수 있습니다. 하지만 이러한 설계는 매우 성공적이며, 커뮤니티 기반의 오픈 소스 기계 학습 라이브러리에 매우 적합합니다. 그 이유는 다음과 같습니다:
|
||||
- 기계 학습은 패러다임, 모델 아키텍처 및 알고리즘이 빠르게 변화하는 매우 빠르게 움직이는 분야이기 때문에 오랜 기간 지속되는 코드 추상화를 정의하기가 매우 어렵습니다.
|
||||
- 기계 학습 전문가들은 아이디어와 연구를 위해 기존 코드를 빠르게 조정할 수 있어야 하므로, 많은 추상화보다는 독립적인 코드를 선호합니다.
|
||||
- 오픈 소스 라이브러리는 커뮤니티 기여에 의존하므로, 기여하기 쉬운 라이브러리를 구축해야 합니다. 코드가 추상화되면 의존성이 많아지고 읽기 어렵고 기여하기 어려워집니다. 기여자들은 중요한 기능을 망가뜨릴까 두려워하여 매우 추상화된 라이브러리에 기여하지 않게 됩니다. 라이브러리에 기여하는 것이 다른 기본 코드를 망가뜨릴 수 없다면, 잠재적인 새로운 기여자에게 더욱 환영받을 수 있을 뿐만 아니라 여러 부분에 대해 병렬적으로 검토하고 기여하기가 더 쉬워집니다.
|
||||
|
||||
Hugging Face에서는 이 설계를 **단일 파일 정책**이라고 부르며, 특정 클래스의 대부분의 코드가 단일하고 독립적인 파일에 작성되어야 한다는 의미입니다. 철학에 대해 자세히 알아보려면 [이 블로그 글](https://huggingface.co/blog/transformers-design-philosophy)을 참조할 수 있습니다.
|
||||
|
||||
Diffusers에서는 이러한 철학을 파이프라인과 스케줄러에 모두 따르지만, diffusion 모델에 대해서는 일부만 따릅니다. 일부만 따르는 이유는 Diffusion 파이프라인인 [DDPM](https://huggingface.co/docs/diffusers/api/pipelines/ddpm), [Stable Diffusion](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/overview#stable-diffusion-pipelines), [unCLIP (DALL·E 2)](https://huggingface.co/docs/diffusers/api/pipelines/unclip) 및 [Imagen](https://imagen.research.google/) 등 대부분의 diffusion 파이프라인은 동일한 diffusion 모델인 [UNet](https://huggingface.co/docs/diffusers/api/models/unet2d-cond)에 의존하기 때문입니다.
|
||||
|
||||
좋아요, 이제 🧨 Diffusers가 설계된 방식을 대략적으로 이해했을 것입니다 🤗.
|
||||
우리는 이러한 설계 원칙을 일관되게 라이브러리 전체에 적용하려고 노력하고 있습니다. 그럼에도 불구하고 철학에 대한 일부 예외 사항이나 불행한 설계 선택이 있을 수 있습니다. 디자인에 대한 피드백이 있다면 [GitHub에서 직접](https://github.com/huggingface/diffusers/issues/new?assignees=&labels=&template=feedback.md&title=) 알려주시면 감사하겠습니다.
|
||||
|
||||
## 디자인 철학 자세히 알아보기
|
||||
|
||||
이제 디자인 철학의 세부 사항을 좀 더 자세히 살펴보겠습니다. Diffusers는 주로 세 가지 주요 클래스로 구성됩니다: [파이프라인](https://github.com/huggingface/diffusers/tree/main/src/diffusers/pipelines), [모델](https://github.com/huggingface/diffusers/tree/main/src/diffusers/models), 그리고 [스케줄러](https://github.com/huggingface/diffusers/tree/main/src/diffusers/schedulers). 각 클래스에 대한 더 자세한 설계 결정 사항을 살펴보겠습니다.
|
||||
|
||||
### 파이프라인
|
||||
|
||||
파이프라인은 사용하기 쉽도록 설계되었으며 (따라서 [*쉬움보다는 간단함을*](#쉬움보다는-간단함을)을 100% 따르지는 않음), feature-complete하지 않으며, 추론을 위한 [모델](#모델)과 [스케줄러](#스케줄러)를 사용하는 방법의 예시로 간주될 수 있습니다.
|
||||
|
||||
다음과 같은 설계 원칙을 따릅니다:
|
||||
- 파이프라인은 단일 파일 정책을 따릅니다. 모든 파이프라인은 src/diffusers/pipelines의 개별 디렉토리에 있습니다. 하나의 파이프라인 폴더는 하나의 diffusion 논문/프로젝트/릴리스에 해당합니다. 여러 파이프라인 파일은 하나의 파이프라인 폴더에 모을 수 있습니다. 예를 들어 [`src/diffusers/pipelines/stable-diffusion`](https://github.com/huggingface/diffusers/tree/main/src/diffusers/pipelines/stable_diffusion)에서 그렇게 하고 있습니다. 파이프라인이 유사한 기능을 공유하는 경우, [#Copied from mechanism](https://github.com/huggingface/diffusers/blob/125d783076e5bd9785beb05367a2d2566843a271/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py#L251)을 사용할 수 있습니다.
|
||||
- 파이프라인은 모두 [`DiffusionPipeline`]을 상속합니다.
|
||||
- 각 파이프라인은 서로 다른 모델 및 스케줄러 구성 요소로 구성되어 있으며, 이는 [`model_index.json` 파일](https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/model_index.json)에 문서화되어 있으며, 파이프라인의 속성 이름과 동일한 이름으로 액세스할 수 있으며, [`DiffusionPipeline.components`](https://huggingface.co/docs/diffusers/main/en/api/diffusion_pipeline#diffusers.DiffusionPipeline.components) 함수를 통해 파이프라인 간에 공유할 수 있습니다.
|
||||
- 각 파이프라인은 [`DiffusionPipeline.from_pretrained`](https://huggingface.co/docs/diffusers/main/en/api/diffusion_pipeline#diffusers.DiffusionPipeline.from_pretrained) 함수를 통해 로드할 수 있어야 합니다.
|
||||
- 파이프라인은 추론에**만** 사용되어야 합니다.
|
||||
- 파이프라인은 매우 가독성이 좋고, 이해하기 쉽고, 쉽게 조정할 수 있도록 설계되어야 합니다.
|
||||
- 파이프라인은 서로 상호작용하고, 상위 수준 API에 쉽게 통합할 수 있도록 설계되어야 합니다.
|
||||
- 파이프라인은 사용자 인터페이스가 feature-complete하지 않게 하는 것을 목표로 합니다. future-complete한 사용자 인터페이스를 원한다면 [InvokeAI](https://github.com/invoke-ai/InvokeAI), [Diffuzers](https://github.com/abhishekkrthakur/diffuzers), [lama-cleaner](https://github.com/Sanster/lama-cleaner)를 참조해야 합니다.
|
||||
- 모든 파이프라인은 오로지 `__call__` 메서드를 통해 실행할 수 있어야 합니다. `__call__` 인자의 이름은 모든 파이프라인에서 공유되어야 합니다.
|
||||
- 파이프라인은 해결하고자 하는 작업의 이름으로 지정되어야 합니다.
|
||||
- 대부분의 경우에 새로운 diffusion 파이프라인은 새로운 파이프라인 폴더/파일에 구현되어야 합니다.
|
||||
|
||||
### 모델
|
||||
|
||||
모델은 [PyTorch의 Module 클래스](https://pytorch.org/docs/stable/generated/torch.nn.Module.html)의 자연스러운 확장이 되도록, 구성 가능한 툴박스로 설계되었습니다. 그리고 모델은 **단일 파일 정책**을 일부만 따릅니다.
|
||||
|
||||
다음과 같은 설계 원칙을 따릅니다:
|
||||
- 모델은 **모델 아키텍처 유형**에 해당합니다. 예를 들어 [`UNet2DConditionModel`] 클래스는 2D 이미지 입력을 기대하고 일부 context에 의존하는 모든 UNet 변형들에 사용됩니다.
|
||||
- 모든 모델은 [`src/diffusers/models`](https://github.com/huggingface/diffusers/tree/main/src/diffusers/models)에서 찾을 수 있으며, 각 모델 아키텍처는 해당 파일에 정의되어야 합니다. 예를 들어 [`unet_2d_condition.py`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/unet_2d_condition.py), [`transformer_2d.py`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/transformer_2d.py) 등이 있습니다.
|
||||
- 모델은 **단일 파일 정책**을 따르지 않으며, [`attention.py`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention.py), [`resnet.py`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/resnet.py), [`embeddings.py`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/embeddings.py) 등과 같은 작은 모델 구성 요소를 사용해야 합니다. **참고**: 이는 Transformers의 모델링 파일과는 대조적으로 모델이 실제로 단일 파일 정책을 따르지 않음을 보여줍니다.
|
||||
- 모델은 PyTorch의 `Module` 클래스와 마찬가지로 복잡성을 노출하고 명확한 오류 메시지를 제공해야 합니다.
|
||||
- 모든 모델은 `ModelMixin`과 `ConfigMixin`을 상속합니다.
|
||||
- 모델은 주요 코드 변경이 필요하지 않고, 역호환성을 유지하며, 메모리 또는 컴퓨팅과 관련한 중요한 이득을 제공할 때 성능을 위해 최적화할 수 있습니다.
|
||||
- 모델은 기본적으로 가장 높은 정밀도와 가장 낮은 성능 설정을 가져야 합니다.
|
||||
- Diffusers에 이미 있는 모델 아키텍처로 분류할 수 있는 새로운 모델 체크포인트를 통합할 때는 기존 모델 아키텍처를 새로운 체크포인트와 호환되도록 수정해야 합니다. 새로운 파일을 만들어야 하는 경우는 모델 아키텍처가 근본적으로 다른 경우에만 해당합니다.
|
||||
- 모델은 미래의 변경 사항을 쉽게 확장할 수 있도록 설계되어야 합니다. 이는 공개 함수 인수들과 구성 인수들을 제한하고,미래의 변경 사항을 "예상"하는 것을 통해 달성할 수 있습니다. 예를 들어, 불리언 `is_..._type` 인수보다는 새로운 미래 유형에 쉽게 확장할 수 있는 문자열 "...type" 인수를 추가하는 것이 일반적으로 더 좋습니다. 새로운 모델 체크포인트가 작동하도록 하기 위해 기존 아키텍처에 최소한의 변경만을 가해야 합니다.
|
||||
- 모델 디자인은 코드의 가독성과 간결성을 유지하는 것과 많은 모델 체크포인트를 지원하는 것 사이의 어려운 균형 조절입니다. 모델링 코드의 대부분은 새로운 모델 체크포인트를 위해 클래스를 수정하는 것이 좋지만, [UNet 블록](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/unet_2d_blocks.py) 및 [Attention 프로세서](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py)와 같이 코드를 장기적으로 간결하고 읽기 쉽게 유지하기 위해 새로운 클래스를 추가하는 예외도 있습니다.
|
||||
|
||||
### 스케줄러
|
||||
|
||||
스케줄러는 추론을 위한 노이즈 제거 과정을 안내하고 훈련을 위한 노이즈 스케줄을 정의하는 역할을 합니다. 스케줄러는 개별 클래스로 설계되어 있으며, 로드 가능한 구성 파일과 **단일 파일 정책**을 엄격히 따릅니다.
|
||||
|
||||
다음과 같은 설계 원칙을 따릅니다:
|
||||
- 모든 스케줄러는 [`src/diffusers/schedulers`](https://github.com/huggingface/diffusers/tree/main/src/diffusers/schedulers)에서 찾을 수 있습니다.
|
||||
- 스케줄러는 큰 유틸리티 파일에서 가져오지 **않아야** 하며, 자체 포함성을 유지해야 합니다.
|
||||
- 하나의 스케줄러 Python 파일은 하나의 스케줄러 알고리즘(논문에서 정의된 것과 같은)에 해당합니다.
|
||||
- 스케줄러가 유사한 기능을 공유하는 경우, `#Copied from` 메커니즘을 사용할 수 있습니다.
|
||||
- 모든 스케줄러는 `SchedulerMixin`과 `ConfigMixin`을 상속합니다.
|
||||
- [`ConfigMixin.from_config`](https://huggingface.co/docs/diffusers/main/en/api/configuration#diffusers.ConfigMixin.from_config) 메서드를 사용하여 스케줄러를 쉽게 교체할 수 있습니다. 자세한 내용은 [여기](../using-diffusers/schedulers.md)에서 설명합니다.
|
||||
- 모든 스케줄러는 `set_num_inference_steps`와 `step` 함수를 가져야 합니다. `set_num_inference_steps(...)`는 각 노이즈 제거 과정(즉, `step(...)`이 호출되기 전) 이전에 호출되어야 합니다.
|
||||
- 각 스케줄러는 모델이 호출될 타임스텝의 배열인 `timesteps` 속성을 통해 루프를 돌 수 있는 타임스텝을 노출합니다.
|
||||
- `step(...)` 함수는 예측된 모델 출력과 "현재" 샘플(x_t)을 입력으로 받고, "이전" 약간 더 노이즈가 제거된 샘플(x_t-1)을 반환합니다.
|
||||
- 노이즈 제거 스케줄러의 복잡성을 고려하여, `step` 함수는 모든 복잡성을 노출하지 않으며, "블랙 박스"일 수 있습니다.
|
||||
- 거의 모든 경우에 새로운 스케줄러는 새로운 스케줄링 파일에 구현되어야 합니다.
|
||||
@@ -93,13 +93,13 @@ cd diffusers
|
||||
|
||||
**PyTorch의 경우**
|
||||
|
||||
```
|
||||
```sh
|
||||
pip install -e ".[torch]"
|
||||
```
|
||||
|
||||
**Flax의 경우**
|
||||
|
||||
```
|
||||
```sh
|
||||
pip install -e ".[flax]"
|
||||
```
|
||||
|
||||
|
||||
@@ -19,13 +19,13 @@ specific language governing permissions and limitations under the License.
|
||||
|
||||
다음 명령어로 ONNX Runtime를 지원하는 🤗 Optimum를 설치합니다:
|
||||
|
||||
```
|
||||
```sh
|
||||
pip install optimum["onnxruntime"]
|
||||
```
|
||||
|
||||
## Stable Diffusion 추론
|
||||
|
||||
아래 코드는 ONNX 런타임을 사용하는 방법을 보여줍니다. `StableDiffusionPipeline` 대신 `OnnxStableDiffusionPipeline`을 사용해야 합니다.
|
||||
아래 코드는 ONNX 런타임을 사용하는 방법을 보여줍니다. `StableDiffusionPipeline` 대신 `OnnxStableDiffusionPipeline`을 사용해야 합니다.
|
||||
PyTorch 모델을 불러오고 즉시 ONNX 형식으로 변환하려는 경우 `export=True`로 설정합니다.
|
||||
|
||||
```python
|
||||
@@ -38,7 +38,7 @@ images = pipe(prompt).images[0]
|
||||
pipe.save_pretrained("./onnx-stable-diffusion-v1-5")
|
||||
```
|
||||
|
||||
파이프라인을 ONNX 형식으로 오프라인으로 내보내고 나중에 추론에 사용하려는 경우,
|
||||
파이프라인을 ONNX 형식으로 오프라인으로 내보내고 나중에 추론에 사용하려는 경우,
|
||||
[`optimum-cli export`](https://huggingface.co/docs/optimum/main/en/exporters/onnx/usage_guides/export_a_model#exporting-a-model-to-onnx-using-the-cli) 명령어를 사용할 수 있습니다:
|
||||
|
||||
```bash
|
||||
@@ -47,7 +47,7 @@ optimum-cli export onnx --model runwayml/stable-diffusion-v1-5 sd_v15_onnx/
|
||||
|
||||
그 다음 추론을 수행합니다:
|
||||
|
||||
```python
|
||||
```python
|
||||
from optimum.onnxruntime import ORTStableDiffusionPipeline
|
||||
|
||||
model_id = "sd_v15_onnx"
|
||||
|
||||
@@ -19,7 +19,7 @@ specific language governing permissions and limitations under the License.
|
||||
|
||||
다음 명령어로 🤗 Optimum을 설치합니다:
|
||||
|
||||
```
|
||||
```sh
|
||||
pip install optimum["openvino"]
|
||||
```
|
||||
|
||||
|
||||
@@ -59,7 +59,7 @@ image
|
||||
|
||||
먼저 `compel` 라이브러리를 설치해야합니다:
|
||||
|
||||
```
|
||||
```sh
|
||||
pip install compel
|
||||
```
|
||||
|
||||
|
||||
@@ -95,13 +95,13 @@ cd diffusers
|
||||
|
||||
**PyTorch**
|
||||
|
||||
```
|
||||
```sh
|
||||
pip install -e ".[torch]"
|
||||
```
|
||||
|
||||
**Flax**
|
||||
|
||||
```
|
||||
```sh
|
||||
pip install -e ".[flax]"
|
||||
```
|
||||
|
||||
|
||||
@@ -71,7 +71,7 @@ from diffusers.utils.import_utils import is_xformers_available
|
||||
|
||||
|
||||
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
|
||||
check_min_version("0.28.0.dev0")
|
||||
check_min_version("0.29.0.dev0")
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
@@ -758,7 +758,7 @@ class TokenEmbeddingsHandler:
|
||||
|
||||
idx += 1
|
||||
|
||||
# copied from train_dreambooth_lora_sdxl_advanced.py
|
||||
# Copied from train_dreambooth_lora_sdxl_advanced.py
|
||||
def save_embeddings(self, file_path: str):
|
||||
assert self.train_ids is not None, "Initialize new tokens before saving embeddings."
|
||||
tensors = {}
|
||||
@@ -981,7 +981,7 @@ def collate_fn(examples, with_prior_preservation=False):
|
||||
|
||||
|
||||
class PromptDataset(Dataset):
|
||||
"A simple dataset to prepare the prompts to generate class images on multiple GPUs."
|
||||
"""A simple dataset to prepare the prompts to generate class images on multiple GPUs."""
|
||||
|
||||
def __init__(self, prompt, num_samples):
|
||||
self.prompt = prompt
|
||||
|
||||
@@ -78,7 +78,7 @@ from diffusers.utils.torch_utils import is_compiled_module
|
||||
|
||||
|
||||
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
|
||||
check_min_version("0.28.0.dev0")
|
||||
check_min_version("0.29.0.dev0")
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
@@ -1136,7 +1136,7 @@ def collate_fn(examples, with_prior_preservation=False):
|
||||
|
||||
|
||||
class PromptDataset(Dataset):
|
||||
"A simple dataset to prepare the prompts to generate class images on multiple GPUs."
|
||||
"""A simple dataset to prepare the prompts to generate class images on multiple GPUs."""
|
||||
|
||||
def __init__(self, prompt, num_samples):
|
||||
self.prompt = prompt
|
||||
|
||||
@@ -68,6 +68,8 @@ Please also check out our [Community Scripts](https://github.com/huggingface/dif
|
||||
| InstantID Pipeline | Stable Diffusion XL Pipeline that supports InstantID | [InstantID Pipeline](#instantid-pipeline) | [](https://huggingface.co/spaces/InstantX/InstantID) | [Haofan Wang](https://github.com/haofanwang) |
|
||||
| UFOGen Scheduler | Scheduler for UFOGen Model (compatible with Stable Diffusion pipelines) | [UFOGen Scheduler](#ufogen-scheduler) | - | [dg845](https://github.com/dg845) |
|
||||
| Stable Diffusion XL IPEX Pipeline | Accelerate Stable Diffusion XL inference pipeline with BF16/FP32 precision on Intel Xeon CPUs with [IPEX](https://github.com/intel/intel-extension-for-pytorch) | [Stable Diffusion XL on IPEX](#stable-diffusion-xl-on-ipex) | - | [Dan Li](https://github.com/ustcuna/) |
|
||||
| Stable Diffusion BoxDiff Pipeline | Training-free controlled generation with bounding boxes using [BoxDiff](https://github.com/showlab/BoxDiff) | [Stable Diffusion BoxDiff Pipeline](#stable-diffusion-boxdiff) | - | [Jingyang Zhang](https://github.com/zjysteven/) |
|
||||
| FRESCO V2V Pipeline | Implementation of [[CVPR 2024] FRESCO: Spatial-Temporal Correspondence for Zero-Shot Video Translation](https://arxiv.org/abs/2403.12962) | [FRESCO V2V Pipeline](#fresco) | - | [Yifan Zhou](https://github.com/SingleZombie) |
|
||||
|
||||
To load a custom pipeline you just need to pass the `custom_pipeline` argument to `DiffusionPipeline`, as one of the files in `diffusers/examples/community`. Feel free to send a PR with your own pipelines, we will merge them quickly.
|
||||
|
||||
@@ -238,12 +240,12 @@ pipeline_output = pipe(
|
||||
# denoising_steps=10, # (optional) Number of denoising steps of each inference pass. Default: 10.
|
||||
# ensemble_size=10, # (optional) Number of inference passes in the ensemble. Default: 10.
|
||||
# ------------------------------------------------
|
||||
|
||||
|
||||
# ----- recommended setting for LCM version ------
|
||||
# denoising_steps=4,
|
||||
# ensemble_size=5,
|
||||
# -------------------------------------------------
|
||||
|
||||
|
||||
# processing_res=768, # (optional) Maximum resolution of processing. If set to 0: will not resize at all. Defaults to 768.
|
||||
# match_input_res=True, # (optional) Resize depth prediction to match input resolution.
|
||||
# batch_size=0, # (optional) Inference batch size, no bigger than `num_ensemble`. If set to 0, the script will automatically decide the proper batch size. Defaults to 0.
|
||||
@@ -1030,7 +1032,7 @@ image = pipe().images[0]
|
||||
|
||||
Make sure you have @crowsonkb's <https://github.com/crowsonkb/k-diffusion> installed:
|
||||
|
||||
```
|
||||
```sh
|
||||
pip install k-diffusion
|
||||
```
|
||||
|
||||
@@ -1676,6 +1678,68 @@ image = pipe(prompt, image=input_image, strength=0.75,).images[0]
|
||||
image.save('tensorrt_img2img_new_zealand_hills.png')
|
||||
```
|
||||
|
||||
### Stable Diffusion BoxDiff
|
||||
BoxDiff is a training-free method for controlled generation with bounding box coordinates. It shoud work with any Stable Diffusion model. Below shows an example with `stable-diffusion-2-1-base`.
|
||||
```py
|
||||
import torch
|
||||
from PIL import Image, ImageDraw
|
||||
from copy import deepcopy
|
||||
|
||||
from examples.community.pipeline_stable_diffusion_boxdiff import StableDiffusionBoxDiffPipeline
|
||||
|
||||
def draw_box_with_text(img, boxes, names):
|
||||
colors = ["red", "olive", "blue", "green", "orange", "brown", "cyan", "purple"]
|
||||
img_new = deepcopy(img)
|
||||
draw = ImageDraw.Draw(img_new)
|
||||
|
||||
W, H = img.size
|
||||
for bid, box in enumerate(boxes):
|
||||
draw.rectangle([box[0] * W, box[1] * H, box[2] * W, box[3] * H], outline=colors[bid % len(colors)], width=4)
|
||||
draw.text((box[0] * W, box[1] * H), names[bid], fill=colors[bid % len(colors)])
|
||||
return img_new
|
||||
|
||||
pipe = StableDiffusionBoxDiffPipeline.from_pretrained(
|
||||
"stabilityai/stable-diffusion-2-1-base",
|
||||
torch_dtype=torch.float16,
|
||||
)
|
||||
pipe.to("cuda")
|
||||
|
||||
# example 1
|
||||
prompt = "as the aurora lights up the sky, a herd of reindeer leisurely wanders on the grassy meadow, admiring the breathtaking view, a serene lake quietly reflects the magnificent display, and in the distance, a snow-capped mountain stands majestically, fantasy, 8k, highly detailed"
|
||||
phrases = [
|
||||
"aurora",
|
||||
"reindeer",
|
||||
"meadow",
|
||||
"lake",
|
||||
"mountain"
|
||||
]
|
||||
boxes = [[1,3,512,202], [75,344,421,495], [1,327,508,507], [2,217,507,341], [1,135,509,242]]
|
||||
|
||||
# example 2
|
||||
# prompt = "A rabbit wearing sunglasses looks very proud"
|
||||
# phrases = ["rabbit", "sunglasses"]
|
||||
# boxes = [[67,87,366,512], [66,130,364,262]]
|
||||
|
||||
boxes = [[x / 512 for x in box] for box in boxes]
|
||||
|
||||
images = pipe(
|
||||
prompt,
|
||||
boxdiff_phrases=phrases,
|
||||
boxdiff_boxes=boxes,
|
||||
boxdiff_kwargs={
|
||||
"attention_res": 16,
|
||||
"normalize_eot": True
|
||||
},
|
||||
num_inference_steps=50,
|
||||
guidance_scale=7.5,
|
||||
generator=torch.manual_seed(42),
|
||||
safety_checker=None
|
||||
).images
|
||||
|
||||
draw_box_with_text(images[0], boxes, phrases).save("output.png")
|
||||
```
|
||||
|
||||
|
||||
### Stable Diffusion Reference
|
||||
|
||||
This pipeline uses the Reference Control. Refer to the [sd-webui-controlnet discussion: Reference-only Control](https://github.com/Mikubill/sd-webui-controlnet/discussions/1236)[sd-webui-controlnet discussion: Reference-adain Control](https://github.com/Mikubill/sd-webui-controlnet/discussions/1280).
|
||||
@@ -1790,13 +1854,13 @@ To use this pipeline, you need to:
|
||||
|
||||
You can simply use pip to install IPEX with the latest version.
|
||||
|
||||
```python
|
||||
```sh
|
||||
python -m pip install intel_extension_for_pytorch
|
||||
```
|
||||
|
||||
**Note:** To install a specific version, run with the following command:
|
||||
|
||||
```
|
||||
```sh
|
||||
python -m pip install intel_extension_for_pytorch==<version_name> -f https://developer.intel.com/ipex-whl-stable-cpu
|
||||
```
|
||||
|
||||
@@ -1894,13 +1958,13 @@ To use this pipeline, you need to:
|
||||
|
||||
You can simply use pip to install IPEX with the latest version.
|
||||
|
||||
```python
|
||||
```sh
|
||||
python -m pip install intel_extension_for_pytorch
|
||||
```
|
||||
|
||||
**Note:** To install a specific version, run with the following command:
|
||||
|
||||
```
|
||||
```sh
|
||||
python -m pip install intel_extension_for_pytorch==<version_name> -f https://developer.intel.com/ipex-whl-stable-cpu
|
||||
```
|
||||
|
||||
@@ -2946,8 +3010,8 @@ This code implements a pipeline for the Stable Diffusion model, enabling the div
|
||||
|
||||
### Sample Code
|
||||
|
||||
```
|
||||
from from examples.community.regional_prompting_stable_diffusion import RegionalPromptingStableDiffusionPipeline
|
||||
```py
|
||||
from examples.community.regional_prompting_stable_diffusion import RegionalPromptingStableDiffusionPipeline
|
||||
pipe = RegionalPromptingStableDiffusionPipeline.from_single_file(model_path, vae=vae)
|
||||
|
||||
rp_args = {
|
||||
@@ -3972,6 +4036,93 @@ onestep_image = pipe(prompt, num_inference_steps=1).images[0]
|
||||
multistep_image = pipe(prompt, num_inference_steps=4).images[0]
|
||||
```
|
||||
|
||||
### FRESCO
|
||||
|
||||
This is the Diffusers implementation of zero-shot video-to-video translation pipeline [FRESCO](https://github.com/williamyang1991/FRESCO) (without Ebsynth postprocessing and background smooth). To run the code, please install gmflow. Then modify the path in `gmflow_dir`. After that, you can run the pipeline with:
|
||||
|
||||
```py
|
||||
from PIL import Image
|
||||
import cv2
|
||||
import torch
|
||||
import numpy as np
|
||||
|
||||
from diffusers import ControlNetModel,DDIMScheduler, DiffusionPipeline
|
||||
import sys
|
||||
gmflow_dir = "/path/to/gmflow"
|
||||
sys.path.insert(0, gmflow_dir)
|
||||
|
||||
def video_to_frame(video_path: str, interval: int):
|
||||
vidcap = cv2.VideoCapture(video_path)
|
||||
success = True
|
||||
|
||||
count = 0
|
||||
res = []
|
||||
while success:
|
||||
count += 1
|
||||
success, image = vidcap.read()
|
||||
if count % interval != 1:
|
||||
continue
|
||||
if image is not None:
|
||||
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
|
||||
res.append(image)
|
||||
if len(res) >= 8:
|
||||
break
|
||||
|
||||
vidcap.release()
|
||||
return res
|
||||
|
||||
|
||||
input_video_path = 'https://github.com/williamyang1991/FRESCO/raw/main/data/car-turn.mp4'
|
||||
output_video_path = 'car.gif'
|
||||
|
||||
# You can use any fintuned SD here
|
||||
model_path = 'SG161222/Realistic_Vision_V2.0'
|
||||
|
||||
prompt = 'a red car turns in the winter'
|
||||
a_prompt = ', RAW photo, subject, (high detailed skin:1.2), 8k uhd, dslr, soft lighting, high quality, film grain, Fujifilm XT3, '
|
||||
n_prompt = '(deformed iris, deformed pupils, semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime, mutated hands and fingers:1.4), (deformed, distorted, disfigured:1.3), poorly drawn, bad anatomy, wrong anatomy, extra limb, missing limb, floating limbs, disconnected limbs, mutation, mutated, ugly, disgusting, amputation'
|
||||
|
||||
input_interval = 5
|
||||
frames = video_to_frame(
|
||||
input_video_path, input_interval)
|
||||
|
||||
control_frames = []
|
||||
# get canny image
|
||||
for frame in frames:
|
||||
image = cv2.Canny(frame, 50, 100)
|
||||
np_image = np.array(image)
|
||||
np_image = np_image[:, :, None]
|
||||
np_image = np.concatenate([np_image, np_image, np_image], axis=2)
|
||||
canny_image = Image.fromarray(np_image)
|
||||
control_frames.append(canny_image)
|
||||
|
||||
# You can use any ControlNet here
|
||||
controlnet = ControlNetModel.from_pretrained(
|
||||
"lllyasviel/sd-controlnet-canny").to('cuda')
|
||||
|
||||
pipe = DiffusionPipeline.from_pretrained(
|
||||
model_path, controlnet=controlnet, custom_pipeline='fresco_v2v').to('cuda')
|
||||
pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
|
||||
|
||||
generator = torch.manual_seed(0)
|
||||
frames = [Image.fromarray(frame) for frame in frames]
|
||||
|
||||
output_frames = pipe(
|
||||
prompt + a_prompt,
|
||||
frames,
|
||||
control_frames,
|
||||
num_inference_steps=20,
|
||||
strength=0.75,
|
||||
controlnet_conditioning_scale=0.7,
|
||||
generator=generator,
|
||||
negative_prompt=n_prompt
|
||||
).images
|
||||
|
||||
output_frames[0].save(output_video_path, save_all=True,
|
||||
append_images=output_frames[1:], duration=100, loop=0)
|
||||
|
||||
```
|
||||
|
||||
# Perturbed-Attention Guidance
|
||||
|
||||
[Project](https://ku-cvlab.github.io/Perturbed-Attention-Guidance/) / [arXiv](https://arxiv.org/abs/2403.17377) / [GitHub](https://github.com/KU-CVLAB/Perturbed-Attention-Guidance)
|
||||
@@ -3980,7 +4131,7 @@ This implementation is based on [Diffusers](https://huggingface.co/docs/diffuser
|
||||
|
||||
## Example Usage
|
||||
|
||||
```
|
||||
```py
|
||||
import os
|
||||
import torch
|
||||
|
||||
|
||||
@@ -25,7 +25,7 @@ from diffusers.utils.torch_utils import randn_tensor
|
||||
|
||||
EXAMPLE_DOC_STRING = """
|
||||
Examples:
|
||||
```
|
||||
```py
|
||||
from io import BytesIO
|
||||
|
||||
import requests
|
||||
|
||||
2511
examples/community/fresco_v2v.py
Normal file
2511
examples/community/fresco_v2v.py
Normal file
File diff suppressed because it is too large
Load Diff
468
examples/community/kohya_hires_fix.py
Normal file
468
examples/community/kohya_hires_fix.py
Normal file
@@ -0,0 +1,468 @@
|
||||
# Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from typing import Any, Dict, List, Optional, Tuple, Union
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.utils.checkpoint
|
||||
from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
|
||||
|
||||
from diffusers.configuration_utils import register_to_config
|
||||
from diffusers.image_processor import VaeImageProcessor
|
||||
from diffusers.models.autoencoders import AutoencoderKL
|
||||
from diffusers.models.unets.unet_2d_condition import UNet2DConditionModel, UNet2DConditionOutput
|
||||
from diffusers.pipelines.stable_diffusion import StableDiffusionPipeline
|
||||
from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
|
||||
from diffusers.schedulers import KarrasDiffusionSchedulers
|
||||
from diffusers.utils import USE_PEFT_BACKEND, deprecate, logging, scale_lora_layers, unscale_lora_layers
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
|
||||
|
||||
|
||||
class UNet2DConditionModelHighResFix(UNet2DConditionModel):
|
||||
r"""
|
||||
A conditional 2D UNet model that applies Kohya fix proposed for high resolution image generation.
|
||||
|
||||
This model inherits from [`UNet2DConditionModel`]. Check the superclass documentation for learning about all the parameters.
|
||||
|
||||
Parameters:
|
||||
high_res_fix (`List[Dict]`, *optional*, defaults to `[{'timestep': 600, 'scale_factor': 0.5, 'block_num': 1}]`):
|
||||
Enables Kohya fix for high resolution generation. The activation maps are scaled based on the scale_factor up to the timestep at specified block_num.
|
||||
"""
|
||||
|
||||
_supports_gradient_checkpointing = True
|
||||
|
||||
@register_to_config
|
||||
def __init__(self, high_res_fix: List[Dict] = [{"timestep": 600, "scale_factor": 0.5, "block_num": 1}], **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
if high_res_fix:
|
||||
self.config.high_res_fix = sorted(high_res_fix, key=lambda x: x["timestep"], reverse=True)
|
||||
|
||||
@classmethod
|
||||
def _resize(cls, sample, target=None, scale_factor=1, mode="bicubic"):
|
||||
dtype = sample.dtype
|
||||
if dtype == torch.bfloat16:
|
||||
sample = sample.to(torch.float32)
|
||||
|
||||
if target is not None:
|
||||
if sample.shape[-2:] != target.shape[-2:]:
|
||||
sample = nn.functional.interpolate(sample, size=target.shape[-2:], mode=mode, align_corners=False)
|
||||
elif scale_factor != 1:
|
||||
sample = nn.functional.interpolate(sample, scale_factor=scale_factor, mode=mode, align_corners=False)
|
||||
|
||||
return sample.to(dtype)
|
||||
|
||||
def forward(
|
||||
self,
|
||||
sample: torch.FloatTensor,
|
||||
timestep: Union[torch.Tensor, float, int],
|
||||
encoder_hidden_states: torch.Tensor,
|
||||
class_labels: Optional[torch.Tensor] = None,
|
||||
timestep_cond: Optional[torch.Tensor] = None,
|
||||
attention_mask: Optional[torch.Tensor] = None,
|
||||
cross_attention_kwargs: Optional[Dict[str, Any]] = None,
|
||||
added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
|
||||
down_block_additional_residuals: Optional[Tuple[torch.Tensor]] = None,
|
||||
mid_block_additional_residual: Optional[torch.Tensor] = None,
|
||||
down_intrablock_additional_residuals: Optional[Tuple[torch.Tensor]] = None,
|
||||
encoder_attention_mask: Optional[torch.Tensor] = None,
|
||||
return_dict: bool = True,
|
||||
) -> Union[UNet2DConditionOutput, Tuple]:
|
||||
r"""
|
||||
The [`UNet2DConditionModel`] forward method.
|
||||
|
||||
Args:
|
||||
sample (`torch.FloatTensor`):
|
||||
The noisy input tensor with the following shape `(batch, channel, height, width)`.
|
||||
timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input.
|
||||
encoder_hidden_states (`torch.FloatTensor`):
|
||||
The encoder hidden states with shape `(batch, sequence_length, feature_dim)`.
|
||||
class_labels (`torch.Tensor`, *optional*, defaults to `None`):
|
||||
Optional class labels for conditioning. Their embeddings will be summed with the timestep embeddings.
|
||||
timestep_cond: (`torch.Tensor`, *optional*, defaults to `None`):
|
||||
Conditional embeddings for timestep. If provided, the embeddings will be summed with the samples passed
|
||||
through the `self.time_embedding` layer to obtain the timestep embeddings.
|
||||
attention_mask (`torch.Tensor`, *optional*, defaults to `None`):
|
||||
An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask
|
||||
is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large
|
||||
negative values to the attention scores corresponding to "discard" tokens.
|
||||
cross_attention_kwargs (`dict`, *optional*):
|
||||
A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
|
||||
`self.processor` in
|
||||
[diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
|
||||
added_cond_kwargs: (`dict`, *optional*):
|
||||
A kwargs dictionary containing additional embeddings that if specified are added to the embeddings that
|
||||
are passed along to the UNet blocks.
|
||||
down_block_additional_residuals: (`tuple` of `torch.Tensor`, *optional*):
|
||||
A tuple of tensors that if specified are added to the residuals of down unet blocks.
|
||||
mid_block_additional_residual: (`torch.Tensor`, *optional*):
|
||||
A tensor that if specified is added to the residual of the middle unet block.
|
||||
down_intrablock_additional_residuals (`tuple` of `torch.Tensor`, *optional*):
|
||||
additional residuals to be added within UNet down blocks, for example from T2I-Adapter side model(s)
|
||||
encoder_attention_mask (`torch.Tensor`):
|
||||
A cross-attention mask of shape `(batch, sequence_length)` is applied to `encoder_hidden_states`. If
|
||||
`True` the mask is kept, otherwise if `False` it is discarded. Mask will be converted into a bias,
|
||||
which adds large negative values to the attention scores corresponding to "discard" tokens.
|
||||
return_dict (`bool`, *optional*, defaults to `True`):
|
||||
Whether or not to return a [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] instead of a plain
|
||||
tuple.
|
||||
|
||||
Returns:
|
||||
[`~models.unets.unet_2d_condition.UNet2DConditionOutput`] or `tuple`:
|
||||
If `return_dict` is True, an [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] is returned,
|
||||
otherwise a `tuple` is returned where the first element is the sample tensor.
|
||||
"""
|
||||
# By default samples have to be AT least a multiple of the overall upsampling factor.
|
||||
# The overall upsampling factor is equal to 2 ** (# num of upsampling layers).
|
||||
# However, the upsampling interpolation output size can be forced to fit any upsampling size
|
||||
# on the fly if necessary.
|
||||
default_overall_up_factor = 2**self.num_upsamplers
|
||||
|
||||
# upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor`
|
||||
forward_upsample_size = False
|
||||
upsample_size = None
|
||||
|
||||
for dim in sample.shape[-2:]:
|
||||
if dim % default_overall_up_factor != 0:
|
||||
# Forward upsample size to force interpolation output size.
|
||||
forward_upsample_size = True
|
||||
break
|
||||
|
||||
# ensure attention_mask is a bias, and give it a singleton query_tokens dimension
|
||||
# expects mask of shape:
|
||||
# [batch, key_tokens]
|
||||
# adds singleton query_tokens dimension:
|
||||
# [batch, 1, key_tokens]
|
||||
# this helps to broadcast it as a bias over attention scores, which will be in one of the following shapes:
|
||||
# [batch, heads, query_tokens, key_tokens] (e.g. torch sdp attn)
|
||||
# [batch * heads, query_tokens, key_tokens] (e.g. xformers or classic attn)
|
||||
if attention_mask is not None:
|
||||
# assume that mask is expressed as:
|
||||
# (1 = keep, 0 = discard)
|
||||
# convert mask into a bias that can be added to attention scores:
|
||||
# (keep = +0, discard = -10000.0)
|
||||
attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0
|
||||
attention_mask = attention_mask.unsqueeze(1)
|
||||
|
||||
# convert encoder_attention_mask to a bias the same way we do for attention_mask
|
||||
if encoder_attention_mask is not None:
|
||||
encoder_attention_mask = (1 - encoder_attention_mask.to(sample.dtype)) * -10000.0
|
||||
encoder_attention_mask = encoder_attention_mask.unsqueeze(1)
|
||||
|
||||
# 0. center input if necessary
|
||||
if self.config.center_input_sample:
|
||||
sample = 2 * sample - 1.0
|
||||
|
||||
# 1. time
|
||||
t_emb = self.get_time_embed(sample=sample, timestep=timestep)
|
||||
emb = self.time_embedding(t_emb, timestep_cond)
|
||||
aug_emb = None
|
||||
|
||||
class_emb = self.get_class_embed(sample=sample, class_labels=class_labels)
|
||||
if class_emb is not None:
|
||||
if self.config.class_embeddings_concat:
|
||||
emb = torch.cat([emb, class_emb], dim=-1)
|
||||
else:
|
||||
emb = emb + class_emb
|
||||
|
||||
aug_emb = self.get_aug_embed(
|
||||
emb=emb, encoder_hidden_states=encoder_hidden_states, added_cond_kwargs=added_cond_kwargs
|
||||
)
|
||||
if self.config.addition_embed_type == "image_hint":
|
||||
aug_emb, hint = aug_emb
|
||||
sample = torch.cat([sample, hint], dim=1)
|
||||
|
||||
emb = emb + aug_emb if aug_emb is not None else emb
|
||||
|
||||
if self.time_embed_act is not None:
|
||||
emb = self.time_embed_act(emb)
|
||||
|
||||
encoder_hidden_states = self.process_encoder_hidden_states(
|
||||
encoder_hidden_states=encoder_hidden_states, added_cond_kwargs=added_cond_kwargs
|
||||
)
|
||||
|
||||
# 2. pre-process
|
||||
sample = self.conv_in(sample)
|
||||
|
||||
# 2.5 GLIGEN position net
|
||||
if cross_attention_kwargs is not None and cross_attention_kwargs.get("gligen", None) is not None:
|
||||
cross_attention_kwargs = cross_attention_kwargs.copy()
|
||||
gligen_args = cross_attention_kwargs.pop("gligen")
|
||||
cross_attention_kwargs["gligen"] = {"objs": self.position_net(**gligen_args)}
|
||||
|
||||
# 3. down
|
||||
# we're popping the `scale` instead of getting it because otherwise `scale` will be propagated
|
||||
# to the internal blocks and will raise deprecation warnings. this will be confusing for our users.
|
||||
if cross_attention_kwargs is not None:
|
||||
cross_attention_kwargs = cross_attention_kwargs.copy()
|
||||
lora_scale = cross_attention_kwargs.pop("scale", 1.0)
|
||||
else:
|
||||
lora_scale = 1.0
|
||||
|
||||
if USE_PEFT_BACKEND:
|
||||
# weight the lora layers by setting `lora_scale` for each PEFT layer
|
||||
scale_lora_layers(self, lora_scale)
|
||||
|
||||
is_controlnet = mid_block_additional_residual is not None and down_block_additional_residuals is not None
|
||||
# using new arg down_intrablock_additional_residuals for T2I-Adapters, to distinguish from controlnets
|
||||
is_adapter = down_intrablock_additional_residuals is not None
|
||||
# maintain backward compatibility for legacy usage, where
|
||||
# T2I-Adapter and ControlNet both use down_block_additional_residuals arg
|
||||
# but can only use one or the other
|
||||
if not is_adapter and mid_block_additional_residual is None and down_block_additional_residuals is not None:
|
||||
deprecate(
|
||||
"T2I should not use down_block_additional_residuals",
|
||||
"1.3.0",
|
||||
"Passing intrablock residual connections with `down_block_additional_residuals` is deprecated \
|
||||
and will be removed in diffusers 1.3.0. `down_block_additional_residuals` should only be used \
|
||||
for ControlNet. Please make sure use `down_intrablock_additional_residuals` instead. ",
|
||||
standard_warn=False,
|
||||
)
|
||||
down_intrablock_additional_residuals = down_block_additional_residuals
|
||||
is_adapter = True
|
||||
|
||||
down_block_res_samples = (sample,)
|
||||
for down_i, downsample_block in enumerate(self.down_blocks):
|
||||
if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
|
||||
# For t2i-adapter CrossAttnDownBlock2D
|
||||
additional_residuals = {}
|
||||
if is_adapter and len(down_intrablock_additional_residuals) > 0:
|
||||
additional_residuals["additional_residuals"] = down_intrablock_additional_residuals.pop(0)
|
||||
|
||||
sample, res_samples = downsample_block(
|
||||
hidden_states=sample,
|
||||
temb=emb,
|
||||
encoder_hidden_states=encoder_hidden_states,
|
||||
attention_mask=attention_mask,
|
||||
cross_attention_kwargs=cross_attention_kwargs,
|
||||
encoder_attention_mask=encoder_attention_mask,
|
||||
**additional_residuals,
|
||||
)
|
||||
|
||||
else:
|
||||
sample, res_samples = downsample_block(hidden_states=sample, temb=emb)
|
||||
if is_adapter and len(down_intrablock_additional_residuals) > 0:
|
||||
sample += down_intrablock_additional_residuals.pop(0)
|
||||
|
||||
down_block_res_samples += res_samples
|
||||
|
||||
# kohya high res fix
|
||||
if self.config.high_res_fix:
|
||||
for high_res_fix in self.config.high_res_fix:
|
||||
if timestep > high_res_fix["timestep"] and down_i == high_res_fix["block_num"]:
|
||||
sample = self.__class__._resize(sample, scale_factor=high_res_fix["scale_factor"])
|
||||
break
|
||||
|
||||
if is_controlnet:
|
||||
new_down_block_res_samples = ()
|
||||
|
||||
for down_block_res_sample, down_block_additional_residual in zip(
|
||||
down_block_res_samples, down_block_additional_residuals
|
||||
):
|
||||
down_block_res_sample = down_block_res_sample + down_block_additional_residual
|
||||
new_down_block_res_samples = new_down_block_res_samples + (down_block_res_sample,)
|
||||
|
||||
down_block_res_samples = new_down_block_res_samples
|
||||
|
||||
# 4. mid
|
||||
if self.mid_block is not None:
|
||||
if hasattr(self.mid_block, "has_cross_attention") and self.mid_block.has_cross_attention:
|
||||
sample = self.mid_block(
|
||||
sample,
|
||||
emb,
|
||||
encoder_hidden_states=encoder_hidden_states,
|
||||
attention_mask=attention_mask,
|
||||
cross_attention_kwargs=cross_attention_kwargs,
|
||||
encoder_attention_mask=encoder_attention_mask,
|
||||
)
|
||||
else:
|
||||
sample = self.mid_block(sample, emb)
|
||||
|
||||
# To support T2I-Adapter-XL
|
||||
if (
|
||||
is_adapter
|
||||
and len(down_intrablock_additional_residuals) > 0
|
||||
and sample.shape == down_intrablock_additional_residuals[0].shape
|
||||
):
|
||||
sample += down_intrablock_additional_residuals.pop(0)
|
||||
|
||||
if is_controlnet:
|
||||
sample = sample + mid_block_additional_residual
|
||||
|
||||
# 5. up
|
||||
for i, upsample_block in enumerate(self.up_blocks):
|
||||
is_final_block = i == len(self.up_blocks) - 1
|
||||
|
||||
res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
|
||||
down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]
|
||||
|
||||
# up scaling of kohya high res fix
|
||||
if self.config.high_res_fix is not None:
|
||||
if res_samples[0].shape[-2:] != sample.shape[-2:]:
|
||||
sample = self.__class__._resize(sample, target=res_samples[0])
|
||||
res_samples_up_sampled = (res_samples[0],)
|
||||
for res_sample in res_samples[1:]:
|
||||
res_samples_up_sampled += (self.__class__._resize(res_sample, target=res_samples[0]),)
|
||||
res_samples = res_samples_up_sampled
|
||||
|
||||
# if we have not reached the final block and need to forward the
|
||||
# upsample size, we do it here
|
||||
if not is_final_block and forward_upsample_size:
|
||||
upsample_size = down_block_res_samples[-1].shape[2:]
|
||||
|
||||
if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention:
|
||||
sample = upsample_block(
|
||||
hidden_states=sample,
|
||||
temb=emb,
|
||||
res_hidden_states_tuple=res_samples,
|
||||
encoder_hidden_states=encoder_hidden_states,
|
||||
cross_attention_kwargs=cross_attention_kwargs,
|
||||
upsample_size=upsample_size,
|
||||
attention_mask=attention_mask,
|
||||
encoder_attention_mask=encoder_attention_mask,
|
||||
)
|
||||
else:
|
||||
sample = upsample_block(
|
||||
hidden_states=sample,
|
||||
temb=emb,
|
||||
res_hidden_states_tuple=res_samples,
|
||||
upsample_size=upsample_size,
|
||||
)
|
||||
|
||||
# 6. post-process
|
||||
if self.conv_norm_out:
|
||||
sample = self.conv_norm_out(sample)
|
||||
sample = self.conv_act(sample)
|
||||
sample = self.conv_out(sample)
|
||||
|
||||
if USE_PEFT_BACKEND:
|
||||
# remove `lora_scale` from each PEFT layer
|
||||
unscale_lora_layers(self, lora_scale)
|
||||
|
||||
if not return_dict:
|
||||
return (sample,)
|
||||
|
||||
return UNet2DConditionOutput(sample=sample)
|
||||
|
||||
@classmethod
|
||||
def from_unet(cls, unet: UNet2DConditionModel, high_res_fix: list):
|
||||
config = dict((unet.config))
|
||||
config["high_res_fix"] = high_res_fix
|
||||
unet_high_res = cls(**config)
|
||||
unet_high_res.load_state_dict(unet.state_dict())
|
||||
unet_high_res.to(unet.dtype)
|
||||
return unet_high_res
|
||||
|
||||
|
||||
EXAMPLE_DOC_STRING = """
|
||||
Examples:
|
||||
```py
|
||||
>>> import torch
|
||||
>>> from diffusers import DiffusionPipeline
|
||||
|
||||
>>> pipe = DiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4",
|
||||
custom_pipeline="kohya_hires_fix",
|
||||
torch_dtype=torch.float16,
|
||||
high_res_fix=[{'timestep': 600,
|
||||
'scale_factor': 0.5,
|
||||
'block_num': 1}])
|
||||
>>> pipe = pipe.to("cuda")
|
||||
|
||||
>>> prompt = "a photo of an astronaut riding a horse on mars"
|
||||
>>> image = pipe(prompt, height=1000, width=1600).images[0]
|
||||
```
|
||||
"""
|
||||
|
||||
|
||||
class StableDiffusionHighResFixPipeline(StableDiffusionPipeline):
|
||||
r"""
|
||||
Pipeline for text-to-image generation using Stable Diffusion with Kohya fix for high resolution generation.
|
||||
|
||||
This model inherits from [`StableDiffusionPipeline`]. Check the superclass documentation for the generic methods.
|
||||
|
||||
The pipeline also inherits the following loading methods:
|
||||
- [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
|
||||
- [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights
|
||||
- [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights
|
||||
- [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
|
||||
- [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
|
||||
|
||||
Args:
|
||||
vae ([`AutoencoderKL`]):
|
||||
Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
|
||||
text_encoder ([`~transformers.CLIPTextModel`]):
|
||||
Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
|
||||
tokenizer ([`~transformers.CLIPTokenizer`]):
|
||||
A `CLIPTokenizer` to tokenize text.
|
||||
unet ([`UNet2DConditionModel`]):
|
||||
A `UNet2DConditionModel` to denoise the encoded image latents.
|
||||
scheduler ([`SchedulerMixin`]):
|
||||
A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
|
||||
[`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
|
||||
safety_checker ([`StableDiffusionSafetyChecker`]):
|
||||
Classification module that estimates whether generated images could be considered offensive or harmful.
|
||||
Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details
|
||||
about a model's potential harms.
|
||||
feature_extractor ([`~transformers.CLIPImageProcessor`]):
|
||||
A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
|
||||
high_res_fix (`List[Dict]`, *optional*, defaults to `[{'timestep': 600, 'scale_factor': 0.5, 'block_num': 1}]`):
|
||||
Enables Kohya fix for high resolution generation. The activation maps are scaled based on the scale_factor up to the timestep at specified block_num.
|
||||
"""
|
||||
|
||||
model_cpu_offload_seq = "text_encoder->image_encoder->unet->vae"
|
||||
_optional_components = ["safety_checker", "feature_extractor", "image_encoder"]
|
||||
_exclude_from_cpu_offload = ["safety_checker"]
|
||||
_callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vae: AutoencoderKL,
|
||||
text_encoder: CLIPTextModel,
|
||||
tokenizer: CLIPTokenizer,
|
||||
unet: UNet2DConditionModel,
|
||||
scheduler: KarrasDiffusionSchedulers,
|
||||
safety_checker: StableDiffusionSafetyChecker,
|
||||
feature_extractor: CLIPImageProcessor,
|
||||
image_encoder: CLIPVisionModelWithProjection = None,
|
||||
requires_safety_checker: bool = True,
|
||||
high_res_fix: List[Dict] = [{"timestep": 600, "scale_factor": 0.5, "block_num": 1}],
|
||||
):
|
||||
super().__init__(
|
||||
vae=vae,
|
||||
text_encoder=text_encoder,
|
||||
tokenizer=tokenizer,
|
||||
unet=unet,
|
||||
scheduler=scheduler,
|
||||
safety_checker=safety_checker,
|
||||
feature_extractor=feature_extractor,
|
||||
image_encoder=image_encoder,
|
||||
requires_safety_checker=requires_safety_checker,
|
||||
)
|
||||
|
||||
unet = UNet2DConditionModelHighResFix.from_unet(unet=unet, high_res_fix=high_res_fix)
|
||||
self.register_modules(
|
||||
vae=vae,
|
||||
text_encoder=text_encoder,
|
||||
tokenizer=tokenizer,
|
||||
unet=unet,
|
||||
scheduler=scheduler,
|
||||
safety_checker=safety_checker,
|
||||
feature_extractor=feature_extractor,
|
||||
image_encoder=image_encoder,
|
||||
)
|
||||
self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
|
||||
self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
|
||||
self.register_to_config(requires_safety_checker=requires_safety_checker)
|
||||
@@ -565,7 +565,7 @@ class LCMSchedulerWithTimestamp(SchedulerMixin, ConfigMixin):
|
||||
# Glide cosine schedule
|
||||
self.betas = betas_for_alpha_bar(num_train_timesteps)
|
||||
else:
|
||||
raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}")
|
||||
raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}")
|
||||
|
||||
# Rescale for zero SNR
|
||||
if rescale_betas_zero_snr:
|
||||
|
||||
@@ -477,7 +477,7 @@ class LCMScheduler(SchedulerMixin, ConfigMixin):
|
||||
# Glide cosine schedule
|
||||
self.betas = betas_for_alpha_bar(num_train_timesteps)
|
||||
else:
|
||||
raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}")
|
||||
raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}")
|
||||
|
||||
# Rescale for zero SNR
|
||||
if rescale_betas_zero_snr:
|
||||
|
||||
@@ -1524,35 +1524,35 @@ class LLMGroundedDiffusionPipeline(
|
||||
assert emb.shape == (w.shape[0], embedding_dim)
|
||||
return emb
|
||||
|
||||
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.guidance_scale
|
||||
@property
|
||||
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.guidance_scale
|
||||
def guidance_scale(self):
|
||||
return self._guidance_scale
|
||||
|
||||
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.guidance_rescale
|
||||
@property
|
||||
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.guidance_rescale
|
||||
def guidance_rescale(self):
|
||||
return self._guidance_rescale
|
||||
|
||||
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.clip_skip
|
||||
@property
|
||||
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.clip_skip
|
||||
def clip_skip(self):
|
||||
return self._clip_skip
|
||||
|
||||
# here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
|
||||
# of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
|
||||
# corresponds to doing no classifier free guidance.
|
||||
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.do_classifier_free_guidance
|
||||
@property
|
||||
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.do_classifier_free_guidance
|
||||
def do_classifier_free_guidance(self):
|
||||
return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None
|
||||
|
||||
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.cross_attention_kwargs
|
||||
@property
|
||||
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.cross_attention_kwargs
|
||||
def cross_attention_kwargs(self):
|
||||
return self._cross_attention_kwargs
|
||||
|
||||
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.num_timesteps
|
||||
@property
|
||||
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.num_timesteps
|
||||
def num_timesteps(self):
|
||||
return self._num_timesteps
|
||||
|
||||
@@ -43,7 +43,7 @@ from diffusers.utils import BaseOutput, check_min_version
|
||||
|
||||
|
||||
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
|
||||
check_min_version("0.25.0")
|
||||
check_min_version("0.29.0.dev0")
|
||||
|
||||
|
||||
class MarigoldDepthOutput(BaseOutput):
|
||||
|
||||
1700
examples/community/pipeline_stable_diffusion_boxdiff.py
Normal file
1700
examples/community/pipeline_stable_diffusion_boxdiff.py
Normal file
File diff suppressed because it is too large
Load Diff
@@ -460,7 +460,7 @@ class StableDiffusionUpscaleLDM3DPipeline(
|
||||
)
|
||||
|
||||
# verify batch size of prompt and image are same if image is a list or tensor or numpy array
|
||||
if isinstance(image, list) or isinstance(image, torch.Tensor) or isinstance(image, np.ndarray):
|
||||
if isinstance(image, (list, np.ndarray, torch.Tensor)):
|
||||
if prompt is not None and isinstance(prompt, str):
|
||||
batch_size = 1
|
||||
elif prompt is not None and isinstance(prompt, list):
|
||||
|
||||
@@ -218,7 +218,7 @@ class UFOGenScheduler(SchedulerMixin, ConfigMixin):
|
||||
betas = torch.linspace(-6, 6, num_train_timesteps)
|
||||
self.betas = torch.sigmoid(betas) * (beta_end - beta_start) + beta_start
|
||||
else:
|
||||
raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}")
|
||||
raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}")
|
||||
|
||||
# Rescale for zero SNR
|
||||
if rescale_betas_zero_snr:
|
||||
|
||||
@@ -78,7 +78,7 @@ def torch_dfs(model: torch.nn.Module):
|
||||
class StableDiffusionReferencePipeline(
|
||||
DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, IPAdapterMixin, FromSingleFileMixin
|
||||
):
|
||||
r""" "
|
||||
r"""
|
||||
Pipeline for Stable Diffusion Reference.
|
||||
|
||||
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
|
||||
|
||||
@@ -113,9 +113,9 @@ accelerate launch train_lcm_distill_lora_sdxl_wds.py \
|
||||
--push_to_hub \
|
||||
```
|
||||
|
||||
We provide another version for LCM LoRA SDXL that follows best practices of `peft` and leverages the `datasets` library for quick experimentation. The script doesn't load two UNets unlike `train_lcm_distill_lora_sdxl_wds.py` which reduces the memory requirements quite a bit.
|
||||
We provide another version for LCM LoRA SDXL that follows best practices of `peft` and leverages the `datasets` library for quick experimentation. The script doesn't load two UNets unlike `train_lcm_distill_lora_sdxl_wds.py` which reduces the memory requirements quite a bit.
|
||||
|
||||
Below is an example training command that trains an LCM LoRA on the [Pokemons dataset](https://huggingface.co/datasets/lambdalabs/naruto-blip-captions):
|
||||
Below is an example training command that trains an LCM LoRA on the [Narutos dataset](https://huggingface.co/datasets/lambdalabs/naruto-blip-captions):
|
||||
|
||||
```bash
|
||||
export MODEL_NAME="stabilityai/stable-diffusion-xl-base-1.0"
|
||||
@@ -125,7 +125,7 @@ export VAE_PATH="madebyollin/sdxl-vae-fp16-fix"
|
||||
accelerate launch train_lcm_distill_lora_sdxl.py \
|
||||
--pretrained_teacher_model=${MODEL_NAME} \
|
||||
--pretrained_vae_model_name_or_path=${VAE_PATH} \
|
||||
--output_dir="pokemons-lora-lcm-sdxl" \
|
||||
--output_dir="narutos-lora-lcm-sdxl" \
|
||||
--mixed_precision="fp16" \
|
||||
--dataset_name=$DATASET_NAME \
|
||||
--resolution=1024 \
|
||||
|
||||
@@ -73,7 +73,7 @@ if is_wandb_available():
|
||||
import wandb
|
||||
|
||||
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
|
||||
check_min_version("0.28.0.dev0")
|
||||
check_min_version("0.29.0.dev0")
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
@@ -66,7 +66,7 @@ if is_wandb_available():
|
||||
import wandb
|
||||
|
||||
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
|
||||
check_min_version("0.28.0.dev0")
|
||||
check_min_version("0.29.0.dev0")
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
@@ -79,7 +79,7 @@ if is_wandb_available():
|
||||
import wandb
|
||||
|
||||
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
|
||||
check_min_version("0.28.0.dev0")
|
||||
check_min_version("0.29.0.dev0")
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
@@ -1358,7 +1358,7 @@ def main(args):
|
||||
# estimates to predict the data point in the augmented PF-ODE trajectory corresponding to the next ODE
|
||||
# solver timestep.
|
||||
with torch.no_grad():
|
||||
if torch.backends.mps.is_available() or "playground" in args.pretrained_model_name_or_path:
|
||||
if torch.backends.mps.is_available() or "playground" in args.pretrained_teacher_model:
|
||||
autocast_ctx = nullcontext()
|
||||
else:
|
||||
autocast_ctx = torch.autocast(accelerator.device.type)
|
||||
|
||||
@@ -72,7 +72,7 @@ if is_wandb_available():
|
||||
import wandb
|
||||
|
||||
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
|
||||
check_min_version("0.28.0.dev0")
|
||||
check_min_version("0.29.0.dev0")
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
@@ -78,7 +78,7 @@ if is_wandb_available():
|
||||
import wandb
|
||||
|
||||
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
|
||||
check_min_version("0.28.0.dev0")
|
||||
check_min_version("0.29.0.dev0")
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
@@ -101,7 +101,7 @@ accelerate launch train_controlnet.py \
|
||||
`accelerate` allows for seamless multi-GPU training. Follow the instructions [here](https://huggingface.co/docs/accelerate/basic_tutorials/launch)
|
||||
for running distributed training with `accelerate`. Here is an example command:
|
||||
|
||||
```bash
|
||||
```bash
|
||||
export MODEL_DIR="runwayml/stable-diffusion-v1-5"
|
||||
export OUTPUT_DIR="path to save model"
|
||||
|
||||
@@ -123,21 +123,21 @@ accelerate launch --mixed_precision="fp16" --multi_gpu train_controlnet.py \
|
||||
|
||||
#### After 300 steps with batch size 8
|
||||
|
||||
| | |
|
||||
| | |
|
||||
|-------------------|:-------------------------:|
|
||||
| | red circle with blue background |
|
||||
| | red circle with blue background |
|
||||
 |  |
|
||||
| | cyan circle with brown floral background |
|
||||
| | cyan circle with brown floral background |
|
||||
 |  |
|
||||
|
||||
|
||||
#### After 6000 steps with batch size 8:
|
||||
|
||||
| | |
|
||||
| | |
|
||||
|-------------------|:-------------------------:|
|
||||
| | red circle with blue background |
|
||||
| | red circle with blue background |
|
||||
 |  |
|
||||
| | cyan circle with brown floral background |
|
||||
| | cyan circle with brown floral background |
|
||||
 |  |
|
||||
|
||||
## Training on a 16 GB GPU
|
||||
@@ -194,7 +194,7 @@ accelerate launch train_controlnet.py \
|
||||
--set_grads_to_none
|
||||
```
|
||||
|
||||
When using `enable_xformers_memory_efficient_attention`, please make sure to install `xformers` by `pip install xformers`.
|
||||
When using `enable_xformers_memory_efficient_attention`, please make sure to install `xformers` by `pip install xformers`.
|
||||
|
||||
## Training on an 8 GB GPU
|
||||
|
||||
@@ -209,7 +209,7 @@ Optimizations:
|
||||
- DeepSpeed stage 2 with parameter and optimizer offloading
|
||||
- fp16 mixed precision
|
||||
|
||||
[DeepSpeed](https://www.deepspeed.ai/) can offload tensors from VRAM to either
|
||||
[DeepSpeed](https://www.deepspeed.ai/) can offload tensors from VRAM to either
|
||||
CPU or NVME. This requires significantly more RAM (about 25 GB).
|
||||
|
||||
Use `accelerate config` to enable DeepSpeed stage 2.
|
||||
@@ -256,7 +256,7 @@ accelerate launch train_controlnet.py \
|
||||
## Performing inference with the trained ControlNet
|
||||
|
||||
The trained model can be run the same as the original ControlNet pipeline with the newly trained ControlNet.
|
||||
Set `base_model_path` and `controlnet_path` to the values `--pretrained_model_name_or_path` and
|
||||
Set `base_model_path` and `controlnet_path` to the values `--pretrained_model_name_or_path` and
|
||||
`--output_dir` were respectively set to in the training script.
|
||||
|
||||
```py
|
||||
@@ -315,13 +315,13 @@ gcloud alpha compute tpus tpu-vm ssh $VM_NAME --zone $ZONE -- \
|
||||
|
||||
When connected install JAX `0.4.5`:
|
||||
|
||||
```
|
||||
```sh
|
||||
pip install "jax[tpu]==0.4.5" -f https://storage.googleapis.com/jax-releases/libtpu_releases.html
|
||||
```
|
||||
|
||||
To verify that JAX was correctly installed, you can run the following command:
|
||||
|
||||
```
|
||||
```py
|
||||
import jax
|
||||
jax.device_count()
|
||||
```
|
||||
@@ -351,14 +351,14 @@ pip install wandb
|
||||
|
||||
Now let's downloading two conditioning images that we will use to run validation during the training in order to track our progress
|
||||
|
||||
```
|
||||
```sh
|
||||
wget https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/controlnet_training/conditioning_image_1.png
|
||||
wget https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/controlnet_training/conditioning_image_2.png
|
||||
```
|
||||
|
||||
We encourage you to store or share your model with the community. To use huggingface hub, please login to your Hugging Face account, or ([create one](https://huggingface.co/docs/diffusers/main/en/training/hf.co/join) if you don’t have one already):
|
||||
|
||||
```
|
||||
```sh
|
||||
huggingface-cli login
|
||||
```
|
||||
|
||||
@@ -429,12 +429,12 @@ When work with a larger dataset, you may need to run training process for a long
|
||||
```bash
|
||||
--checkpointing_steps=500
|
||||
```
|
||||
This will save the trained model in subfolders of your output_dir. Subfolder names is the number of steps performed so far; for example: a checkpoint saved after 500 training steps would be saved in a subfolder named 500
|
||||
This will save the trained model in subfolders of your output_dir. Subfolder names is the number of steps performed so far; for example: a checkpoint saved after 500 training steps would be saved in a subfolder named 500
|
||||
|
||||
You can then start your training from this saved checkpoint with
|
||||
You can then start your training from this saved checkpoint with
|
||||
|
||||
```bash
|
||||
--controlnet_model_name_or_path="./control_out/500"
|
||||
--controlnet_model_name_or_path="./control_out/500"
|
||||
```
|
||||
|
||||
We support training with the Min-SNR weighting strategy proposed in [Efficient Diffusion Training via Min-SNR Weighting Strategy](https://arxiv.org/abs/2303.09556) which helps to achieve faster convergence by rebalancing the loss. To use it, one needs to set the `--snr_gamma` argument. The recommended value when using it is `5.0`.
|
||||
|
||||
@@ -60,7 +60,7 @@ if is_wandb_available():
|
||||
import wandb
|
||||
|
||||
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
|
||||
check_min_version("0.28.0.dev0")
|
||||
check_min_version("0.29.0.dev0")
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
@@ -60,7 +60,7 @@ if is_wandb_available():
|
||||
import wandb
|
||||
|
||||
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
|
||||
check_min_version("0.28.0.dev0")
|
||||
check_min_version("0.29.0.dev0")
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@@ -61,7 +61,7 @@ if is_wandb_available():
|
||||
import wandb
|
||||
|
||||
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
|
||||
check_min_version("0.28.0.dev0")
|
||||
check_min_version("0.29.0.dev0")
|
||||
|
||||
logger = get_logger(__name__)
|
||||
if is_torch_npu_available():
|
||||
|
||||
@@ -63,7 +63,7 @@ from diffusers.utils.import_utils import is_xformers_available
|
||||
|
||||
|
||||
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
|
||||
check_min_version("0.28.0.dev0")
|
||||
check_min_version("0.29.0.dev0")
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
@@ -152,7 +152,7 @@ def collate_fn(examples, with_prior_preservation):
|
||||
|
||||
|
||||
class PromptDataset(Dataset):
|
||||
"A simple dataset to prepare the prompts to generate class images on multiple GPUs."
|
||||
"""A simple dataset to prepare the prompts to generate class images on multiple GPUs."""
|
||||
|
||||
def __init__(self, prompt, num_samples):
|
||||
self.prompt = prompt
|
||||
|
||||
@@ -43,7 +43,7 @@ from accelerate.utils import write_basic_config
|
||||
write_basic_config()
|
||||
```
|
||||
|
||||
When running `accelerate config`, if we specify torch compile mode to True there can be dramatic speedups.
|
||||
When running `accelerate config`, if we specify torch compile mode to True there can be dramatic speedups.
|
||||
Note also that we use PEFT library as backend for LoRA training, make sure to have `peft>=0.6.0` installed in your environment.
|
||||
|
||||
### Dog toy example
|
||||
@@ -231,7 +231,7 @@ accelerate launch --mixed_precision="fp16" train_dreambooth.py \
|
||||
|
||||
### Fine-tune text encoder with the UNet.
|
||||
|
||||
The script also allows to fine-tune the `text_encoder` along with the `unet`. It's been observed experimentally that fine-tuning `text_encoder` gives much better results especially on faces.
|
||||
The script also allows to fine-tune the `text_encoder` along with the `unet`. It's been observed experimentally that fine-tuning `text_encoder` gives much better results especially on faces.
|
||||
Pass the `--train_text_encoder` argument to the script to enable training `text_encoder`.
|
||||
|
||||
___Note: Training text encoder requires more memory, with this option the training won't fit on 16GB GPU. It needs at least 24GB VRAM.___
|
||||
@@ -303,7 +303,7 @@ In a nutshell, LoRA allows to adapt pretrained models by adding pairs of rank-de
|
||||
- Rank-decomposition matrices have significantly fewer parameters than the original model, which means that trained LoRA weights are easily portable.
|
||||
- LoRA attention layers allow to control to which extent the model is adapted towards new training images via a `scale` parameter.
|
||||
|
||||
[cloneofsimo](https://github.com/cloneofsimo) was the first to try out LoRA training for Stable Diffusion in
|
||||
[cloneofsimo](https://github.com/cloneofsimo) was the first to try out LoRA training for Stable Diffusion in
|
||||
the popular [lora](https://github.com/cloneofsimo/lora) GitHub repository.
|
||||
|
||||
### Training
|
||||
@@ -326,7 +326,7 @@ export INSTANCE_DIR="dog"
|
||||
export OUTPUT_DIR="path-to-save-model"
|
||||
```
|
||||
|
||||
For this example we want to directly store the trained LoRA embeddings on the Hub, so
|
||||
For this example we want to directly store the trained LoRA embeddings on the Hub, so
|
||||
we need to be logged in and add the `--push_to_hub` flag.
|
||||
|
||||
```bash
|
||||
@@ -356,7 +356,7 @@ accelerate launch train_dreambooth_lora.py \
|
||||
--push_to_hub
|
||||
```
|
||||
|
||||
**___Note: When using LoRA we can use a much higher learning rate compared to vanilla dreambooth. Here we
|
||||
**___Note: When using LoRA we can use a much higher learning rate compared to vanilla dreambooth. Here we
|
||||
use *1e-4* instead of the usual *2e-6*.___**
|
||||
|
||||
The final LoRA embedding weights have been uploaded to [patrickvonplaten/lora_dreambooth_dog_example](https://huggingface.co/patrickvonplaten/lora_dreambooth_dog_example). **___Note: [The final weights](https://huggingface.co/patrickvonplaten/lora/blob/main/pytorch_attn_procs.bin) are only 3 MB in size which is orders of magnitudes smaller than the original model.**
|
||||
@@ -365,14 +365,14 @@ The training results are summarized [here](https://api.wandb.ai/report/patrickvo
|
||||
You can use the `Step` slider to see how the model learned the features of our subject while the model trained.
|
||||
|
||||
Optionally, we can also train additional LoRA layers for the text encoder. Specify the `--train_text_encoder` argument above for that. If you're interested to know more about how we
|
||||
enable this support, check out this [PR](https://github.com/huggingface/diffusers/pull/2918).
|
||||
enable this support, check out this [PR](https://github.com/huggingface/diffusers/pull/2918).
|
||||
|
||||
With the default hyperparameters from the above, the training seems to go in a positive direction. Check out [this panel](https://wandb.ai/sayakpaul/dreambooth-lora/reports/test-23-04-17-17-00-13---Vmlldzo0MDkwNjMy). The trained LoRA layers are available [here](https://huggingface.co/sayakpaul/dreambooth).
|
||||
|
||||
|
||||
### Inference
|
||||
|
||||
After training, LoRA weights can be loaded very easily into the original pipeline. First, you need to
|
||||
After training, LoRA weights can be loaded very easily into the original pipeline. First, you need to
|
||||
load the original pipeline:
|
||||
|
||||
```python
|
||||
@@ -394,9 +394,9 @@ image = pipe("A picture of a sks dog in a bucket", num_inference_steps=25).image
|
||||
|
||||
If you are loading the LoRA parameters from the Hub and if the Hub repository has
|
||||
a `base_model` tag (such as [this](https://huggingface.co/patrickvonplaten/lora_dreambooth_dog_example/blob/main/README.md?code=true#L4)), then
|
||||
you can do:
|
||||
you can do:
|
||||
|
||||
```py
|
||||
```py
|
||||
from huggingface_hub.repocard import RepoCard
|
||||
|
||||
lora_model_id = "patrickvonplaten/lora_dreambooth_dog_example"
|
||||
@@ -413,7 +413,7 @@ weights. For example:
|
||||
```python
|
||||
from huggingface_hub.repocard import RepoCard
|
||||
from diffusers import StableDiffusionPipeline
|
||||
import torch
|
||||
import torch
|
||||
|
||||
lora_model_id = "sayakpaul/dreambooth-text-encoder-test"
|
||||
card = RepoCard.load(lora_model_id)
|
||||
@@ -430,7 +430,7 @@ Note that the use of [`LoraLoaderMixin.load_lora_weights`](https://huggingface.c
|
||||
|
||||
* LoRA parameters that don't have separate identifiers for the UNet and the text encoder (such as [`"patrickvonplaten/lora_dreambooth_dog_example"`](https://huggingface.co/patrickvonplaten/lora_dreambooth_dog_example)). So, you can just do:
|
||||
|
||||
```py
|
||||
```py
|
||||
pipe.load_lora_weights(lora_model_path)
|
||||
```
|
||||
|
||||
@@ -529,11 +529,11 @@ To save even more memory, pass the `--set_grads_to_none` argument to the script.
|
||||
More info: https://pytorch.org/docs/stable/generated/torch.optim.Optimizer.zero_grad.html
|
||||
|
||||
### Experimental results
|
||||
You can refer to [this blog post](https://huggingface.co/blog/dreambooth) that discusses some of DreamBooth experiments in detail. Specifically, it recommends a set of DreamBooth-specific tips and tricks that we have found to work well for a variety of subjects.
|
||||
You can refer to [this blog post](https://huggingface.co/blog/dreambooth) that discusses some of DreamBooth experiments in detail. Specifically, it recommends a set of DreamBooth-specific tips and tricks that we have found to work well for a variety of subjects.
|
||||
|
||||
## IF
|
||||
|
||||
You can use the lora and full dreambooth scripts to train the text to image [IF model](https://huggingface.co/DeepFloyd/IF-I-XL-v1.0) and the stage II upscaler
|
||||
You can use the lora and full dreambooth scripts to train the text to image [IF model](https://huggingface.co/DeepFloyd/IF-I-XL-v1.0) and the stage II upscaler
|
||||
[IF model](https://huggingface.co/DeepFloyd/IF-II-L-v1.0).
|
||||
|
||||
Note that IF has a predicted variance, and our finetuning scripts only train the models predicted error, so for finetuned IF models we switch to a fixed
|
||||
@@ -553,7 +553,7 @@ pipe.scheduler = pipe.scheduler.__class__.from_config(pipe.scheduler.config, var
|
||||
|
||||
Additionally, a few alternative cli flags are needed for IF.
|
||||
|
||||
`--resolution=64`: IF is a pixel space diffusion model. In order to operate on un-compressed pixels, the input images are of a much smaller resolution.
|
||||
`--resolution=64`: IF is a pixel space diffusion model. In order to operate on un-compressed pixels, the input images are of a much smaller resolution.
|
||||
|
||||
`--pre_compute_text_embeddings`: IF uses [T5](https://huggingface.co/docs/transformers/model_doc/t5) for its text encoder. In order to save GPU memory, we pre compute all text embeddings and then de-allocate
|
||||
T5.
|
||||
@@ -568,7 +568,7 @@ We find LoRA to be sufficient for finetuning the stage I model as the low resolu
|
||||
For common and/or not-visually complex object concepts, you can get away with not-finetuning the upscaler. Just be sure to adjust the prompt passed to the
|
||||
upscaler to remove the new token from the instance prompt. I.e. if your stage I prompt is "a sks dog", use "a dog" for your stage II prompt.
|
||||
|
||||
For finegrained detail like faces that aren't present in the original training set, we find that full finetuning of the stage II upscaler is better than
|
||||
For finegrained detail like faces that aren't present in the original training set, we find that full finetuning of the stage II upscaler is better than
|
||||
LoRA finetuning stage II.
|
||||
|
||||
For finegrained detail like faces, we find that lower learning rates along with larger batch sizes work best.
|
||||
@@ -647,7 +647,7 @@ python train_dreambooth_lora.py \
|
||||
--resolution=256 \
|
||||
--train_batch_size=4 \
|
||||
--gradient_accumulation_steps=1 \
|
||||
--learning_rate=1e-6 \
|
||||
--learning_rate=1e-6 \
|
||||
--max_train_steps=2000 \
|
||||
--validation_prompt="a sks dog" \
|
||||
--validation_epochs=100 \
|
||||
@@ -663,9 +663,9 @@ python train_dreambooth_lora.py \
|
||||
`--skip_save_text_encoder`: When training the full model, this will skip saving the entire T5 with the finetuned model. You can still load the pipeline
|
||||
with a T5 loaded from the original model.
|
||||
|
||||
`use_8bit_adam`: Due to the size of the optimizer states, we recommend training the full XL IF model with 8bit adam.
|
||||
`use_8bit_adam`: Due to the size of the optimizer states, we recommend training the full XL IF model with 8bit adam.
|
||||
|
||||
`--learning_rate=1e-7`: For full dreambooth, IF requires very low learning rates. With higher learning rates model quality will degrade. Note that it is
|
||||
`--learning_rate=1e-7`: For full dreambooth, IF requires very low learning rates. With higher learning rates model quality will degrade. Note that it is
|
||||
likely the learning rate can be increased with larger batch sizes.
|
||||
|
||||
Using 8bit adam and a batch size of 4, the model can be trained in ~48 GB VRAM.
|
||||
@@ -741,4 +741,4 @@ accelerate launch train_dreambooth.py \
|
||||
|
||||
## Stable Diffusion XL
|
||||
|
||||
We support fine-tuning of the UNet shipped in [Stable Diffusion XL](https://huggingface.co/papers/2307.01952) with DreamBooth and LoRA via the `train_dreambooth_lora_sdxl.py` script. Please refer to the docs [here](./README_sdxl.md).
|
||||
We support fine-tuning of the UNet shipped in [Stable Diffusion XL](https://huggingface.co/papers/2307.01952) with DreamBooth and LoRA via the `train_dreambooth_lora_sdxl.py` script. Please refer to the docs [here](./README_sdxl.md).
|
||||
|
||||
141
examples/dreambooth/README_sd3.md
Normal file
141
examples/dreambooth/README_sd3.md
Normal file
@@ -0,0 +1,141 @@
|
||||
# DreamBooth training example for Stable Diffusion 3 (SD3)
|
||||
|
||||
[DreamBooth](https://arxiv.org/abs/2208.12242) is a method to personalize text2image models like stable diffusion given just a few (3~5) images of a subject.
|
||||
|
||||
The `train_dreambooth_sd3.py` script shows how to implement the training procedure and adapt it for [Stable Diffusion 3](https://huggingface.co/papers/2403.03206). We also provide a LoRA implementation in the `train_dreambooth_lora_sd3.py` script.
|
||||
|
||||
> [!NOTE]
|
||||
> As the model is gated, before using it with diffusers you first need to go to the [Stable Diffusion 3 Medium Hugging Face page](https://huggingface.co/stabilityai/stable-diffusion-3-medium-diffusers), fill in the form and accept the gate. Once you are in, you need to log in so that your system knows you’ve accepted the gate. Use the command below to log in:
|
||||
|
||||
```bash
|
||||
huggingface-cli login
|
||||
```
|
||||
|
||||
## Running locally with PyTorch
|
||||
|
||||
### Installing the dependencies
|
||||
|
||||
Before running the scripts, make sure to install the library's training dependencies:
|
||||
|
||||
**Important**
|
||||
|
||||
To make sure you can successfully run the latest versions of the example scripts, we highly recommend **installing from source** and keeping the install up to date as we update the example scripts frequently and install some example-specific requirements. To do this, execute the following steps in a new virtual environment:
|
||||
|
||||
```bash
|
||||
git clone https://github.com/huggingface/diffusers
|
||||
cd diffusers
|
||||
pip install -e .
|
||||
```
|
||||
|
||||
Then cd in the `examples/dreambooth` folder and run
|
||||
```bash
|
||||
pip install -r requirements_sd3.txt
|
||||
```
|
||||
|
||||
And initialize an [🤗Accelerate](https://github.com/huggingface/accelerate/) environment with:
|
||||
|
||||
```bash
|
||||
accelerate config
|
||||
```
|
||||
|
||||
Or for a default accelerate configuration without answering questions about your environment
|
||||
|
||||
```bash
|
||||
accelerate config default
|
||||
```
|
||||
|
||||
Or if your environment doesn't support an interactive shell (e.g., a notebook)
|
||||
|
||||
```python
|
||||
from accelerate.utils import write_basic_config
|
||||
write_basic_config()
|
||||
```
|
||||
|
||||
When running `accelerate config`, if we specify torch compile mode to True there can be dramatic speedups.
|
||||
Note also that we use PEFT library as backend for LoRA training, make sure to have `peft>=0.6.0` installed in your environment.
|
||||
|
||||
|
||||
### Dog toy example
|
||||
|
||||
Now let's get our dataset. For this example we will use some dog images: https://huggingface.co/datasets/diffusers/dog-example.
|
||||
|
||||
Let's first download it locally:
|
||||
|
||||
```python
|
||||
from huggingface_hub import snapshot_download
|
||||
|
||||
local_dir = "./dog"
|
||||
snapshot_download(
|
||||
"diffusers/dog-example",
|
||||
local_dir=local_dir, repo_type="dataset",
|
||||
ignore_patterns=".gitattributes",
|
||||
)
|
||||
```
|
||||
|
||||
This will also allow us to push the trained LoRA parameters to the Hugging Face Hub platform.
|
||||
|
||||
Now, we can launch training using:
|
||||
|
||||
```bash
|
||||
export MODEL_NAME="stabilityai/stable-diffusion-3-medium-diffusers"
|
||||
export INSTANCE_DIR="dog"
|
||||
export OUTPUT_DIR="trained-sd3"
|
||||
|
||||
accelerate launch train_dreambooth_sd3.py \
|
||||
--pretrained_model_name_or_path=$MODEL_NAME \
|
||||
--instance_data_dir=$INSTANCE_DIR \
|
||||
--output_dir=$OUTPUT_DIR \
|
||||
--mixed_precision="fp16" \
|
||||
--instance_prompt="a photo of sks dog" \
|
||||
--resolution=1024 \
|
||||
--train_batch_size=1 \
|
||||
--gradient_accumulation_steps=4 \
|
||||
--learning_rate=1e-4 \
|
||||
--report_to="wandb" \
|
||||
--lr_scheduler="constant" \
|
||||
--lr_warmup_steps=0 \
|
||||
--max_train_steps=500 \
|
||||
--validation_prompt="A photo of sks dog in a bucket" \
|
||||
--validation_epochs=25 \
|
||||
--seed="0" \
|
||||
--push_to_hub
|
||||
```
|
||||
|
||||
To better track our training experiments, we're using the following flags in the command above:
|
||||
|
||||
* `report_to="wandb` will ensure the training runs are tracked on Weights and Biases. To use it, be sure to install `wandb` with `pip install wandb`.
|
||||
* `validation_prompt` and `validation_epochs` to allow the script to do a few validation inference runs. This allows us to qualitatively check if the training is progressing as expected.
|
||||
|
||||
> [!TIP]
|
||||
> You can pass `--use_8bit_adam` to reduce the memory requirements of training. Make sure to install `bitsandbytes` if you want to do so.
|
||||
|
||||
## LoRA + DreamBooth
|
||||
|
||||
[LoRA](https://huggingface.co/docs/peft/conceptual_guides/adapter#low-rank-adaptation-lora) is a popular parameter-efficient fine-tuning technique that allows you to achieve full-finetuning like performance but with a fraction of learnable parameters.
|
||||
|
||||
To perform DreamBooth with LoRA, run:
|
||||
|
||||
```bash
|
||||
export MODEL_NAME="stabilityai/stable-diffusion-3-medium-diffusers"
|
||||
export INSTANCE_DIR="dog"
|
||||
export OUTPUT_DIR="trained-sd3-lora"
|
||||
|
||||
accelerate launch train_dreambooth_lora_sd3.py \
|
||||
--pretrained_model_name_or_path=$MODEL_NAME \
|
||||
--instance_data_dir=$INSTANCE_DIR \
|
||||
--output_dir=$OUTPUT_DIR \
|
||||
--mixed_precision="fp16" \
|
||||
--instance_prompt="a photo of sks dog" \
|
||||
--resolution=512 \
|
||||
--train_batch_size=1 \
|
||||
--gradient_accumulation_steps=4 \
|
||||
--learning_rate=1e-5 \
|
||||
--report_to="wandb" \
|
||||
--lr_scheduler="constant" \
|
||||
--lr_warmup_steps=0 \
|
||||
--max_train_steps=500 \
|
||||
--validation_prompt="A photo of sks dog in a bucket" \
|
||||
--validation_epochs=25 \
|
||||
--seed="0" \
|
||||
--push_to_hub
|
||||
```
|
||||
7
examples/dreambooth/requirements_sd3.txt
Normal file
7
examples/dreambooth/requirements_sd3.txt
Normal file
@@ -0,0 +1,7 @@
|
||||
accelerate>=0.31.0
|
||||
torchvision
|
||||
transformers>=4.41.2
|
||||
ftfy
|
||||
tensorboard
|
||||
Jinja2
|
||||
peft== 0.11.1
|
||||
@@ -63,7 +63,7 @@ if is_wandb_available():
|
||||
import wandb
|
||||
|
||||
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
|
||||
check_min_version("0.28.0.dev0")
|
||||
check_min_version("0.29.0.dev0")
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
@@ -742,7 +742,7 @@ def collate_fn(examples, with_prior_preservation=False):
|
||||
|
||||
|
||||
class PromptDataset(Dataset):
|
||||
"A simple dataset to prepare the prompts to generate class images on multiple GPUs."
|
||||
"""A simple dataset to prepare the prompts to generate class images on multiple GPUs."""
|
||||
|
||||
def __init__(self, prompt, num_samples):
|
||||
self.prompt = prompt
|
||||
@@ -759,7 +759,7 @@ class PromptDataset(Dataset):
|
||||
|
||||
|
||||
def model_has_vae(args):
|
||||
config_file_name = os.path.join("vae", AutoencoderKL.config_name)
|
||||
config_file_name = Path("vae", AutoencoderKL.config_name).as_posix()
|
||||
if os.path.isdir(args.pretrained_model_name_or_path):
|
||||
config_file_name = os.path.join(args.pretrained_model_name_or_path, config_file_name)
|
||||
return os.path.isfile(config_file_name)
|
||||
|
||||
@@ -35,7 +35,7 @@ from diffusers.utils import check_min_version
|
||||
|
||||
|
||||
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
|
||||
check_min_version("0.28.0.dev0")
|
||||
check_min_version("0.29.0.dev0")
|
||||
|
||||
# Cache compiled models across invocations of this script.
|
||||
cc.initialize_cache(os.path.expanduser("~/.cache/jax/compilation_cache"))
|
||||
@@ -301,7 +301,7 @@ class DreamBoothDataset(Dataset):
|
||||
|
||||
|
||||
class PromptDataset(Dataset):
|
||||
"A simple dataset to prepare the prompts to generate class images on multiple GPUs."
|
||||
"""A simple dataset to prepare the prompts to generate class images on multiple GPUs."""
|
||||
|
||||
def __init__(self, prompt, num_samples):
|
||||
self.prompt = prompt
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user