mirror of
https://github.com/huggingface/diffusers.git
synced 2025-12-09 22:14:43 +08:00
Compare commits
4 Commits
str-to-boo
...
single-fil
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
57afaff270 | ||
|
|
e7b2032082 | ||
|
|
2d5e9c2e39 | ||
|
|
d68635f950 |
38
.github/ISSUE_TEMPLATE/bug-report.yml
vendored
38
.github/ISSUE_TEMPLATE/bug-report.yml
vendored
@@ -66,32 +66,32 @@ body:
|
||||
Questions on DiffusionPipeline (Saving, Loading, From pretrained, ...):
|
||||
|
||||
Questions on pipelines:
|
||||
- Stable Diffusion @yiyixuxu @DN6 @sayakpaul
|
||||
- Stable Diffusion XL @yiyixuxu @sayakpaul @DN6
|
||||
- Kandinsky @yiyixuxu
|
||||
- ControlNet @sayakpaul @yiyixuxu @DN6
|
||||
- T2I Adapter @sayakpaul @yiyixuxu @DN6
|
||||
- IF @DN6
|
||||
- Text-to-Video / Video-to-Video @DN6 @sayakpaul
|
||||
- Wuerstchen @DN6
|
||||
- Stable Diffusion @yiyixuxu @DN6 @sayakpaul @patrickvonplaten
|
||||
- Stable Diffusion XL @yiyixuxu @sayakpaul @DN6 @patrickvonplaten
|
||||
- Kandinsky @yiyixuxu @patrickvonplaten
|
||||
- ControlNet @sayakpaul @yiyixuxu @DN6 @patrickvonplaten
|
||||
- T2I Adapter @sayakpaul @yiyixuxu @DN6 @patrickvonplaten
|
||||
- IF @DN6 @patrickvonplaten
|
||||
- Text-to-Video / Video-to-Video @DN6 @sayakpaul @patrickvonplaten
|
||||
- Wuerstchen @DN6 @patrickvonplaten
|
||||
- Other: @yiyixuxu @DN6
|
||||
|
||||
Questions on models:
|
||||
- UNet @DN6 @yiyixuxu @sayakpaul
|
||||
- VAE @sayakpaul @DN6 @yiyixuxu
|
||||
- Transformers/Attention @DN6 @yiyixuxu @sayakpaul @DN6
|
||||
- UNet @DN6 @yiyixuxu @sayakpaul @patrickvonplaten
|
||||
- VAE @sayakpaul @DN6 @yiyixuxu @patrickvonplaten
|
||||
- Transformers/Attention @DN6 @yiyixuxu @sayakpaul @DN6 @patrickvonplaten
|
||||
|
||||
Questions on Schedulers: @yiyixuxu
|
||||
Questions on Schedulers: @yiyixuxu @patrickvonplaten
|
||||
|
||||
Questions on LoRA: @sayakpaul
|
||||
Questions on LoRA: @sayakpaul @patrickvonplaten
|
||||
|
||||
Questions on Textual Inversion: @sayakpaul
|
||||
Questions on Textual Inversion: @sayakpaul @patrickvonplaten
|
||||
|
||||
Questions on Training:
|
||||
- DreamBooth @sayakpaul
|
||||
- Text-to-Image Fine-tuning @sayakpaul
|
||||
- Textual Inversion @sayakpaul
|
||||
- ControlNet @sayakpaul
|
||||
- DreamBooth @sayakpaul @patrickvonplaten
|
||||
- Text-to-Image Fine-tuning @sayakpaul @patrickvonplaten
|
||||
- Textual Inversion @sayakpaul @patrickvonplaten
|
||||
- ControlNet @sayakpaul @patrickvonplaten
|
||||
|
||||
Questions on Tests: @DN6 @sayakpaul @yiyixuxu
|
||||
|
||||
@@ -99,7 +99,7 @@ body:
|
||||
|
||||
Questions on JAX- and MPS-related things: @pcuenca
|
||||
|
||||
Questions on audio pipelines: @DN6
|
||||
Questions on audio pipelines: @DN6 @patrickvonplaten
|
||||
|
||||
|
||||
|
||||
|
||||
10
.github/PULL_REQUEST_TEMPLATE.md
vendored
10
.github/PULL_REQUEST_TEMPLATE.md
vendored
@@ -38,13 +38,13 @@ members/contributors who may be interested in your PR.
|
||||
|
||||
Core library:
|
||||
|
||||
- Schedulers: @yiyixuxu
|
||||
- Pipelines: @sayakpaul @yiyixuxu @DN6
|
||||
- Training examples: @sayakpaul
|
||||
- Docs: @stevhliu and @sayakpaul
|
||||
- Schedulers: @yiyixuxu and @patrickvonplaten
|
||||
- Pipelines: @patrickvonplaten and @sayakpaul
|
||||
- Training examples: @sayakpaul and @patrickvonplaten
|
||||
- Docs: @stevhliu and @yiyixuxu
|
||||
- JAX and MPS: @pcuenca
|
||||
- Audio: @sanchit-gandhi
|
||||
- General functionalities: @sayakpaul @yiyixuxu @DN6
|
||||
- General functionalities: @patrickvonplaten and @sayakpaul
|
||||
|
||||
Integrations:
|
||||
|
||||
|
||||
6
.github/workflows/benchmark.yml
vendored
6
.github/workflows/benchmark.yml
vendored
@@ -1,7 +1,6 @@
|
||||
name: Benchmarking tests
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
schedule:
|
||||
- cron: "30 1 1,15 * *" # every 2 weeks on the 1st and the 15th of every month at 1:30 AM
|
||||
|
||||
@@ -32,9 +31,8 @@ jobs:
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
apt-get update && apt-get install libsndfile1-dev libgl1 -y
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m uv pip install -e [quality,test]
|
||||
python -m uv pip install pandas peft
|
||||
python -m pip install -e .[quality,test]
|
||||
python -m pip install pandas
|
||||
- name: Environment
|
||||
run: |
|
||||
python utils/print_env.py
|
||||
|
||||
69
.github/workflows/build_docker_images.yml
vendored
69
.github/workflows/build_docker_images.yml
vendored
@@ -1,58 +1,21 @@
|
||||
name: Test, build, and push Docker images
|
||||
name: Build Docker images (nightly)
|
||||
|
||||
on:
|
||||
pull_request: # During PRs, we just check if the changes Dockerfiles can be successfully built
|
||||
branches:
|
||||
- main
|
||||
paths:
|
||||
- "docker/**"
|
||||
workflow_dispatch:
|
||||
schedule:
|
||||
- cron: "0 0 * * *" # every day at midnight
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
group: docker-image-builds
|
||||
cancel-in-progress: false
|
||||
|
||||
env:
|
||||
REGISTRY: diffusers
|
||||
CI_SLACK_CHANNEL: ${{ secrets.CI_DOCKER_CHANNEL }}
|
||||
|
||||
jobs:
|
||||
test-build-docker-images:
|
||||
build-docker-images:
|
||||
runs-on: ubuntu-latest
|
||||
if: github.event_name == 'pull_request'
|
||||
steps:
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v1
|
||||
|
||||
- name: Check out code
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Find Changed Dockerfiles
|
||||
id: file_changes
|
||||
uses: jitterbit/get-changed-files@v1
|
||||
with:
|
||||
format: 'space-delimited'
|
||||
token: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
- name: Build Changed Docker Images
|
||||
run: |
|
||||
CHANGED_FILES="${{ steps.file_changes.outputs.all }}"
|
||||
for FILE in $CHANGED_FILES; do
|
||||
if [[ "$FILE" == docker/*Dockerfile ]]; then
|
||||
DOCKER_PATH="${FILE%/Dockerfile}"
|
||||
DOCKER_TAG=$(basename "$DOCKER_PATH")
|
||||
echo "Building Docker image for $DOCKER_TAG"
|
||||
docker build -t "$DOCKER_TAG" "$DOCKER_PATH"
|
||||
fi
|
||||
done
|
||||
if: steps.file_changes.outputs.all != ''
|
||||
|
||||
build-and-push-docker-images:
|
||||
runs-on: ubuntu-latest
|
||||
if: github.event_name != 'pull_request'
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
packages: write
|
||||
@@ -87,27 +50,3 @@ jobs:
|
||||
context: ./docker/${{ matrix.image-name }}
|
||||
push: true
|
||||
tags: ${{ env.REGISTRY }}/${{ matrix.image-name }}:latest
|
||||
|
||||
- name: Post to a Slack channel
|
||||
id: slack
|
||||
uses: slackapi/slack-github-action@6c661ce58804a1a20f6dc5fbee7f0381b469e001
|
||||
with:
|
||||
# Slack channel id, channel name, or user id to post message.
|
||||
# See also: https://api.slack.com/methods/chat.postMessage#channels
|
||||
channel-id: ${{ env.CI_SLACK_CHANNEL }}
|
||||
# For posting a rich message using Block Kit
|
||||
payload: |
|
||||
{
|
||||
"text": "${{ matrix.image-name }} Docker Image build result: ${{ job.status }}\n${{ github.event.head_commit.url }}",
|
||||
"blocks": [
|
||||
{
|
||||
"type": "section",
|
||||
"text": {
|
||||
"type": "mrkdwn",
|
||||
"text": "${{ matrix.image-name }} Docker Image build result: ${{ job.status }}\n${{ github.event.head_commit.url }}"
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
env:
|
||||
SLACK_BOT_TOKEN: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
|
||||
|
||||
4
.github/workflows/build_documentation.yml
vendored
4
.github/workflows/build_documentation.yml
vendored
@@ -7,10 +7,6 @@ on:
|
||||
- doc-builder*
|
||||
- v*-release
|
||||
- v*-patch
|
||||
paths:
|
||||
- "src/diffusers/**.py"
|
||||
- "examples/**"
|
||||
- "docs/**"
|
||||
|
||||
jobs:
|
||||
build:
|
||||
|
||||
4
.github/workflows/build_pr_documentation.yml
vendored
4
.github/workflows/build_pr_documentation.yml
vendored
@@ -2,10 +2,6 @@ name: Build PR Documentation
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
paths:
|
||||
- "src/diffusers/**.py"
|
||||
- "examples/**"
|
||||
- "docs/**"
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
|
||||
|
||||
42
.github/workflows/nightly_tests.yml
vendored
42
.github/workflows/nightly_tests.yml
vendored
@@ -12,7 +12,6 @@ env:
|
||||
PYTEST_TIMEOUT: 600
|
||||
RUN_SLOW: yes
|
||||
RUN_NIGHTLY: yes
|
||||
SLACK_API_TOKEN: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
|
||||
|
||||
jobs:
|
||||
run_nightly_tests:
|
||||
@@ -61,11 +60,9 @@ jobs:
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m uv pip install -e [quality,test]
|
||||
python -m uv pip install -U transformers@git+https://github.com/huggingface/transformers
|
||||
python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate
|
||||
python -m uv pip install pytest-reportlog
|
||||
python -m pip install -e .[quality,test]
|
||||
python -m pip install -U git+https://github.com/huggingface/transformers
|
||||
python -m pip install git+https://github.com/huggingface/accelerate
|
||||
|
||||
- name: Environment
|
||||
run: |
|
||||
@@ -76,23 +73,19 @@ jobs:
|
||||
env:
|
||||
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
|
||||
run: |
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
|
||||
-s -v -k "not Flax and not Onnx" \
|
||||
--make-reports=tests_${{ matrix.config.report }} \
|
||||
--report-log=${{ matrix.config.report }}.log \
|
||||
tests/
|
||||
tests/
|
||||
|
||||
- name: Run nightly Flax TPU tests
|
||||
if: ${{ matrix.config.framework == 'flax' }}
|
||||
env:
|
||||
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
|
||||
run: |
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m pytest -n 0 \
|
||||
-s -v -k "Flax" \
|
||||
--make-reports=tests_${{ matrix.config.report }} \
|
||||
--report-log=${{ matrix.config.report }}.log \
|
||||
tests/
|
||||
|
||||
- name: Run nightly ONNXRuntime CUDA tests
|
||||
@@ -100,11 +93,9 @@ jobs:
|
||||
env:
|
||||
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
|
||||
run: |
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
|
||||
-s -v -k "Onnx" \
|
||||
--make-reports=tests_${{ matrix.config.report }} \
|
||||
--report-log=${{ matrix.config.report }}.log \
|
||||
tests/
|
||||
|
||||
- name: Failure short reports
|
||||
@@ -117,12 +108,6 @@ jobs:
|
||||
with:
|
||||
name: ${{ matrix.config.report }}_test_reports
|
||||
path: reports
|
||||
|
||||
- name: Generate Report and Notify Channel
|
||||
if: always()
|
||||
run: |
|
||||
pip install slack_sdk tabulate
|
||||
python scripts/log_reports.py >> $GITHUB_STEP_SUMMARY
|
||||
|
||||
run_nightly_tests_apple_m1:
|
||||
name: Nightly PyTorch MPS tests on MacOS
|
||||
@@ -147,11 +132,10 @@ jobs:
|
||||
- name: Install dependencies
|
||||
shell: arch -arch arm64 bash {0}
|
||||
run: |
|
||||
${CONDA_RUN} python -m pip install --upgrade pip uv
|
||||
${CONDA_RUN} python -m uv pip install -e [quality,test]
|
||||
${CONDA_RUN} python -m uv pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu
|
||||
${CONDA_RUN} python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate
|
||||
${CONDA_RUN} python -m uv pip install pytest-reportlog
|
||||
${CONDA_RUN} python -m pip install --upgrade pip
|
||||
${CONDA_RUN} python -m pip install -e .[quality,test]
|
||||
${CONDA_RUN} python -m pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu
|
||||
${CONDA_RUN} python -m pip install git+https://github.com/huggingface/accelerate
|
||||
|
||||
- name: Environment
|
||||
shell: arch -arch arm64 bash {0}
|
||||
@@ -164,9 +148,7 @@ jobs:
|
||||
HF_HOME: /System/Volumes/Data/mnt/cache
|
||||
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
|
||||
run: |
|
||||
${CONDA_RUN} python -m pytest -n 1 -s -v --make-reports=tests_torch_mps \
|
||||
--report-log=tests_torch_mps.log \
|
||||
tests/
|
||||
${CONDA_RUN} python -m pytest -n 1 -s -v --make-reports=tests_torch_mps tests/
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
@@ -178,9 +160,3 @@ jobs:
|
||||
with:
|
||||
name: torch_mps_test_reports
|
||||
path: reports
|
||||
|
||||
- name: Generate Report and Notify Channel
|
||||
if: always()
|
||||
run: |
|
||||
pip install slack_sdk tabulate
|
||||
python scripts/log_reports.py >> $GITHUB_STEP_SUMMARY
|
||||
|
||||
23
.github/workflows/notify_slack_about_release.yml
vendored
23
.github/workflows/notify_slack_about_release.yml
vendored
@@ -1,23 +0,0 @@
|
||||
name: Notify Slack about a release
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
release:
|
||||
types: [published]
|
||||
|
||||
jobs:
|
||||
build:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
|
||||
- name: Setup Python
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: '3.8'
|
||||
|
||||
- name: Notify Slack about the release
|
||||
env:
|
||||
SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}
|
||||
run: pip install requests && python utils/notify_slack_about_release.py
|
||||
10
.github/workflows/pr_dependency_test.yml
vendored
10
.github/workflows/pr_dependency_test.yml
vendored
@@ -4,8 +4,6 @@ on:
|
||||
pull_request:
|
||||
branches:
|
||||
- main
|
||||
paths:
|
||||
- "src/diffusers/**.py"
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
@@ -25,12 +23,10 @@ jobs:
|
||||
python-version: "3.8"
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m pip install --upgrade pip uv
|
||||
python -m uv pip install -e .
|
||||
python -m uv pip install pytest
|
||||
python -m pip install --upgrade pip
|
||||
pip install -e .
|
||||
pip install pytest
|
||||
- name: Check for soft dependencies
|
||||
run: |
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
pytest tests/others/test_dependencies.py
|
||||
|
||||
16
.github/workflows/pr_flax_dependency_test.yml
vendored
16
.github/workflows/pr_flax_dependency_test.yml
vendored
@@ -4,8 +4,6 @@ on:
|
||||
pull_request:
|
||||
branches:
|
||||
- main
|
||||
paths:
|
||||
- "src/diffusers/**.py"
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
@@ -25,14 +23,12 @@ jobs:
|
||||
python-version: "3.8"
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m pip install --upgrade pip uv
|
||||
python -m uv pip install -e .
|
||||
python -m uv pip install "jax[cpu]>=0.2.16,!=0.3.2"
|
||||
python -m uv pip install "flax>=0.4.1"
|
||||
python -m uv pip install "jaxlib>=0.1.65"
|
||||
python -m uv pip install pytest
|
||||
python -m pip install --upgrade pip
|
||||
pip install -e .
|
||||
pip install "jax[cpu]>=0.2.16,!=0.3.2"
|
||||
pip install "flax>=0.4.1"
|
||||
pip install "jaxlib>=0.1.65"
|
||||
pip install pytest
|
||||
- name: Check for soft dependencies
|
||||
run: |
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
pytest tests/others/test_dependencies.py
|
||||
|
||||
49
.github/workflows/pr_quality.yml
vendored
Normal file
49
.github/workflows/pr_quality.yml
vendored
Normal file
@@ -0,0 +1,49 @@
|
||||
name: Run code quality checks
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
branches:
|
||||
- main
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
check_code_quality:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: "3.8"
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install .[quality]
|
||||
- name: Check quality
|
||||
run: |
|
||||
ruff check examples tests src utils scripts
|
||||
ruff format examples tests src utils scripts --check
|
||||
|
||||
check_repository_consistency:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: "3.8"
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install .[quality]
|
||||
- name: Check quality
|
||||
run: |
|
||||
python utils/check_copies.py
|
||||
python utils/check_dummies.py
|
||||
make deps_table_check_updated
|
||||
13
.github/workflows/pr_test_fetcher.yml
vendored
13
.github/workflows/pr_test_fetcher.yml
vendored
@@ -33,8 +33,7 @@ jobs:
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
apt-get update && apt-get install libsndfile1-dev libgl1 -y
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m uv pip install -e [quality,test]
|
||||
python -m pip install -e .[quality,test]
|
||||
- name: Environment
|
||||
run: |
|
||||
python utils/print_env.py
|
||||
@@ -90,18 +89,15 @@ jobs:
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
apt-get update && apt-get install libsndfile1-dev libgl1 -y
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m pip install -e [quality,test]
|
||||
python -m pip install -e .[quality,test]
|
||||
python -m pip install accelerate
|
||||
|
||||
- name: Environment
|
||||
run: |
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python utils/print_env.py
|
||||
|
||||
- name: Run all selected tests on CPU
|
||||
run: |
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m pytest -n 2 --dist=loadfile -v --make-reports=${{ matrix.modules }}_tests_cpu ${{ fromJson(needs.setup_pr_tests.outputs.test_map)[matrix.modules] }}
|
||||
|
||||
- name: Failure short reports
|
||||
@@ -148,18 +144,15 @@ jobs:
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
apt-get update && apt-get install libsndfile1-dev libgl1 -y
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m pip install -e [quality,test]
|
||||
python -m pip install -e .[quality,test]
|
||||
|
||||
- name: Environment
|
||||
run: |
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python utils/print_env.py
|
||||
|
||||
- name: Run Hub tests for models, schedulers, and pipelines on a staging env
|
||||
if: ${{ matrix.config.framework == 'hub_tests_pytorch' }}
|
||||
run: |
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
HUGGINGFACE_CO_STAGING=true python -m pytest \
|
||||
-m "is_staging_test" \
|
||||
--make-reports=tests_${{ matrix.config.report }} \
|
||||
|
||||
55
.github/workflows/pr_test_peft_backend.yml
vendored
55
.github/workflows/pr_test_peft_backend.yml
vendored
@@ -4,9 +4,6 @@ on:
|
||||
pull_request:
|
||||
branches:
|
||||
- main
|
||||
paths:
|
||||
- "src/diffusers/**.py"
|
||||
- "tests/**.py"
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
|
||||
@@ -19,44 +16,7 @@ env:
|
||||
PYTEST_TIMEOUT: 60
|
||||
|
||||
jobs:
|
||||
check_code_quality:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: "3.8"
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install .[quality]
|
||||
- name: Check quality
|
||||
run: |
|
||||
ruff check examples tests src utils scripts
|
||||
ruff format examples tests src utils scripts --check
|
||||
|
||||
check_repository_consistency:
|
||||
needs: check_code_quality
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: "3.8"
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install .[quality]
|
||||
- name: Check quality
|
||||
run: |
|
||||
python utils/check_copies.py
|
||||
python utils/check_dummies.py
|
||||
make deps_table_check_updated
|
||||
|
||||
run_fast_tests:
|
||||
needs: [check_code_quality, check_repository_consistency]
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
@@ -84,25 +44,22 @@ jobs:
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
apt-get update && apt-get install libsndfile1-dev libgl1 -y
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m uv pip install -e [quality,test]
|
||||
python -m pip install -e .[quality,test]
|
||||
if [ "${{ matrix.lib-versions }}" == "main" ]; then
|
||||
python -m uv pip install -U peft@git+https://github.com/huggingface/peft.git
|
||||
python -m uv pip install -U transformers@git+https://github.com/huggingface/transformers.git
|
||||
python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
|
||||
python -m pip install -U git+https://github.com/huggingface/peft.git
|
||||
python -m pip install -U git+https://github.com/huggingface/transformers.git
|
||||
python -m pip install -U git+https://github.com/huggingface/accelerate.git
|
||||
else
|
||||
python -m uv pip install -U peft transformers accelerate
|
||||
python -m pip install -U peft transformers accelerate
|
||||
fi
|
||||
|
||||
- name: Environment
|
||||
run: |
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python utils/print_env.py
|
||||
|
||||
- name: Run fast PyTorch LoRA CPU tests with PEFT backend
|
||||
run: |
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
|
||||
-s -v \
|
||||
--make-reports=tests_${{ matrix.config.report }} \
|
||||
tests/lora/
|
||||
tests/lora/test_lora_layers_peft.py
|
||||
|
||||
63
.github/workflows/pr_tests.yml
vendored
63
.github/workflows/pr_tests.yml
vendored
@@ -4,14 +4,6 @@ on:
|
||||
pull_request:
|
||||
branches:
|
||||
- main
|
||||
paths:
|
||||
- "src/diffusers/**.py"
|
||||
- "benchmarks/**.py"
|
||||
- "examples/**.py"
|
||||
- "scripts/**.py"
|
||||
- "tests/**.py"
|
||||
- ".github/**.yml"
|
||||
- "utils/**.py"
|
||||
push:
|
||||
branches:
|
||||
- ci-*
|
||||
@@ -27,44 +19,7 @@ env:
|
||||
PYTEST_TIMEOUT: 60
|
||||
|
||||
jobs:
|
||||
check_code_quality:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: "3.8"
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install .[quality]
|
||||
- name: Check quality
|
||||
run: |
|
||||
ruff check examples tests src utils scripts
|
||||
ruff format examples tests src utils scripts --check
|
||||
|
||||
check_repository_consistency:
|
||||
needs: check_code_quality
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: "3.8"
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install .[quality]
|
||||
- name: Check quality
|
||||
run: |
|
||||
python utils/check_copies.py
|
||||
python utils/check_dummies.py
|
||||
make deps_table_check_updated
|
||||
|
||||
run_fast_tests:
|
||||
needs: [check_code_quality, check_repository_consistency]
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
@@ -111,19 +66,16 @@ jobs:
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
apt-get update && apt-get install libsndfile1-dev libgl1 -y
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m uv pip install -e [quality,test]
|
||||
python -m uv pip install accelerate
|
||||
python -m pip install -e .[quality,test]
|
||||
python -m pip install accelerate
|
||||
|
||||
- name: Environment
|
||||
run: |
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python utils/print_env.py
|
||||
|
||||
- name: Run fast PyTorch Pipeline CPU tests
|
||||
if: ${{ matrix.config.framework == 'pytorch_pipelines' }}
|
||||
run: |
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \
|
||||
-s -v -k "not Flax and not Onnx" \
|
||||
--make-reports=tests_${{ matrix.config.report }} \
|
||||
@@ -132,7 +84,6 @@ jobs:
|
||||
- name: Run fast PyTorch Model Scheduler CPU tests
|
||||
if: ${{ matrix.config.framework == 'pytorch_models' }}
|
||||
run: |
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \
|
||||
-s -v -k "not Flax and not Onnx and not Dependency" \
|
||||
--make-reports=tests_${{ matrix.config.report }} \
|
||||
@@ -141,7 +92,6 @@ jobs:
|
||||
- name: Run fast Flax TPU tests
|
||||
if: ${{ matrix.config.framework == 'flax' }}
|
||||
run: |
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \
|
||||
-s -v -k "Flax" \
|
||||
--make-reports=tests_${{ matrix.config.report }} \
|
||||
@@ -150,8 +100,7 @@ jobs:
|
||||
- name: Run example PyTorch CPU tests
|
||||
if: ${{ matrix.config.framework == 'pytorch_examples' }}
|
||||
run: |
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m uv pip install peft
|
||||
python -m pip install peft
|
||||
python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \
|
||||
--make-reports=tests_${{ matrix.config.report }} \
|
||||
examples
|
||||
@@ -168,7 +117,6 @@ jobs:
|
||||
path: reports
|
||||
|
||||
run_staging_tests:
|
||||
needs: [check_code_quality, check_repository_consistency]
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
@@ -200,18 +148,15 @@ jobs:
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
apt-get update && apt-get install libsndfile1-dev libgl1 -y
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m uv pip install -e [quality,test]
|
||||
python -m pip install -e .[quality,test]
|
||||
|
||||
- name: Environment
|
||||
run: |
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python utils/print_env.py
|
||||
|
||||
- name: Run Hub tests for models, schedulers, and pipelines on a staging env
|
||||
if: ${{ matrix.config.framework == 'hub_tests_pytorch' }}
|
||||
run: |
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
HUGGINGFACE_CO_STAGING=true python -m pytest \
|
||||
-m "is_staging_test" \
|
||||
--make-reports=tests_${{ matrix.config.report }} \
|
||||
|
||||
12
.github/workflows/pr_torch_dependency_test.yml
vendored
12
.github/workflows/pr_torch_dependency_test.yml
vendored
@@ -4,8 +4,6 @@ on:
|
||||
pull_request:
|
||||
branches:
|
||||
- main
|
||||
paths:
|
||||
- "src/diffusers/**.py"
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
@@ -25,12 +23,10 @@ jobs:
|
||||
python-version: "3.8"
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m pip install --upgrade pip uv
|
||||
python -m uv pip install -e .
|
||||
python -m uv pip install torch torchvision torchaudio
|
||||
python -m uv pip install pytest
|
||||
python -m pip install --upgrade pip
|
||||
pip install -e .
|
||||
pip install torch torchvision torchaudio
|
||||
pip install pytest
|
||||
- name: Check for soft dependencies
|
||||
run: |
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
pytest tests/others/test_dependencies.py
|
||||
|
||||
66
.github/workflows/push_tests.yml
vendored
66
.github/workflows/push_tests.yml
vendored
@@ -4,10 +4,7 @@ on:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
paths:
|
||||
- "src/diffusers/**.py"
|
||||
- "examples/**.py"
|
||||
- "tests/**.py"
|
||||
|
||||
|
||||
env:
|
||||
DIFFUSERS_IS_CI: yes
|
||||
@@ -21,7 +18,10 @@ env:
|
||||
jobs:
|
||||
setup_torch_cuda_pipeline_matrix:
|
||||
name: Setup Torch Pipelines CUDA Slow Tests Matrix
|
||||
runs-on: ubuntu-latest
|
||||
runs-on: docker-gpu
|
||||
container:
|
||||
image: diffusers/diffusers-pytorch-cpu # this is a CPU image, but we need it to fetch the matrix
|
||||
options: --shm-size "16gb" --ipc host
|
||||
outputs:
|
||||
pipeline_test_matrix: ${{ steps.fetch_pipeline_matrix.outputs.pipeline_test_matrix }}
|
||||
steps:
|
||||
@@ -29,20 +29,23 @@ jobs:
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
fetch-depth: 2
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: "3.8"
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
pip install -e .
|
||||
pip install huggingface_hub
|
||||
apt-get update && apt-get install libsndfile1-dev libgl1 -y
|
||||
python -m pip install -e .[quality,test]
|
||||
python -m pip install git+https://github.com/huggingface/accelerate.git
|
||||
|
||||
- name: Environment
|
||||
run: |
|
||||
python utils/print_env.py
|
||||
|
||||
- name: Fetch Pipeline Matrix
|
||||
id: fetch_pipeline_matrix
|
||||
run: |
|
||||
matrix=$(python utils/fetch_torch_cuda_pipeline_test_matrix.py)
|
||||
echo $matrix
|
||||
echo "pipeline_test_matrix=$matrix" >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Pipeline Tests Artifacts
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@v2
|
||||
@@ -55,9 +58,10 @@ jobs:
|
||||
needs: setup_torch_cuda_pipeline_matrix
|
||||
strategy:
|
||||
fail-fast: false
|
||||
max-parallel: 1
|
||||
matrix:
|
||||
module: ${{ fromJson(needs.setup_torch_cuda_pipeline_matrix.outputs.pipeline_test_matrix) }}
|
||||
runs-on: [single-gpu, nvidia-gpu, t4, ci]
|
||||
runs-on: docker-gpu
|
||||
container:
|
||||
image: diffusers/diffusers-pytorch-cuda
|
||||
options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0
|
||||
@@ -72,9 +76,8 @@ jobs:
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
apt-get update && apt-get install libsndfile1-dev libgl1 -y
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m uv pip install -e [quality,test]
|
||||
python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
|
||||
python -m pip install -e .[quality,test]
|
||||
python -m pip install git+https://github.com/huggingface/accelerate.git
|
||||
- name: Environment
|
||||
run: |
|
||||
python utils/print_env.py
|
||||
@@ -122,9 +125,8 @@ jobs:
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
apt-get update && apt-get install libsndfile1-dev libgl1 -y
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m uv pip install -e [quality,test]
|
||||
python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
|
||||
python -m pip install -e .[quality,test]
|
||||
python -m pip install git+https://github.com/huggingface/accelerate.git
|
||||
|
||||
- name: Environment
|
||||
run: |
|
||||
@@ -172,10 +174,9 @@ jobs:
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
apt-get update && apt-get install libsndfile1-dev libgl1 -y
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m uv pip install -e [quality,test]
|
||||
python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
|
||||
python -m uv pip install peft@git+https://github.com/huggingface/peft.git
|
||||
python -m pip install -e .[quality,test]
|
||||
python -m pip install git+https://github.com/huggingface/accelerate.git
|
||||
python -m pip install git+https://github.com/huggingface/peft.git
|
||||
|
||||
- name: Environment
|
||||
run: |
|
||||
@@ -223,9 +224,8 @@ jobs:
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
apt-get update && apt-get install libsndfile1-dev libgl1 -y
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m uv pip install -e [quality,test]
|
||||
python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
|
||||
python -m pip install -e .[quality,test]
|
||||
python -m pip install git+https://github.com/huggingface/accelerate.git
|
||||
|
||||
- name: Environment
|
||||
run: |
|
||||
@@ -271,9 +271,8 @@ jobs:
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
apt-get update && apt-get install libsndfile1-dev libgl1 -y
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m uv pip install -e [quality,test]
|
||||
python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
|
||||
python -m pip install -e .[quality,test]
|
||||
python -m pip install git+https://github.com/huggingface/accelerate.git
|
||||
|
||||
- name: Environment
|
||||
run: |
|
||||
@@ -321,8 +320,7 @@ jobs:
|
||||
nvidia-smi
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m uv pip install -e [quality,test,training]
|
||||
python -m pip install -e .[quality,test,training]
|
||||
- name: Environment
|
||||
run: |
|
||||
python utils/print_env.py
|
||||
@@ -362,8 +360,7 @@ jobs:
|
||||
nvidia-smi
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m uv pip install -e [quality,test,training]
|
||||
python -m pip install -e .[quality,test,training]
|
||||
- name: Environment
|
||||
run: |
|
||||
python utils/print_env.py
|
||||
@@ -404,19 +401,16 @@ jobs:
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m uv pip install -e [quality,test,training]
|
||||
python -m pip install -e .[quality,test,training]
|
||||
|
||||
- name: Environment
|
||||
run: |
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python utils/print_env.py
|
||||
|
||||
- name: Run example tests on GPU
|
||||
env:
|
||||
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
|
||||
run: |
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v --make-reports=examples_torch_cuda examples/
|
||||
|
||||
- name: Failure short reports
|
||||
|
||||
14
.github/workflows/push_tests_fast.yml
vendored
14
.github/workflows/push_tests_fast.yml
vendored
@@ -4,10 +4,6 @@ on:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
paths:
|
||||
- "src/diffusers/**.py"
|
||||
- "examples/**.py"
|
||||
- "tests/**.py"
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
|
||||
@@ -69,18 +65,15 @@ jobs:
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
apt-get update && apt-get install libsndfile1-dev libgl1 -y
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m uv pip install -e [quality,test]
|
||||
python -m pip install -e .[quality,test]
|
||||
|
||||
- name: Environment
|
||||
run: |
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python utils/print_env.py
|
||||
|
||||
- name: Run fast PyTorch CPU tests
|
||||
if: ${{ matrix.config.framework == 'pytorch' }}
|
||||
run: |
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \
|
||||
-s -v -k "not Flax and not Onnx" \
|
||||
--make-reports=tests_${{ matrix.config.report }} \
|
||||
@@ -89,7 +82,6 @@ jobs:
|
||||
- name: Run fast Flax TPU tests
|
||||
if: ${{ matrix.config.framework == 'flax' }}
|
||||
run: |
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \
|
||||
-s -v -k "Flax" \
|
||||
--make-reports=tests_${{ matrix.config.report }} \
|
||||
@@ -98,7 +90,6 @@ jobs:
|
||||
- name: Run fast ONNXRuntime CPU tests
|
||||
if: ${{ matrix.config.framework == 'onnxruntime' }}
|
||||
run: |
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \
|
||||
-s -v -k "Onnx" \
|
||||
--make-reports=tests_${{ matrix.config.report }} \
|
||||
@@ -107,8 +98,7 @@ jobs:
|
||||
- name: Run example PyTorch CPU tests
|
||||
if: ${{ matrix.config.framework == 'pytorch_examples' }}
|
||||
run: |
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m uv pip install peft
|
||||
python -m pip install peft
|
||||
python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \
|
||||
--make-reports=tests_${{ matrix.config.report }} \
|
||||
examples
|
||||
|
||||
13
.github/workflows/push_tests_mps.yml
vendored
13
.github/workflows/push_tests_mps.yml
vendored
@@ -4,9 +4,6 @@ on:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
paths:
|
||||
- "src/diffusers/**.py"
|
||||
- "tests/**.py"
|
||||
|
||||
env:
|
||||
DIFFUSERS_IS_CI: yes
|
||||
@@ -44,11 +41,11 @@ jobs:
|
||||
- name: Install dependencies
|
||||
shell: arch -arch arm64 bash {0}
|
||||
run: |
|
||||
${CONDA_RUN} python -m pip install --upgrade pip uv
|
||||
${CONDA_RUN} python -m uv pip install -e [quality,test]
|
||||
${CONDA_RUN} python -m uv pip install torch torchvision torchaudio
|
||||
${CONDA_RUN} python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
|
||||
${CONDA_RUN} python -m uv pip install transformers --upgrade
|
||||
${CONDA_RUN} python -m pip install --upgrade pip
|
||||
${CONDA_RUN} python -m pip install -e .[quality,test]
|
||||
${CONDA_RUN} python -m pip install torch torchvision torchaudio
|
||||
${CONDA_RUN} python -m pip install git+https://github.com/huggingface/accelerate.git
|
||||
${CONDA_RUN} python -m pip install transformers --upgrade
|
||||
|
||||
- name: Environment
|
||||
shell: arch -arch arm64 bash {0}
|
||||
|
||||
81
.github/workflows/pypi_publish.yaml
vendored
81
.github/workflows/pypi_publish.yaml
vendored
@@ -1,81 +0,0 @@
|
||||
# Adapted from https://blog.deepjyoti30.dev/pypi-release-github-action
|
||||
|
||||
name: PyPI release
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
push:
|
||||
tags:
|
||||
- "*"
|
||||
|
||||
jobs:
|
||||
find-and-checkout-latest-branch:
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
latest_branch: ${{ steps.set_latest_branch.outputs.latest_branch }}
|
||||
steps:
|
||||
- name: Checkout Repo
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: '3.8'
|
||||
|
||||
- name: Fetch latest branch
|
||||
id: fetch_latest_branch
|
||||
run: |
|
||||
pip install -U requests packaging
|
||||
LATEST_BRANCH=$(python utils/fetch_latest_release_branch.py)
|
||||
echo "Latest branch: $LATEST_BRANCH"
|
||||
echo "latest_branch=$LATEST_BRANCH" >> $GITHUB_ENV
|
||||
|
||||
- name: Set latest branch output
|
||||
id: set_latest_branch
|
||||
run: echo "::set-output name=latest_branch::${{ env.latest_branch }}"
|
||||
|
||||
release:
|
||||
needs: find-and-checkout-latest-branch
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- name: Checkout Repo
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
ref: ${{ needs.find-and-checkout-latest-branch.outputs.latest_branch }}
|
||||
|
||||
- name: Setup Python
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: "3.8"
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install -U setuptools wheel twine
|
||||
pip install -U torch --index-url https://download.pytorch.org/whl/cpu
|
||||
pip install -U transformers
|
||||
|
||||
- name: Build the dist files
|
||||
run: python setup.py bdist_wheel && python setup.py sdist
|
||||
|
||||
- name: Publish to the test PyPI
|
||||
env:
|
||||
TWINE_USERNAME: ${{ secrets.TEST_PYPI_USERNAME }}
|
||||
TWINE_PASSWORD: ${{ secrets.TEST_PYPI_PASSWORD }}
|
||||
run: twine upload dist/* -r pypitest --repository-url=https://test.pypi.org/legacy/
|
||||
|
||||
- name: Test installing diffusers and importing
|
||||
run: |
|
||||
pip install diffusers && pip uninstall diffusers -y
|
||||
pip install -i https://testpypi.python.org/pypi diffusers
|
||||
python -c "from diffusers import __version__; print(__version__)"
|
||||
python -c "from diffusers import DiffusionPipeline; pipe = DiffusionPipeline.from_pretrained('fusing/unet-ldm-dummy-update'); pipe()"
|
||||
python -c "from diffusers import DiffusionPipeline; pipe = DiffusionPipeline.from_pretrained('hf-internal-testing/tiny-stable-diffusion-pipe', safety_checker=None); pipe('ah suh du')"
|
||||
python -c "from diffusers import *"
|
||||
|
||||
- name: Publish to PyPI
|
||||
env:
|
||||
TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
|
||||
TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
|
||||
run: twine upload dist/* -r pypi
|
||||
10
CITATION.cff
10
CITATION.cff
@@ -19,16 +19,6 @@ authors:
|
||||
family-names: Rasul
|
||||
- given-names: Mishig
|
||||
family-names: Davaadorj
|
||||
- given-names: Dhruv
|
||||
family-names: Nair
|
||||
- given-names: Sayak
|
||||
family-names: Paul
|
||||
- given-names: Steven
|
||||
family-names: Liu
|
||||
- given-names: William
|
||||
family-names: Berman
|
||||
- given-names: Yiyi
|
||||
family-names: Xu
|
||||
- given-names: Thomas
|
||||
family-names: Wolf
|
||||
repository-code: 'https://github.com/huggingface/diffusers'
|
||||
|
||||
@@ -77,7 +77,7 @@ Please refer to the [How to use Stable Diffusion in Apple Silicon](https://huggi
|
||||
|
||||
## Quickstart
|
||||
|
||||
Generating outputs is super easy with 🤗 Diffusers. To generate an image from text, use the `from_pretrained` method to load any pretrained diffusion model (browse the [Hub](https://huggingface.co/models?library=diffusers&sort=downloads) for 22000+ checkpoints):
|
||||
Generating outputs is super easy with 🤗 Diffusers. To generate an image from text, use the `from_pretrained` method to load any pretrained diffusion model (browse the [Hub](https://huggingface.co/models?library=diffusers&sort=downloads) for 19000+ checkpoints):
|
||||
|
||||
```python
|
||||
from diffusers import DiffusionPipeline
|
||||
@@ -219,7 +219,7 @@ Also, say 👋 in our public Discord channel <a href="https://discord.gg/G7tWnz9
|
||||
- https://github.com/deep-floyd/IF
|
||||
- https://github.com/bentoml/BentoML
|
||||
- https://github.com/bmaltais/kohya_ss
|
||||
- +9000 other amazing GitHub repositories 💪
|
||||
- +8000 other amazing GitHub repositories 💪
|
||||
|
||||
Thank you for using us ❤️.
|
||||
|
||||
@@ -238,7 +238,7 @@ We also want to thank @heejkoo for the very helpful overview of papers, code and
|
||||
|
||||
```bibtex
|
||||
@misc{von-platen-etal-2022-diffusers,
|
||||
author = {Patrick von Platen and Suraj Patil and Anton Lozhkov and Pedro Cuenca and Nathan Lambert and Kashif Rasul and Mishig Davaadorj and Dhruv Nair and Sayak Paul and William Berman and Yiyi Xu and Steven Liu and Thomas Wolf},
|
||||
author = {Patrick von Platen and Suraj Patil and Anton Lozhkov and Pedro Cuenca and Nathan Lambert and Kashif Rasul and Mishig Davaadorj and Thomas Wolf},
|
||||
title = {Diffusers: State-of-the-art diffusion models},
|
||||
year = {2022},
|
||||
publisher = {GitHub},
|
||||
|
||||
@@ -141,7 +141,6 @@ class LCMLoRATextToImageBenchmark(TextToImageBenchmark):
|
||||
super().__init__(args)
|
||||
self.pipe.load_lora_weights(self.lora_id)
|
||||
self.pipe.fuse_lora()
|
||||
self.pipe.unload_lora_weights()
|
||||
self.pipe.scheduler = LCMScheduler.from_config(self.pipe.scheduler.config)
|
||||
|
||||
def get_result_filepath(self, args):
|
||||
@@ -236,35 +235,6 @@ class InpaintingBenchmark(ImageToImageBenchmark):
|
||||
)
|
||||
|
||||
|
||||
class IPAdapterTextToImageBenchmark(TextToImageBenchmark):
|
||||
url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/load_neg_embed.png"
|
||||
image = load_image(url)
|
||||
|
||||
def __init__(self, args):
|
||||
pipe = self.pipeline_class.from_pretrained(args.ckpt, torch_dtype=torch.float16).to("cuda")
|
||||
pipe.load_ip_adapter(
|
||||
args.ip_adapter_id[0],
|
||||
subfolder="models" if "sdxl" not in args.ip_adapter_id[1] else "sdxl_models",
|
||||
weight_name=args.ip_adapter_id[1],
|
||||
)
|
||||
|
||||
if args.run_compile:
|
||||
pipe.unet.to(memory_format=torch.channels_last)
|
||||
print("Run torch compile")
|
||||
pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
|
||||
|
||||
pipe.set_progress_bar_config(disable=True)
|
||||
self.pipe = pipe
|
||||
|
||||
def run_inference(self, pipe, args):
|
||||
_ = pipe(
|
||||
prompt=PROMPT,
|
||||
ip_adapter_image=self.image,
|
||||
num_inference_steps=args.num_inference_steps,
|
||||
num_images_per_prompt=args.batch_size,
|
||||
)
|
||||
|
||||
|
||||
class ControlNetBenchmark(TextToImageBenchmark):
|
||||
pipeline_class = StableDiffusionControlNetPipeline
|
||||
aux_network_class = ControlNetModel
|
||||
|
||||
@@ -1,32 +0,0 @@
|
||||
import argparse
|
||||
import sys
|
||||
|
||||
|
||||
sys.path.append(".")
|
||||
from base_classes import IPAdapterTextToImageBenchmark # noqa: E402
|
||||
|
||||
|
||||
IP_ADAPTER_CKPTS = {
|
||||
"runwayml/stable-diffusion-v1-5": ("h94/IP-Adapter", "ip-adapter_sd15.bin"),
|
||||
"stabilityai/stable-diffusion-xl-base-1.0": ("h94/IP-Adapter", "ip-adapter_sdxl.bin"),
|
||||
}
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
"--ckpt",
|
||||
type=str,
|
||||
default="runwayml/stable-diffusion-v1-5",
|
||||
choices=list(IP_ADAPTER_CKPTS.keys()),
|
||||
)
|
||||
parser.add_argument("--batch_size", type=int, default=1)
|
||||
parser.add_argument("--num_inference_steps", type=int, default=50)
|
||||
parser.add_argument("--model_cpu_offload", action="store_true")
|
||||
parser.add_argument("--run_compile", action="store_true")
|
||||
args = parser.parse_args()
|
||||
|
||||
args.ip_adapter_id = IP_ADAPTER_CKPTS[args.ckpt]
|
||||
benchmark_pipe = IPAdapterTextToImageBenchmark(args)
|
||||
args.ckpt = f"{args.ckpt} (IP-Adapter)"
|
||||
benchmark_pipe.benchmark(args)
|
||||
@@ -72,7 +72,7 @@ def main():
|
||||
command += " --run_compile"
|
||||
run_command(command.split())
|
||||
|
||||
elif file in ["benchmark_sd_inpainting.py", "benchmark_ip_adapters.py"]:
|
||||
elif file == "benchmark_sd_inpainting.py":
|
||||
sdxl_ckpt = "stabilityai/stable-diffusion-xl-base-1.0"
|
||||
command = f"python {file} --ckpt {sdxl_ckpt}"
|
||||
run_command(command.split())
|
||||
|
||||
@@ -23,13 +23,13 @@ ENV PATH="/opt/venv/bin:$PATH"
|
||||
|
||||
# pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
|
||||
# follow the instructions here: https://cloud.google.com/tpu/docs/run-in-container#train_a_jax_model_in_a_docker_container
|
||||
RUN python3 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
|
||||
python3 -m uv pip install --upgrade --no-cache-dir \
|
||||
RUN python3 -m pip install --no-cache-dir --upgrade pip && \
|
||||
python3 -m pip install --upgrade --no-cache-dir \
|
||||
clu \
|
||||
"jax[cpu]>=0.2.16,!=0.3.2" \
|
||||
"flax>=0.4.1" \
|
||||
"jaxlib>=0.1.65" && \
|
||||
python3 -m uv pip install --no-cache-dir \
|
||||
python3 -m pip install --no-cache-dir \
|
||||
accelerate \
|
||||
datasets \
|
||||
hf-doc-builder \
|
||||
|
||||
@@ -23,15 +23,15 @@ ENV PATH="/opt/venv/bin:$PATH"
|
||||
|
||||
# pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
|
||||
# follow the instructions here: https://cloud.google.com/tpu/docs/run-in-container#train_a_jax_model_in_a_docker_container
|
||||
RUN python3 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
|
||||
RUN python3 -m pip install --no-cache-dir --upgrade pip && \
|
||||
python3 -m pip install --no-cache-dir \
|
||||
"jax[tpu]>=0.2.16,!=0.3.2" \
|
||||
-f https://storage.googleapis.com/jax-releases/libtpu_releases.html && \
|
||||
python3 -m uv pip install --upgrade --no-cache-dir \
|
||||
python3 -m pip install --upgrade --no-cache-dir \
|
||||
clu \
|
||||
"flax>=0.4.1" \
|
||||
"jaxlib>=0.1.65" && \
|
||||
python3 -m uv pip install --no-cache-dir \
|
||||
python3 -m pip install --no-cache-dir \
|
||||
accelerate \
|
||||
datasets \
|
||||
hf-doc-builder \
|
||||
|
||||
@@ -22,14 +22,14 @@ RUN python3 -m venv /opt/venv
|
||||
ENV PATH="/opt/venv/bin:$PATH"
|
||||
|
||||
# pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
|
||||
RUN python3 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
|
||||
python3 -m uv pip install --no-cache-dir \
|
||||
RUN python3 -m pip install --no-cache-dir --upgrade pip && \
|
||||
python3 -m pip install --no-cache-dir \
|
||||
torch==2.1.2 \
|
||||
torchvision==0.16.2 \
|
||||
torchaudio==2.1.2 \
|
||||
onnxruntime \
|
||||
--extra-index-url https://download.pytorch.org/whl/cpu && \
|
||||
python3 -m uv pip install --no-cache-dir \
|
||||
python3 -m pip install --no-cache-dir \
|
||||
accelerate \
|
||||
datasets \
|
||||
hf-doc-builder \
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
FROM nvidia/cuda:12.1.0-runtime-ubuntu20.04
|
||||
FROM nvidia/cuda:11.6.2-cudnn8-devel-ubuntu20.04
|
||||
LABEL maintainer="Hugging Face"
|
||||
LABEL repository="diffusers"
|
||||
|
||||
@@ -22,14 +22,14 @@ RUN python3 -m venv /opt/venv
|
||||
ENV PATH="/opt/venv/bin:$PATH"
|
||||
|
||||
# pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
|
||||
RUN python3 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
|
||||
python3 -m uv pip install --no-cache-dir \
|
||||
torch \
|
||||
torchvision \
|
||||
torchaudio \
|
||||
RUN python3 -m pip install --no-cache-dir --upgrade pip && \
|
||||
python3 -m pip install --no-cache-dir \
|
||||
torch==2.1.2 \
|
||||
torchvision==0.16.2 \
|
||||
torchaudio==2.1.2 \
|
||||
"onnxruntime-gpu>=1.13.1" \
|
||||
--extra-index-url https://download.pytorch.org/whl/cu117 && \
|
||||
python3 -m uv pip install --no-cache-dir \
|
||||
python3 -m pip install --no-cache-dir \
|
||||
accelerate \
|
||||
datasets \
|
||||
hf-doc-builder \
|
||||
|
||||
@@ -24,8 +24,8 @@ RUN python3.9 -m venv /opt/venv
|
||||
ENV PATH="/opt/venv/bin:$PATH"
|
||||
|
||||
# pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
|
||||
RUN python3.9 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
|
||||
python3.9 -m uv pip install --no-cache-dir \
|
||||
RUN python3.9 -m pip install --no-cache-dir --upgrade pip && \
|
||||
python3.9 -m pip install --no-cache-dir \
|
||||
torch \
|
||||
torchvision \
|
||||
torchaudio \
|
||||
|
||||
@@ -23,14 +23,14 @@ RUN python3 -m venv /opt/venv
|
||||
ENV PATH="/opt/venv/bin:$PATH"
|
||||
|
||||
# pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
|
||||
RUN python3 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
|
||||
python3 -m uv pip install --no-cache-dir \
|
||||
RUN python3 -m pip install --no-cache-dir --upgrade pip && \
|
||||
python3 -m pip install --no-cache-dir \
|
||||
torch \
|
||||
torchvision \
|
||||
torchaudio \
|
||||
invisible_watermark \
|
||||
--extra-index-url https://download.pytorch.org/whl/cpu && \
|
||||
python3 -m uv pip install --no-cache-dir \
|
||||
python3 -m pip install --no-cache-dir \
|
||||
accelerate \
|
||||
datasets \
|
||||
hf-doc-builder \
|
||||
@@ -40,6 +40,6 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
|
||||
numpy \
|
||||
scipy \
|
||||
tensorboard \
|
||||
transformers matplotlib
|
||||
transformers
|
||||
|
||||
CMD ["/bin/bash"]
|
||||
|
||||
@@ -23,8 +23,8 @@ RUN python3 -m venv /opt/venv
|
||||
ENV PATH="/opt/venv/bin:$PATH"
|
||||
|
||||
# pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
|
||||
RUN python3 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
|
||||
python3 -m uv pip install --no-cache-dir \
|
||||
RUN python3 -m pip install --no-cache-dir --upgrade pip && \
|
||||
python3 -m pip install --no-cache-dir \
|
||||
torch \
|
||||
torchvision \
|
||||
torchaudio \
|
||||
|
||||
@@ -23,13 +23,13 @@ RUN python3 -m venv /opt/venv
|
||||
ENV PATH="/opt/venv/bin:$PATH"
|
||||
|
||||
# pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
|
||||
RUN python3 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
|
||||
RUN python3 -m pip install --no-cache-dir --upgrade pip && \
|
||||
python3 -m pip install --no-cache-dir \
|
||||
torch \
|
||||
torchvision \
|
||||
torchaudio \
|
||||
invisible_watermark && \
|
||||
python3 -m uv pip install --no-cache-dir \
|
||||
python3 -m pip install --no-cache-dir \
|
||||
accelerate \
|
||||
datasets \
|
||||
hf-doc-builder \
|
||||
|
||||
@@ -18,7 +18,7 @@
|
||||
- local: tutorials/basic_training
|
||||
title: Train a diffusion model
|
||||
- local: tutorials/using_peft_for_inference
|
||||
title: Load LoRAs for inference
|
||||
title: Inference with PEFT
|
||||
- local: tutorials/fast_diffusion
|
||||
title: Accelerate inference of text-to-image diffusion models
|
||||
title: Tutorials
|
||||
@@ -52,18 +52,12 @@
|
||||
title: Image-to-image
|
||||
- local: using-diffusers/inpaint
|
||||
title: Inpainting
|
||||
- local: using-diffusers/text-img2vid
|
||||
title: Text or image-to-video
|
||||
- local: using-diffusers/depth2img
|
||||
title: Depth-to-image
|
||||
title: Tasks
|
||||
- sections:
|
||||
- local: using-diffusers/textual_inversion_inference
|
||||
title: Textual inversion
|
||||
- local: using-diffusers/ip_adapter
|
||||
title: IP-Adapter
|
||||
- local: using-diffusers/merge_loras
|
||||
title: Merge LoRAs
|
||||
- local: training/distributed_inference
|
||||
title: Distributed inference with multiple GPUs
|
||||
- local: using-diffusers/reusing_seeds
|
||||
@@ -104,8 +98,6 @@
|
||||
title: Latent Consistency Model-LoRA
|
||||
- local: using-diffusers/inference_with_lcm
|
||||
title: Latent Consistency Model
|
||||
- local: using-diffusers/inference_with_tcd_lora
|
||||
title: Trajectory Consistency Distillation-LoRA
|
||||
- local: using-diffusers/svd
|
||||
title: Stable Video Diffusion
|
||||
title: Specific pipeline examples
|
||||
@@ -306,8 +298,6 @@
|
||||
title: Latent Consistency Models
|
||||
- local: api/pipelines/latent_diffusion
|
||||
title: Latent Diffusion
|
||||
- local: api/pipelines/ledits_pp
|
||||
title: LEDITS++
|
||||
- local: api/pipelines/panorama
|
||||
title: MultiDiffusion
|
||||
- local: api/pipelines/musicldm
|
||||
@@ -324,8 +314,6 @@
|
||||
title: Semantic Guidance
|
||||
- local: api/pipelines/shap_e
|
||||
title: Shap-E
|
||||
- local: api/pipelines/stable_cascade
|
||||
title: Stable Cascade
|
||||
- sections:
|
||||
- local: api/pipelines/stable_diffusion/overview
|
||||
title: Overview
|
||||
@@ -333,8 +321,6 @@
|
||||
title: Text-to-image
|
||||
- local: api/pipelines/stable_diffusion/img2img
|
||||
title: Image-to-image
|
||||
- local: api/pipelines/stable_diffusion/svd
|
||||
title: Image-to-video
|
||||
- local: api/pipelines/stable_diffusion/inpaint
|
||||
title: Inpainting
|
||||
- local: api/pipelines/stable_diffusion/depth2img
|
||||
@@ -400,10 +386,6 @@
|
||||
title: DPMSolverSDEScheduler
|
||||
- local: api/schedulers/singlestep_dpm_solver
|
||||
title: DPMSolverSinglestepScheduler
|
||||
- local: api/schedulers/edm_multistep_dpm_solver
|
||||
title: EDMDPMSolverMultistepScheduler
|
||||
- local: api/schedulers/edm_euler
|
||||
title: EDMEulerScheduler
|
||||
- local: api/schedulers/euler_ancestral
|
||||
title: EulerAncestralDiscreteScheduler
|
||||
- local: api/schedulers/euler
|
||||
@@ -430,8 +412,6 @@
|
||||
title: ScoreSdeVeScheduler
|
||||
- local: api/schedulers/score_sde_vp
|
||||
title: ScoreSdeVpScheduler
|
||||
- local: api/schedulers/tcd
|
||||
title: TCDScheduler
|
||||
- local: api/schedulers/unipc
|
||||
title: UniPCMultistepScheduler
|
||||
- local: api/schedulers/vq_diffusion
|
||||
|
||||
@@ -20,14 +20,14 @@ An attention processor is a class for applying different types of attention mech
|
||||
## AttnProcessor2_0
|
||||
[[autodoc]] models.attention_processor.AttnProcessor2_0
|
||||
|
||||
## AttnAddedKVProcessor
|
||||
[[autodoc]] models.attention_processor.AttnAddedKVProcessor
|
||||
## FusedAttnProcessor2_0
|
||||
[[autodoc]] models.attention_processor.FusedAttnProcessor2_0
|
||||
|
||||
## AttnAddedKVProcessor2_0
|
||||
[[autodoc]] models.attention_processor.AttnAddedKVProcessor2_0
|
||||
## LoRAAttnProcessor
|
||||
[[autodoc]] models.attention_processor.LoRAAttnProcessor
|
||||
|
||||
## CrossFrameAttnProcessor
|
||||
[[autodoc]] pipelines.text_to_video_synthesis.pipeline_text_to_video_zero.CrossFrameAttnProcessor
|
||||
## LoRAAttnProcessor2_0
|
||||
[[autodoc]] models.attention_processor.LoRAAttnProcessor2_0
|
||||
|
||||
## CustomDiffusionAttnProcessor
|
||||
[[autodoc]] models.attention_processor.CustomDiffusionAttnProcessor
|
||||
@@ -35,23 +35,26 @@ An attention processor is a class for applying different types of attention mech
|
||||
## CustomDiffusionAttnProcessor2_0
|
||||
[[autodoc]] models.attention_processor.CustomDiffusionAttnProcessor2_0
|
||||
|
||||
## CustomDiffusionXFormersAttnProcessor
|
||||
[[autodoc]] models.attention_processor.CustomDiffusionXFormersAttnProcessor
|
||||
## AttnAddedKVProcessor
|
||||
[[autodoc]] models.attention_processor.AttnAddedKVProcessor
|
||||
|
||||
## FusedAttnProcessor2_0
|
||||
[[autodoc]] models.attention_processor.FusedAttnProcessor2_0
|
||||
## AttnAddedKVProcessor2_0
|
||||
[[autodoc]] models.attention_processor.AttnAddedKVProcessor2_0
|
||||
|
||||
## LoRAAttnAddedKVProcessor
|
||||
[[autodoc]] models.attention_processor.LoRAAttnAddedKVProcessor
|
||||
|
||||
## XFormersAttnProcessor
|
||||
[[autodoc]] models.attention_processor.XFormersAttnProcessor
|
||||
|
||||
## LoRAXFormersAttnProcessor
|
||||
[[autodoc]] models.attention_processor.LoRAXFormersAttnProcessor
|
||||
|
||||
## CustomDiffusionXFormersAttnProcessor
|
||||
[[autodoc]] models.attention_processor.CustomDiffusionXFormersAttnProcessor
|
||||
|
||||
## SlicedAttnProcessor
|
||||
[[autodoc]] models.attention_processor.SlicedAttnProcessor
|
||||
|
||||
## SlicedAttnAddedKVProcessor
|
||||
[[autodoc]] models.attention_processor.SlicedAttnAddedKVProcessor
|
||||
|
||||
## XFormersAttnProcessor
|
||||
[[autodoc]] models.attention_processor.XFormersAttnProcessor
|
||||
|
||||
@@ -12,18 +12,14 @@ specific language governing permissions and limitations under the License.
|
||||
|
||||
# IP-Adapter
|
||||
|
||||
[IP-Adapter](https://hf.co/papers/2308.06721) is a lightweight adapter that enables prompting a diffusion model with an image. This method decouples the cross-attention layers of the image and text features. The image features are generated from an image encoder.
|
||||
[IP-Adapter](https://hf.co/papers/2308.06721) is a lightweight adapter that enables prompting a diffusion model with an image. This method decouples the cross-attention layers of the image and text features. The image features are generated from an image encoder. Files generated from IP-Adapter are only ~100MBs.
|
||||
|
||||
<Tip>
|
||||
|
||||
Learn how to load an IP-Adapter checkpoint and image in the IP-Adapter [loading](../../using-diffusers/loading_adapters#ip-adapter) guide, and you can see how to use it in the [usage](../../using-diffusers/ip_adapter) guide.
|
||||
Learn how to load an IP-Adapter checkpoint and image in the [IP-Adapter](../../using-diffusers/loading_adapters#ip-adapter) loading guide.
|
||||
|
||||
</Tip>
|
||||
|
||||
## IPAdapterMixin
|
||||
|
||||
[[autodoc]] loaders.ip_adapter.IPAdapterMixin
|
||||
|
||||
## IPAdapterMaskProcessor
|
||||
|
||||
[[autodoc]] image_processor.IPAdapterMaskProcessor
|
||||
@@ -1,18 +1,6 @@
|
||||
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Consistency Decoder
|
||||
|
||||
Consistency decoder can be used to decode the latents from the denoising UNet in the [`StableDiffusionPipeline`]. This decoder was introduced in the [DALL-E 3 technical report](https://openai.com/dall-e-3).
|
||||
Consistency decoder can be used to decode the latents from the denoising UNet in the [`StableDiffusionPipeline`]. This decoder was introduced in the [DALL-E 3 technical report](https://openai.com/dall-e-3).
|
||||
|
||||
The original codebase can be found at [openai/consistencydecoder](https://github.com/openai/consistencydecoder).
|
||||
|
||||
|
||||
@@ -408,114 +408,6 @@ Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers)
|
||||
|
||||
</Tip>
|
||||
|
||||
<table>
|
||||
<tr>
|
||||
<th align=center>Without FreeInit enabled</th>
|
||||
<th align=center>With FreeInit enabled</th>
|
||||
</tr>
|
||||
<tr>
|
||||
<td align=center>
|
||||
panda playing a guitar
|
||||
<br />
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-no-freeinit.gif"
|
||||
alt="panda playing a guitar"
|
||||
style="width: 300px;" />
|
||||
</td>
|
||||
<td align=center>
|
||||
panda playing a guitar
|
||||
<br/>
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-freeinit.gif"
|
||||
alt="panda playing a guitar"
|
||||
style="width: 300px;" />
|
||||
</td>
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
## Using AnimateLCM
|
||||
|
||||
[AnimateLCM](https://animatelcm.github.io/) is a motion module checkpoint and an [LCM LoRA](https://huggingface.co/docs/diffusers/using-diffusers/inference_with_lcm_lora) that have been created using a consistency learning strategy that decouples the distillation of the image generation priors and the motion generation priors.
|
||||
|
||||
```python
|
||||
import torch
|
||||
from diffusers import AnimateDiffPipeline, LCMScheduler, MotionAdapter
|
||||
from diffusers.utils import export_to_gif
|
||||
|
||||
adapter = MotionAdapter.from_pretrained("wangfuyun/AnimateLCM")
|
||||
pipe = AnimateDiffPipeline.from_pretrained("emilianJR/epiCRealism", motion_adapter=adapter)
|
||||
pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config, beta_schedule="linear")
|
||||
|
||||
pipe.load_lora_weights("wangfuyun/AnimateLCM", weight_name="sd15_lora_beta.safetensors", adapter_name="lcm-lora")
|
||||
|
||||
pipe.enable_vae_slicing()
|
||||
pipe.enable_model_cpu_offload()
|
||||
|
||||
output = pipe(
|
||||
prompt="A space rocket with trails of smoke behind it launching into space from the desert, 4k, high resolution",
|
||||
negative_prompt="bad quality, worse quality, low resolution",
|
||||
num_frames=16,
|
||||
guidance_scale=1.5,
|
||||
num_inference_steps=6,
|
||||
generator=torch.Generator("cpu").manual_seed(0),
|
||||
)
|
||||
frames = output.frames[0]
|
||||
export_to_gif(frames, "animatelcm.gif")
|
||||
```
|
||||
|
||||
<table>
|
||||
<tr>
|
||||
<td><center>
|
||||
A space rocket, 4K.
|
||||
<br>
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatelcm-output.gif"
|
||||
alt="A space rocket, 4K"
|
||||
style="width: 300px;" />
|
||||
</center></td>
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
AnimateLCM is also compatible with existing [Motion LoRAs](https://huggingface.co/collections/dn6/animatediff-motion-loras-654cb8ad732b9e3cf4d3c17e).
|
||||
|
||||
```python
|
||||
import torch
|
||||
from diffusers import AnimateDiffPipeline, LCMScheduler, MotionAdapter
|
||||
from diffusers.utils import export_to_gif
|
||||
|
||||
adapter = MotionAdapter.from_pretrained("wangfuyun/AnimateLCM")
|
||||
pipe = AnimateDiffPipeline.from_pretrained("emilianJR/epiCRealism", motion_adapter=adapter)
|
||||
pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config, beta_schedule="linear")
|
||||
|
||||
pipe.load_lora_weights("wangfuyun/AnimateLCM", weight_name="sd15_lora_beta.safetensors", adapter_name="lcm-lora")
|
||||
pipe.load_lora_weights("guoyww/animatediff-motion-lora-tilt-up", adapter_name="tilt-up")
|
||||
|
||||
pipe.set_adapters(["lcm-lora", "tilt-up"], [1.0, 0.8])
|
||||
pipe.enable_vae_slicing()
|
||||
pipe.enable_model_cpu_offload()
|
||||
|
||||
output = pipe(
|
||||
prompt="A space rocket with trails of smoke behind it launching into space from the desert, 4k, high resolution",
|
||||
negative_prompt="bad quality, worse quality, low resolution",
|
||||
num_frames=16,
|
||||
guidance_scale=1.5,
|
||||
num_inference_steps=6,
|
||||
generator=torch.Generator("cpu").manual_seed(0),
|
||||
)
|
||||
frames = output.frames[0]
|
||||
export_to_gif(frames, "animatelcm-motion-lora.gif")
|
||||
```
|
||||
|
||||
<table>
|
||||
<tr>
|
||||
<td><center>
|
||||
A space rocket, 4K.
|
||||
<br>
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatelcm-motion-lora.gif"
|
||||
alt="A space rocket, 4K"
|
||||
style="width: 300px;" />
|
||||
</center></td>
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
|
||||
## AnimateDiffPipeline
|
||||
|
||||
[[autodoc]] AnimateDiffPipeline
|
||||
|
||||
@@ -1,54 +0,0 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# LEDITS++
|
||||
|
||||
LEDITS++ was proposed in [LEDITS++: Limitless Image Editing using Text-to-Image Models](https://huggingface.co/papers/2311.16711) by Manuel Brack, Felix Friedrich, Katharina Kornmeier, Linoy Tsaban, Patrick Schramowski, Kristian Kersting, Apolinário Passos.
|
||||
|
||||
The abstract from the paper is:
|
||||
|
||||
*Text-to-image diffusion models have recently received increasing interest for their astonishing ability to produce high-fidelity images from solely text inputs. Subsequent research efforts aim to exploit and apply their capabilities to real image editing. However, existing image-to-image methods are often inefficient, imprecise, and of limited versatility. They either require time-consuming fine-tuning, deviate unnecessarily strongly from the input image, and/or lack support for multiple, simultaneous edits. To address these issues, we introduce LEDITS++, an efficient yet versatile and precise textual image manipulation technique. LEDITS++'s novel inversion approach requires no tuning nor optimization and produces high-fidelity results with a few diffusion steps. Second, our methodology supports multiple simultaneous edits and is architecture-agnostic. Third, we use a novel implicit masking technique that limits changes to relevant image regions. We propose the novel TEdBench++ benchmark as part of our exhaustive evaluation. Our results demonstrate the capabilities of LEDITS++ and its improvements over previous methods. The project page is available at https://leditsplusplus-project.static.hf.space .*
|
||||
|
||||
<Tip>
|
||||
|
||||
You can find additional information about LEDITS++ on the [project page](https://leditsplusplus-project.static.hf.space/index.html) and try it out in a [demo](https://huggingface.co/spaces/editing-images/leditsplusplus).
|
||||
|
||||
</Tip>
|
||||
|
||||
<Tip warning={true}>
|
||||
Due to some backward compatability issues with the current diffusers implementation of [`~schedulers.DPMSolverMultistepScheduler`] this implementation of LEdits++ can no longer guarantee perfect inversion.
|
||||
This issue is unlikely to have any noticeable effects on applied use-cases. However, we provide an alternative implementation that guarantees perfect inversion in a dedicated [GitHub repo](https://github.com/ml-research/ledits_pp).
|
||||
</Tip>
|
||||
|
||||
We provide two distinct pipelines based on different pre-trained models.
|
||||
|
||||
## LEditsPPPipelineStableDiffusion
|
||||
[[autodoc]] pipelines.ledits_pp.LEditsPPPipelineStableDiffusion
|
||||
- all
|
||||
- __call__
|
||||
- invert
|
||||
|
||||
## LEditsPPPipelineStableDiffusionXL
|
||||
[[autodoc]] pipelines.ledits_pp.LEditsPPPipelineStableDiffusionXL
|
||||
- all
|
||||
- __call__
|
||||
- invert
|
||||
|
||||
|
||||
|
||||
## LEditsPPDiffusionPipelineOutput
|
||||
[[autodoc]] pipelines.ledits_pp.pipeline_output.LEditsPPDiffusionPipelineOutput
|
||||
- all
|
||||
|
||||
## LEditsPPInversionPipelineOutput
|
||||
[[autodoc]] pipelines.ledits_pp.pipeline_output.LEditsPPInversionPipelineOutput
|
||||
- all
|
||||
@@ -57,7 +57,6 @@ The table below lists all the pipelines currently available in 🤗 Diffusers an
|
||||
| [Latent Consistency Models](latent_consistency_models) | text2image |
|
||||
| [Latent Diffusion](latent_diffusion) | text2image, super-resolution |
|
||||
| [LDM3D](stable_diffusion/ldm3d_diffusion) | text2image, text-to-3D, text-to-pano, upscaling |
|
||||
| [LEDITS++](ledits_pp) | image editing |
|
||||
| [MultiDiffusion](panorama) | text2image |
|
||||
| [MusicLDM](musicldm) | text2audio |
|
||||
| [Paint by Example](paint_by_example) | inpainting |
|
||||
|
||||
@@ -30,6 +30,6 @@ Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers)
|
||||
- all
|
||||
- __call__
|
||||
|
||||
## SemanticStableDiffusionPipelineOutput
|
||||
## StableDiffusionSafePipelineOutput
|
||||
[[autodoc]] pipelines.semantic_stable_diffusion.pipeline_output.SemanticStableDiffusionPipelineOutput
|
||||
- all
|
||||
|
||||
@@ -1,229 +0,0 @@
|
||||
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Stable Cascade
|
||||
|
||||
This model is built upon the [Würstchen](https://openreview.net/forum?id=gU58d5QeGv) architecture and its main
|
||||
difference to other models like Stable Diffusion is that it is working at a much smaller latent space. Why is this
|
||||
important? The smaller the latent space, the **faster** you can run inference and the **cheaper** the training becomes.
|
||||
How small is the latent space? Stable Diffusion uses a compression factor of 8, resulting in a 1024x1024 image being
|
||||
encoded to 128x128. Stable Cascade achieves a compression factor of 42, meaning that it is possible to encode a
|
||||
1024x1024 image to 24x24, while maintaining crisp reconstructions. The text-conditional model is then trained in the
|
||||
highly compressed latent space. Previous versions of this architecture, achieved a 16x cost reduction over Stable
|
||||
Diffusion 1.5.
|
||||
|
||||
Therefore, this kind of model is well suited for usages where efficiency is important. Furthermore, all known extensions
|
||||
like finetuning, LoRA, ControlNet, IP-Adapter, LCM etc. are possible with this method as well.
|
||||
|
||||
The original codebase can be found at [Stability-AI/StableCascade](https://github.com/Stability-AI/StableCascade).
|
||||
|
||||
## Model Overview
|
||||
Stable Cascade consists of three models: Stage A, Stage B and Stage C, representing a cascade to generate images,
|
||||
hence the name "Stable Cascade".
|
||||
|
||||
Stage A & B are used to compress images, similar to what the job of the VAE is in Stable Diffusion.
|
||||
However, with this setup, a much higher compression of images can be achieved. While the Stable Diffusion models use a
|
||||
spatial compression factor of 8, encoding an image with resolution of 1024 x 1024 to 128 x 128, Stable Cascade achieves
|
||||
a compression factor of 42. This encodes a 1024 x 1024 image to 24 x 24, while being able to accurately decode the
|
||||
image. This comes with the great benefit of cheaper training and inference. Furthermore, Stage C is responsible
|
||||
for generating the small 24 x 24 latents given a text prompt.
|
||||
|
||||
The Stage C model operates on the small 24 x 24 latents and denoises the latents conditioned on text prompts. The model is also the largest component in the Cascade pipeline and is meant to be used with the `StableCascadePriorPipeline`
|
||||
|
||||
The Stage B and Stage A models are used with the `StableCascadeDecoderPipeline` and are responsible for generating the final image given the small 24 x 24 latents.
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
There are some restrictions on data types that can be used with the Stable Cascade models. The official checkpoints for the `StableCascadePriorPipeline` do not support the `torch.float16` data type. Please use `torch.bfloat16` instead.
|
||||
|
||||
In order to use the `torch.bfloat16` data type with the `StableCascadeDecoderPipeline` you need to have PyTorch 2.2.0 or higher installed. This also means that using the `StableCascadeCombinedPipeline` with `torch.bfloat16` requires PyTorch 2.2.0 or higher, since it calls the `StableCascadeDecoderPipeline` internally.
|
||||
|
||||
If it is not possible to install PyTorch 2.2.0 or higher in your environment, the `StableCascadeDecoderPipeline` can be used on its own with the `torch.float16` data type. You can download the full precision or `bf16` variant weights for the pipeline and cast the weights to `torch.float16`.
|
||||
|
||||
</Tip>
|
||||
|
||||
## Usage example
|
||||
|
||||
```python
|
||||
import torch
|
||||
from diffusers import StableCascadeDecoderPipeline, StableCascadePriorPipeline
|
||||
|
||||
prompt = "an image of a shiba inu, donning a spacesuit and helmet"
|
||||
negative_prompt = ""
|
||||
|
||||
prior = StableCascadePriorPipeline.from_pretrained("stabilityai/stable-cascade-prior", variant="bf16", torch_dtype=torch.bfloat16)
|
||||
decoder = StableCascadeDecoderPipeline.from_pretrained("stabilityai/stable-cascade", variant="bf16", torch_dtype=torch.float16)
|
||||
|
||||
prior.enable_model_cpu_offload()
|
||||
prior_output = prior(
|
||||
prompt=prompt,
|
||||
height=1024,
|
||||
width=1024,
|
||||
negative_prompt=negative_prompt,
|
||||
guidance_scale=4.0,
|
||||
num_images_per_prompt=1,
|
||||
num_inference_steps=20
|
||||
)
|
||||
|
||||
decoder.enable_model_cpu_offload()
|
||||
decoder_output = decoder(
|
||||
image_embeddings=prior_output.image_embeddings.to(torch.float16),
|
||||
prompt=prompt,
|
||||
negative_prompt=negative_prompt,
|
||||
guidance_scale=0.0,
|
||||
output_type="pil",
|
||||
num_inference_steps=10
|
||||
).images[0]
|
||||
decoder_output.save("cascade.png")
|
||||
```
|
||||
|
||||
## Using the Lite Versions of the Stage B and Stage C models
|
||||
|
||||
```python
|
||||
import torch
|
||||
from diffusers import (
|
||||
StableCascadeDecoderPipeline,
|
||||
StableCascadePriorPipeline,
|
||||
StableCascadeUNet,
|
||||
)
|
||||
|
||||
prompt = "an image of a shiba inu, donning a spacesuit and helmet"
|
||||
negative_prompt = ""
|
||||
|
||||
prior_unet = StableCascadeUNet.from_pretrained("stabilityai/stable-cascade-prior", subfolder="prior_lite")
|
||||
decoder_unet = StableCascadeUNet.from_pretrained("stabilityai/stable-cascade", subfolder="decoder_lite")
|
||||
|
||||
prior = StableCascadePriorPipeline.from_pretrained("stabilityai/stable-cascade-prior", prior=prior_unet)
|
||||
decoder = StableCascadeDecoderPipeline.from_pretrained("stabilityai/stable-cascade", decoder=decoder_unet)
|
||||
|
||||
prior.enable_model_cpu_offload()
|
||||
prior_output = prior(
|
||||
prompt=prompt,
|
||||
height=1024,
|
||||
width=1024,
|
||||
negative_prompt=negative_prompt,
|
||||
guidance_scale=4.0,
|
||||
num_images_per_prompt=1,
|
||||
num_inference_steps=20
|
||||
)
|
||||
|
||||
decoder.enable_model_cpu_offload()
|
||||
decoder_output = decoder(
|
||||
image_embeddings=prior_output.image_embeddings,
|
||||
prompt=prompt,
|
||||
negative_prompt=negative_prompt,
|
||||
guidance_scale=0.0,
|
||||
output_type="pil",
|
||||
num_inference_steps=10
|
||||
).images[0]
|
||||
decoder_output.save("cascade.png")
|
||||
```
|
||||
|
||||
## Loading original checkpoints with `from_single_file`
|
||||
|
||||
Loading the original format checkpoints is supported via `from_single_file` method in the StableCascadeUNet.
|
||||
|
||||
```python
|
||||
import torch
|
||||
from diffusers import (
|
||||
StableCascadeDecoderPipeline,
|
||||
StableCascadePriorPipeline,
|
||||
StableCascadeUNet,
|
||||
)
|
||||
|
||||
prompt = "an image of a shiba inu, donning a spacesuit and helmet"
|
||||
negative_prompt = ""
|
||||
|
||||
prior_unet = StableCascadeUNet.from_single_file(
|
||||
"https://huggingface.co/stabilityai/stable-cascade/resolve/main/stage_c_bf16.safetensors",
|
||||
torch_dtype=torch.bfloat16
|
||||
)
|
||||
decoder_unet = StableCascadeUNet.from_single_file(
|
||||
"https://huggingface.co/stabilityai/stable-cascade/blob/main/stage_b_bf16.safetensors",
|
||||
torch_dtype=torch.bfloat16
|
||||
)
|
||||
|
||||
prior = StableCascadePriorPipeline.from_pretrained("stabilityai/stable-cascade-prior", prior=prior_unet, torch_dtype=torch.bfloat16)
|
||||
decoder = StableCascadeDecoderPipeline.from_pretrained("stabilityai/stable-cascade", decoder=decoder_unet, torch_dtype=torch.bfloat16)
|
||||
|
||||
prior.enable_model_cpu_offload()
|
||||
prior_output = prior(
|
||||
prompt=prompt,
|
||||
height=1024,
|
||||
width=1024,
|
||||
negative_prompt=negative_prompt,
|
||||
guidance_scale=4.0,
|
||||
num_images_per_prompt=1,
|
||||
num_inference_steps=20
|
||||
)
|
||||
|
||||
decoder.enable_model_cpu_offload()
|
||||
decoder_output = decoder(
|
||||
image_embeddings=prior_output.image_embeddings,
|
||||
prompt=prompt,
|
||||
negative_prompt=negative_prompt,
|
||||
guidance_scale=0.0,
|
||||
output_type="pil",
|
||||
num_inference_steps=10
|
||||
).images[0]
|
||||
decoder_output.save("cascade-single-file.png")
|
||||
```
|
||||
|
||||
## Uses
|
||||
|
||||
### Direct Use
|
||||
|
||||
The model is intended for research purposes for now. Possible research areas and tasks include
|
||||
|
||||
- Research on generative models.
|
||||
- Safe deployment of models which have the potential to generate harmful content.
|
||||
- Probing and understanding the limitations and biases of generative models.
|
||||
- Generation of artworks and use in design and other artistic processes.
|
||||
- Applications in educational or creative tools.
|
||||
|
||||
Excluded uses are described below.
|
||||
|
||||
### Out-of-Scope Use
|
||||
|
||||
The model was not trained to be factual or true representations of people or events,
|
||||
and therefore using the model to generate such content is out-of-scope for the abilities of this model.
|
||||
The model should not be used in any way that violates Stability AI's [Acceptable Use Policy](https://stability.ai/use-policy).
|
||||
|
||||
## Limitations and Bias
|
||||
|
||||
### Limitations
|
||||
- Faces and people in general may not be generated properly.
|
||||
- The autoencoding part of the model is lossy.
|
||||
|
||||
|
||||
## StableCascadeCombinedPipeline
|
||||
|
||||
[[autodoc]] StableCascadeCombinedPipeline
|
||||
- all
|
||||
- __call__
|
||||
|
||||
## StableCascadePriorPipeline
|
||||
|
||||
[[autodoc]] StableCascadePriorPipeline
|
||||
- all
|
||||
- __call__
|
||||
|
||||
## StableCascadePriorPipelineOutput
|
||||
|
||||
[[autodoc]] pipelines.stable_cascade.pipeline_stable_cascade_prior.StableCascadePriorPipelineOutput
|
||||
|
||||
## StableCascadeDecoderPipeline
|
||||
|
||||
[[autodoc]] StableCascadeDecoderPipeline
|
||||
- all
|
||||
- __call__
|
||||
|
||||
@@ -172,41 +172,3 @@ inpaint = StableDiffusionInpaintPipeline(**text2img.components)
|
||||
|
||||
# now you can use text2img(...), img2img(...), inpaint(...) just like the call methods of each respective pipeline
|
||||
```
|
||||
|
||||
### Create web demos using `gradio`
|
||||
|
||||
The Stable Diffusion pipelines are automatically supported in [Gradio](https://github.com/gradio-app/gradio/), a library that makes creating beautiful and user-friendly machine learning apps on the web a breeze. First, make sure you have Gradio installed:
|
||||
|
||||
```
|
||||
pip install -U gradio
|
||||
```
|
||||
|
||||
Then, create a web demo around any Stable Diffusion-based pipeline. For example, you can create an image generation pipeline in a single line of code with Gradio's [`Interface.from_pipeline`](https://www.gradio.app/docs/interface#interface-from-pipeline) function:
|
||||
|
||||
```py
|
||||
from diffusers import StableDiffusionPipeline
|
||||
import gradio as gr
|
||||
|
||||
pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
|
||||
|
||||
gr.Interface.from_pipeline(pipe).launch()
|
||||
```
|
||||
|
||||
which opens an intuitive drag-and-drop interface in your browser:
|
||||
|
||||

|
||||
|
||||
Similarly, you could create a demo for an image-to-image pipeline with:
|
||||
|
||||
```py
|
||||
from diffusers import StableDiffusionImg2ImgPipeline
|
||||
import gradio as gr
|
||||
|
||||
|
||||
pipe = StableDiffusionImg2ImgPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
|
||||
|
||||
gr.Interface.from_pipeline(pipe).launch()
|
||||
```
|
||||
|
||||
By default, the web demo runs on a local server. If you'd like to share it with others, you can generate a temporary public
|
||||
link by setting `share=True` in `launch()`. Or, you can host your demo on [Hugging Face Spaces](https://huggingface.co/spaces)https://huggingface.co/spaces for a permanent link.
|
||||
@@ -21,7 +21,7 @@ The abstract from the paper is:
|
||||
## Tips
|
||||
|
||||
- SDXL Turbo uses the exact same architecture as [SDXL](./stable_diffusion_xl), which means it also has the same API. Please refer to the [SDXL](./stable_diffusion_xl) API reference for more details.
|
||||
- SDXL Turbo should disable guidance scale by setting `guidance_scale=0.0`.
|
||||
- SDXL Turbo should disable guidance scale by setting `guidance_scale=0.0`
|
||||
- SDXL Turbo should use `timestep_spacing='trailing'` for the scheduler and use between 1 and 4 steps.
|
||||
- SDXL Turbo has been trained to generate images of size 512x512.
|
||||
- SDXL Turbo is open-access, but not open-source meaning that one might have to buy a model license in order to use it for commercial applications. Make sure to read the [official model card](https://huggingface.co/stabilityai/sdxl-turbo) to learn more.
|
||||
|
||||
@@ -1,43 +0,0 @@
|
||||
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Stable Video Diffusion
|
||||
|
||||
Stable Video Diffusion was proposed in [Stable Video Diffusion: Scaling Latent Video Diffusion Models to Large Datasets](https://hf.co/papers/2311.15127) by Andreas Blattmann, Tim Dockhorn, Sumith Kulal, Daniel Mendelevitch, Maciej Kilian, Dominik Lorenz, Yam Levi, Zion English, Vikram Voleti, Adam Letts, Varun Jampani, Robin Rombach.
|
||||
|
||||
The abstract from the paper is:
|
||||
|
||||
*We present Stable Video Diffusion - a latent video diffusion model for high-resolution, state-of-the-art text-to-video and image-to-video generation. Recently, latent diffusion models trained for 2D image synthesis have been turned into generative video models by inserting temporal layers and finetuning them on small, high-quality video datasets. However, training methods in the literature vary widely, and the field has yet to agree on a unified strategy for curating video data. In this paper, we identify and evaluate three different stages for successful training of video LDMs: text-to-image pretraining, video pretraining, and high-quality video finetuning. Furthermore, we demonstrate the necessity of a well-curated pretraining dataset for generating high-quality videos and present a systematic curation process to train a strong base model, including captioning and filtering strategies. We then explore the impact of finetuning our base model on high-quality data and train a text-to-video model that is competitive with closed-source video generation. We also show that our base model provides a powerful motion representation for downstream tasks such as image-to-video generation and adaptability to camera motion-specific LoRA modules. Finally, we demonstrate that our model provides a strong multi-view 3D-prior and can serve as a base to finetune a multi-view diffusion model that jointly generates multiple views of objects in a feedforward fashion, outperforming image-based methods at a fraction of their compute budget. We release code and model weights at this https URL.*
|
||||
|
||||
<Tip>
|
||||
|
||||
To learn how to use Stable Video Diffusion, take a look at the [Stable Video Diffusion](../../../using-diffusers/svd) guide.
|
||||
|
||||
<br>
|
||||
|
||||
Check out the [Stability AI](https://huggingface.co/stabilityai) Hub organization for the [base](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid) and [extended frame](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid-xt) checkpoints!
|
||||
|
||||
</Tip>
|
||||
|
||||
## Tips
|
||||
|
||||
Video generation is memory-intensive and one way to reduce your memory usage is to set `enable_forward_chunking` on the pipeline's UNet so you don't run the entire feedforward layer at once. Breaking it up into chunks in a loop is more efficient.
|
||||
|
||||
Check out the [Text or image-to-video](text-img2vid) guide for more details about how certain parameters can affect video generation and how to optimize inference by reducing memory usage.
|
||||
|
||||
## StableVideoDiffusionPipeline
|
||||
|
||||
[[autodoc]] StableVideoDiffusionPipeline
|
||||
|
||||
## StableVideoDiffusionPipelineOutput
|
||||
|
||||
[[autodoc]] pipelines.stable_video_diffusion.StableVideoDiffusionPipelineOutput
|
||||
@@ -167,12 +167,6 @@ Here are some sample outputs:
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
## Tips
|
||||
|
||||
Video generation is memory-intensive and one way to reduce your memory usage is to set `enable_forward_chunking` on the pipeline's UNet so you don't run the entire feedforward layer at once. Breaking it up into chunks in a loop is more efficient.
|
||||
|
||||
Check out the [Text or image-to-video](text-img2vid) guide for more details about how certain parameters can affect video generation and how to optimize inference by reducing memory usage.
|
||||
|
||||
<Tip>
|
||||
|
||||
Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-components-across-pipelines) section to learn how to efficiently load the same components into multiple pipelines.
|
||||
|
||||
@@ -1,21 +1,9 @@
|
||||
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# ConsistencyDecoderScheduler
|
||||
|
||||
This scheduler is a part of the [`ConsistencyDecoderPipeline`] and was introduced in [DALL-E 3](https://openai.com/dall-e-3).
|
||||
This scheduler is a part of the [`ConsistencyDecoderPipeline`] and was introduced in [DALL-E 3](https://openai.com/dall-e-3).
|
||||
|
||||
The original codebase can be found at [openai/consistency_models](https://github.com/openai/consistency_models).
|
||||
|
||||
|
||||
## ConsistencyDecoderScheduler
|
||||
[[autodoc]] schedulers.scheduling_consistency_decoder.ConsistencyDecoderScheduler
|
||||
[[autodoc]] schedulers.scheduling_consistency_decoder.ConsistencyDecoderScheduler
|
||||
@@ -1,22 +0,0 @@
|
||||
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# EDMEulerScheduler
|
||||
|
||||
The Karras formulation of the Euler scheduler (Algorithm 2) from the [Elucidating the Design Space of Diffusion-Based Generative Models](https://huggingface.co/papers/2206.00364) paper by Karras et al. This is a fast scheduler which can often generate good outputs in 20-30 steps. The scheduler is based on the original [k-diffusion](https://github.com/crowsonkb/k-diffusion/blob/481677d114f6ea445aa009cf5bd7a9cdee909e47/k_diffusion/sampling.py#L51) implementation by [Katherine Crowson](https://github.com/crowsonkb/).
|
||||
|
||||
|
||||
## EDMEulerScheduler
|
||||
[[autodoc]] EDMEulerScheduler
|
||||
|
||||
## EDMEulerSchedulerOutput
|
||||
[[autodoc]] schedulers.scheduling_edm_euler.EDMEulerSchedulerOutput
|
||||
@@ -1,24 +0,0 @@
|
||||
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# EDMDPMSolverMultistepScheduler
|
||||
|
||||
`EDMDPMSolverMultistepScheduler` is a [Karras formulation](https://huggingface.co/papers/2206.00364) of `DPMSolverMultistep`, a multistep scheduler from [DPM-Solver: A Fast ODE Solver for Diffusion Probabilistic Model Sampling in Around 10 Steps](https://huggingface.co/papers/2206.00927) and [DPM-Solver++: Fast Solver for Guided Sampling of Diffusion Probabilistic Models](https://huggingface.co/papers/2211.01095) by Cheng Lu, Yuhao Zhou, Fan Bao, Jianfei Chen, Chongxuan Li, and Jun Zhu.
|
||||
|
||||
DPMSolver (and the improved version DPMSolver++) is a fast dedicated high-order solver for diffusion ODEs with convergence order guarantee. Empirically, DPMSolver sampling with only 20 steps can generate high-quality
|
||||
samples, and it can generate quite good samples even in 10 steps.
|
||||
|
||||
## EDMDPMSolverMultistepScheduler
|
||||
[[autodoc]] EDMDPMSolverMultistepScheduler
|
||||
|
||||
## SchedulerOutput
|
||||
[[autodoc]] schedulers.scheduling_utils.SchedulerOutput
|
||||
@@ -1,29 +0,0 @@
|
||||
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# TCDScheduler
|
||||
|
||||
[Trajectory Consistency Distillation](https://huggingface.co/papers/2402.19159) by Jianbin Zheng, Minghui Hu, Zhongyi Fan, Chaoyue Wang, Changxing Ding, Dacheng Tao and Tat-Jen Cham introduced a Strategic Stochastic Sampling (Algorithm 4) that is capable of generating good samples in a small number of steps. Distinguishing it as an advanced iteration of the multistep scheduler (Algorithm 1) in the [Consistency Models](https://huggingface.co/papers/2303.01469), Strategic Stochastic Sampling specifically tailored for the trajectory consistency function.
|
||||
|
||||
The abstract from the paper is:
|
||||
|
||||
*Latent Consistency Model (LCM) extends the Consistency Model to the latent space and leverages the guided consistency distillation technique to achieve impressive performance in accelerating text-to-image synthesis. However, we observed that LCM struggles to generate images with both clarity and detailed intricacy. To address this limitation, we initially delve into and elucidate the underlying causes. Our investigation identifies that the primary issue stems from errors in three distinct areas. Consequently, we introduce Trajectory Consistency Distillation (TCD), which encompasses trajectory consistency function and strategic stochastic sampling. The trajectory consistency function diminishes the distillation errors by broadening the scope of the self-consistency boundary condition and endowing the TCD with the ability to accurately trace the entire trajectory of the Probability Flow ODE. Additionally, strategic stochastic sampling is specifically designed to circumvent the accumulated errors inherent in multi-step consistency sampling, which is meticulously tailored to complement the TCD model. Experiments demonstrate that TCD not only significantly enhances image quality at low NFEs but also yields more detailed results compared to the teacher model at high NFEs.*
|
||||
|
||||
The original codebase can be found at [jabir-zheng/TCD](https://github.com/jabir-zheng/TCD).
|
||||
|
||||
## TCDScheduler
|
||||
[[autodoc]] TCDScheduler
|
||||
|
||||
|
||||
## TCDSchedulerOutput
|
||||
[[autodoc]] schedulers.scheduling_tcd.TCDSchedulerOutput
|
||||
|
||||
@@ -66,9 +66,3 @@ image = pipe(prompt).images[0]
|
||||
Don't use [`torch.autocast`](https://pytorch.org/docs/stable/amp.html#torch.autocast) in any of the pipelines as it can lead to black images and is always slower than pure float16 precision.
|
||||
|
||||
</Tip>
|
||||
|
||||
## Distilled model
|
||||
|
||||
You could also use a distilled Stable Diffusion model and autoencoder to speed up inference. During distillation, many of the UNet's residual and attention blocks are shed to reduce the model size. The distilled model is faster and uses less memory while generating images of comparable quality to the full Stable Diffusion model.
|
||||
|
||||
Learn more about in the [Distilled Stable Diffusion inference](../using-diffusers/distilled_sd) guide!
|
||||
|
||||
@@ -75,9 +75,6 @@ Compilation requires some time to complete, so it is best suited for situations
|
||||
|
||||
For more information and different options about `torch.compile`, refer to the [`torch_compile`](https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html) tutorial.
|
||||
|
||||
> [!TIP]
|
||||
> Learn more about other ways PyTorch 2.0 can help optimize your model in the [Accelerate inference of text-to-image diffusion models](../tutorials/fast_diffusion) tutorial.
|
||||
|
||||
## Benchmark
|
||||
|
||||
We conducted a comprehensive benchmark with PyTorch 2.0's efficient attention implementation and `torch.compile` across different GPUs and batch sizes for five of our most used pipelines. The code is benchmarked on 🤗 Diffusers v0.17.0.dev0 to optimize `torch.compile` usage (see [here](https://github.com/huggingface/diffusers/pull/3313) for more details).
|
||||
|
||||
@@ -77,7 +77,7 @@ accelerate config default
|
||||
|
||||
Or if your environment doesn't support an interactive shell, like a notebook, you can use:
|
||||
|
||||
```py
|
||||
```bash
|
||||
from accelerate.utils import write_basic_config
|
||||
|
||||
write_basic_config()
|
||||
@@ -113,50 +113,36 @@ The dataset preprocessing code and training loop are found in the [`main()`](htt
|
||||
|
||||
As with the script parameters, a walkthrough of the training script is provided in the [Text-to-image](text2image#training-script) training guide. Instead, this guide takes a look at the LoRA relevant parts of the script.
|
||||
|
||||
<hfoptions id="lora">
|
||||
<hfoption id="UNet">
|
||||
|
||||
Diffusers uses [`~peft.LoraConfig`] from the [PEFT](https://hf.co/docs/peft) library to set up the parameters of the LoRA adapter such as the rank, alpha, and which modules to insert the LoRA weights into. The adapter is added to the UNet, and only the LoRA layers are filtered for optimization in `lora_layers`.
|
||||
The script begins by adding the [new LoRA weights](https://github.com/huggingface/diffusers/blob/dd9a5caf61f04d11c0fa9f3947b69ab0010c9a0f/examples/text_to_image/train_text_to_image_lora.py#L447) to the attention layers. This involves correctly configuring the weight size for each block in the UNet. You'll see the `rank` parameter is used to create the [`~models.attention_processor.LoRAAttnProcessor`]:
|
||||
|
||||
```py
|
||||
unet_lora_config = LoraConfig(
|
||||
r=args.rank,
|
||||
lora_alpha=args.rank,
|
||||
init_lora_weights="gaussian",
|
||||
target_modules=["to_k", "to_q", "to_v", "to_out.0"],
|
||||
)
|
||||
lora_attn_procs = {}
|
||||
for name in unet.attn_processors.keys():
|
||||
cross_attention_dim = None if name.endswith("attn1.processor") else unet.config.cross_attention_dim
|
||||
if name.startswith("mid_block"):
|
||||
hidden_size = unet.config.block_out_channels[-1]
|
||||
elif name.startswith("up_blocks"):
|
||||
block_id = int(name[len("up_blocks.")])
|
||||
hidden_size = list(reversed(unet.config.block_out_channels))[block_id]
|
||||
elif name.startswith("down_blocks"):
|
||||
block_id = int(name[len("down_blocks.")])
|
||||
hidden_size = unet.config.block_out_channels[block_id]
|
||||
|
||||
unet.add_adapter(unet_lora_config)
|
||||
lora_layers = filter(lambda p: p.requires_grad, unet.parameters())
|
||||
lora_attn_procs[name] = LoRAAttnProcessor(
|
||||
hidden_size=hidden_size,
|
||||
cross_attention_dim=cross_attention_dim,
|
||||
rank=args.rank,
|
||||
)
|
||||
|
||||
unet.set_attn_processor(lora_attn_procs)
|
||||
lora_layers = AttnProcsLayers(unet.attn_processors)
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="text encoder">
|
||||
|
||||
Diffusers also supports finetuning the text encoder with LoRA from the [PEFT](https://hf.co/docs/peft) library when necessary such as finetuning Stable Diffusion XL (SDXL). The [`~peft.LoraConfig`] is used to configure the parameters of the LoRA adapter which are then added to the text encoder, and only the LoRA layers are filtered for training.
|
||||
|
||||
```py
|
||||
text_lora_config = LoraConfig(
|
||||
r=args.rank,
|
||||
lora_alpha=args.rank,
|
||||
init_lora_weights="gaussian",
|
||||
target_modules=["q_proj", "k_proj", "v_proj", "out_proj"],
|
||||
)
|
||||
|
||||
text_encoder_one.add_adapter(text_lora_config)
|
||||
text_encoder_two.add_adapter(text_lora_config)
|
||||
text_lora_parameters_one = list(filter(lambda p: p.requires_grad, text_encoder_one.parameters()))
|
||||
text_lora_parameters_two = list(filter(lambda p: p.requires_grad, text_encoder_two.parameters()))
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
The [optimizer](https://github.com/huggingface/diffusers/blob/e4b8f173b97731686e290b2eb98e7f5df2b1b322/examples/text_to_image/train_text_to_image_lora.py#L529) is initialized with the `lora_layers` because these are the only weights that'll be optimized:
|
||||
The [optimizer](https://github.com/huggingface/diffusers/blob/dd9a5caf61f04d11c0fa9f3947b69ab0010c9a0f/examples/text_to_image/train_text_to_image_lora.py#L519) is initialized with the `lora_layers` because these are the only weights that'll be optimized:
|
||||
|
||||
```py
|
||||
optimizer = optimizer_cls(
|
||||
lora_layers,
|
||||
lora_layers.parameters(),
|
||||
lr=args.learning_rate,
|
||||
betas=(args.adam_beta1, args.adam_beta2),
|
||||
weight_decay=args.adam_weight_decay,
|
||||
@@ -170,7 +156,7 @@ Aside from setting up the LoRA layers, the training script is more or less the s
|
||||
|
||||
Once you've made all your changes or you're okay with the default configuration, you're ready to launch the training script! 🚀
|
||||
|
||||
Let's train on the [Pokémon BLIP captions](https://huggingface.co/datasets/lambdalabs/pokemon-blip-captions) dataset to generate our own Pokémon. Set the environment variables `MODEL_NAME` and `DATASET_NAME` to the model and dataset respectively. You should also specify where to save the model in `OUTPUT_DIR`, and the name of the model to save to on the Hub with `HUB_MODEL_ID`. The script creates and saves the following files to your repository:
|
||||
Let's train on the [Pokémon BLIP captions](https://huggingface.co/datasets/lambdalabs/pokemon-blip-captions) dataset to generate our yown Pokémon. Set the environment variables `MODEL_NAME` and `DATASET_NAME` to the model and dataset respectively. You should also specify where to save the model in `OUTPUT_DIR`, and the name of the model to save to on the Hub with `HUB_MODEL_ID`. The script creates and saves the following files to your repository:
|
||||
|
||||
- saved model checkpoints
|
||||
- `pytorch_lora_weights.safetensors` (the trained LoRA weights)
|
||||
|
||||
@@ -14,17 +14,19 @@ specific language governing permissions and limitations under the License.
|
||||
|
||||
# Load LoRAs for inference
|
||||
|
||||
There are many adapter types (with [LoRAs](https://huggingface.co/docs/peft/conceptual_guides/adapter#low-rank-adaptation-lora) being the most popular) trained in different styles to achieve different effects. You can even combine multiple adapters to create new and unique images.
|
||||
There are many adapters (with LoRAs being the most common type) trained in different styles to achieve different effects. You can even combine multiple adapters to create new and unique images. With the 🤗 [PEFT](https://huggingface.co/docs/peft/index) integration in 🤗 Diffusers, it is really easy to load and manage adapters for inference. In this guide, you'll learn how to use different adapters with [Stable Diffusion XL (SDXL)](../api/pipelines/stable_diffusion/stable_diffusion_xl) for inference.
|
||||
|
||||
In this tutorial, you'll learn how to easily load and manage adapters for inference with the 🤗 [PEFT](https://huggingface.co/docs/peft/index) integration in 🤗 Diffusers. You'll use LoRA as the main adapter technique, so you'll see the terms LoRA and adapter used interchangeably.
|
||||
Throughout this guide, you'll use LoRA as the main adapter technique, so we'll use the terms LoRA and adapter interchangeably. You should have some familiarity with LoRA, and if you don't, we welcome you to check out the [LoRA guide](https://huggingface.co/docs/peft/conceptual_guides/lora).
|
||||
|
||||
Let's first install all the required libraries.
|
||||
|
||||
```bash
|
||||
!pip install -q transformers accelerate peft diffusers
|
||||
!pip install -q transformers accelerate
|
||||
!pip install peft
|
||||
!pip install diffusers
|
||||
```
|
||||
|
||||
Now, load a pipeline with a [Stable Diffusion XL (SDXL)](../api/pipelines/stable_diffusion/stable_diffusion_xl) checkpoint:
|
||||
Now, let's load a pipeline with a SDXL checkpoint:
|
||||
|
||||
```python
|
||||
from diffusers import DiffusionPipeline
|
||||
@@ -34,18 +36,21 @@ pipe_id = "stabilityai/stable-diffusion-xl-base-1.0"
|
||||
pipe = DiffusionPipeline.from_pretrained(pipe_id, torch_dtype=torch.float16).to("cuda")
|
||||
```
|
||||
|
||||
Next, load a [CiroN2022/toy-face](https://huggingface.co/CiroN2022/toy-face) adapter with the [`~diffusers.loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] method. With the 🤗 PEFT integration, you can assign a specific `adapter_name` to the checkpoint, which let's you easily switch between different LoRA checkpoints. Let's call this adapter `"toy"`.
|
||||
|
||||
Next, load a LoRA checkpoint with the [`~diffusers.loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] method.
|
||||
|
||||
With the 🤗 PEFT integration, you can assign a specific `adapter_name` to the checkpoint, which let's you easily switch between different LoRA checkpoints. Let's call this adapter `"toy"`.
|
||||
|
||||
```python
|
||||
pipe.load_lora_weights("CiroN2022/toy-face", weight_name="toy_face_sdxl.safetensors", adapter_name="toy")
|
||||
```
|
||||
|
||||
Make sure to include the token `toy_face` in the prompt and then you can perform inference:
|
||||
And then perform inference:
|
||||
|
||||
```python
|
||||
prompt = "toy_face of a hacker with a hoodie"
|
||||
|
||||
lora_scale = 0.9
|
||||
lora_scale= 0.9
|
||||
image = pipe(
|
||||
prompt, num_inference_steps=30, cross_attention_kwargs={"scale": lora_scale}, generator=torch.manual_seed(0)
|
||||
).images[0]
|
||||
@@ -54,16 +59,17 @@ image
|
||||
|
||||

|
||||
|
||||
With the `adapter_name` parameter, it is really easy to use another adapter for inference! Load the [nerijs/pixel-art-xl](https://huggingface.co/nerijs/pixel-art-xl) adapter that has been fine-tuned to generate pixel art images and call it `"pixel"`.
|
||||
|
||||
The pipeline automatically sets the first loaded adapter (`"toy"`) as the active adapter, but you can activate the `"pixel"` adapter with the [`~diffusers.loaders.UNet2DConditionLoadersMixin.set_adapters`] method:
|
||||
With the `adapter_name` parameter, it is really easy to use another adapter for inference! Load the [nerijs/pixel-art-xl](https://huggingface.co/nerijs/pixel-art-xl) adapter that has been fine-tuned to generate pixel art images, and let's call it `"pixel"`.
|
||||
|
||||
The pipeline automatically sets the first loaded adapter (`"toy"`) as the active adapter. But you can activate the `"pixel"` adapter with the [`~diffusers.loaders.UNet2DConditionLoadersMixin.set_adapters`] method as shown below:
|
||||
|
||||
```python
|
||||
pipe.load_lora_weights("nerijs/pixel-art-xl", weight_name="pixel-art-xl.safetensors", adapter_name="pixel")
|
||||
pipe.set_adapters("pixel")
|
||||
```
|
||||
|
||||
Make sure you include the token `pixel art` in your prompt to generate a pixel art image:
|
||||
Let's now generate an image with the second adapter and check the result:
|
||||
|
||||
```python
|
||||
prompt = "a hacker with a hoodie, pixel art"
|
||||
@@ -75,25 +81,29 @@ image
|
||||
|
||||

|
||||
|
||||
## Merge adapters
|
||||
## Combine multiple adapters
|
||||
|
||||
You can also merge different adapter checkpoints for inference to blend their styles together.
|
||||
You can also perform multi-adapter inference where you combine different adapter checkpoints for inference.
|
||||
|
||||
Once again, use the [`~diffusers.loaders.UNet2DConditionLoadersMixin.set_adapters`] method to activate the `pixel` and `toy` adapters and specify the weights for how they should be merged.
|
||||
Once again, use the [`~diffusers.loaders.UNet2DConditionLoadersMixin.set_adapters`] method to activate two LoRA checkpoints and specify the weight for how the checkpoints should be combined.
|
||||
|
||||
```python
|
||||
pipe.set_adapters(["pixel", "toy"], adapter_weights=[0.5, 1.0])
|
||||
```
|
||||
|
||||
Now that we have set these two adapters, let's generate an image from the combined adapters!
|
||||
|
||||
<Tip>
|
||||
|
||||
LoRA checkpoints in the diffusion community are almost always obtained with [DreamBooth](https://huggingface.co/docs/diffusers/main/en/training/dreambooth). DreamBooth training often relies on "trigger" words in the input text prompts in order for the generation results to look as expected. When you combine multiple LoRA checkpoints, it's important to ensure the trigger words for the corresponding LoRA checkpoints are present in the input text prompts.
|
||||
|
||||
</Tip>
|
||||
|
||||
Remember to use the trigger words for [CiroN2022/toy-face](https://hf.co/CiroN2022/toy-face) and [nerijs/pixel-art-xl](https://hf.co/nerijs/pixel-art-xl) (these are found in their repositories) in the prompt to generate an image.
|
||||
The trigger words for [CiroN2022/toy-face](https://hf.co/CiroN2022/toy-face) and [nerijs/pixel-art-xl](https://hf.co/nerijs/pixel-art-xl) are found in their repositories.
|
||||
|
||||
|
||||
```python
|
||||
# Notice how the prompt is constructed.
|
||||
prompt = "toy_face of a hacker with a hoodie, pixel art"
|
||||
image = pipe(
|
||||
prompt, num_inference_steps=30, cross_attention_kwargs={"scale": 1.0}, generator=torch.manual_seed(0)
|
||||
@@ -103,39 +113,43 @@ image
|
||||
|
||||

|
||||
|
||||
Impressive! As you can see, the model generated an image that mixed the characteristics of both adapters.
|
||||
Impressive! As you can see, the model was able to generate an image that mixes the characteristics of both adapters.
|
||||
|
||||
> [!TIP]
|
||||
> Through its PEFT integration, Diffusers also offers more efficient merging methods which you can learn about in the [Merge LoRAs](../using-diffusers/merge_loras) guide!
|
||||
|
||||
To return to only using one adapter, use the [`~diffusers.loaders.UNet2DConditionLoadersMixin.set_adapters`] method to activate the `"toy"` adapter:
|
||||
If you want to go back to using only one adapter, use the [`~diffusers.loaders.UNet2DConditionLoadersMixin.set_adapters`] method to activate the `"toy"` adapter:
|
||||
|
||||
```python
|
||||
# First, set the adapter.
|
||||
pipe.set_adapters("toy")
|
||||
|
||||
# Then, run inference.
|
||||
prompt = "toy_face of a hacker with a hoodie"
|
||||
lora_scale = 0.9
|
||||
lora_scale= 0.9
|
||||
image = pipe(
|
||||
prompt, num_inference_steps=30, cross_attention_kwargs={"scale": lora_scale}, generator=torch.manual_seed(0)
|
||||
).images[0]
|
||||
image
|
||||
```
|
||||
|
||||
Or to disable all adapters entirely, use the [`~diffusers.loaders.UNet2DConditionLoadersMixin.disable_lora`] method to return the base model.
|
||||

|
||||
|
||||
|
||||
If you want to switch to only the base model, disable all LoRAs with the [`~diffusers.loaders.UNet2DConditionLoadersMixin.disable_lora`] method.
|
||||
|
||||
|
||||
```python
|
||||
pipe.disable_lora()
|
||||
|
||||
prompt = "toy_face of a hacker with a hoodie"
|
||||
lora_scale= 0.9
|
||||
image = pipe(prompt, num_inference_steps=30, generator=torch.manual_seed(0)).images[0]
|
||||
image
|
||||
```
|
||||
|
||||

|
||||
|
||||
## Manage active adapters
|
||||
## Monitoring active adapters
|
||||
|
||||
You have attached multiple adapters in this tutorial, and if you're feeling a bit lost on what adapters have been attached to the pipeline's components, use the [`~diffusers.loaders.LoraLoaderMixin.get_active_adapters`] method to check the list of active adapters:
|
||||
You have attached multiple adapters in this tutorial, and if you're feeling a bit lost on what adapters have been attached to the pipeline's components, you can easily check the list of active adapters using the [`~diffusers.loaders.LoraLoaderMixin.get_active_adapters`] method:
|
||||
|
||||
```py
|
||||
active_adapters = pipe.get_active_adapters()
|
||||
@@ -150,3 +164,55 @@ list_adapters_component_wise = pipe.get_list_adapters()
|
||||
list_adapters_component_wise
|
||||
{"text_encoder": ["toy", "pixel"], "unet": ["toy", "pixel"], "text_encoder_2": ["toy", "pixel"]}
|
||||
```
|
||||
|
||||
## Fusing adapters into the model
|
||||
|
||||
You can use PEFT to easily fuse/unfuse multiple adapters directly into the model weights (both UNet and text encoder) using the [`~diffusers.loaders.LoraLoaderMixin.fuse_lora`] method, which can lead to a speed-up in inference and lower VRAM usage.
|
||||
|
||||
```py
|
||||
pipe.load_lora_weights("nerijs/pixel-art-xl", weight_name="pixel-art-xl.safetensors", adapter_name="pixel")
|
||||
pipe.load_lora_weights("CiroN2022/toy-face", weight_name="toy_face_sdxl.safetensors", adapter_name="toy")
|
||||
|
||||
pipe.set_adapters(["pixel", "toy"], adapter_weights=[0.5, 1.0])
|
||||
# Fuses the LoRAs into the Unet
|
||||
pipe.fuse_lora()
|
||||
|
||||
prompt = "toy_face of a hacker with a hoodie, pixel art"
|
||||
image = pipe(prompt, num_inference_steps=30, generator=torch.manual_seed(0)).images[0]
|
||||
|
||||
# Gets the Unet back to the original state
|
||||
pipe.unfuse_lora()
|
||||
```
|
||||
|
||||
You can also fuse some adapters using `adapter_names` for faster generation:
|
||||
|
||||
```py
|
||||
pipe.load_lora_weights("nerijs/pixel-art-xl", weight_name="pixel-art-xl.safetensors", adapter_name="pixel")
|
||||
pipe.load_lora_weights("CiroN2022/toy-face", weight_name="toy_face_sdxl.safetensors", adapter_name="toy")
|
||||
|
||||
pipe.set_adapters(["pixel"], adapter_weights=[0.5, 1.0])
|
||||
# Fuses the LoRAs into the Unet
|
||||
pipe.fuse_lora(adapter_names=["pixel"])
|
||||
|
||||
prompt = "a hacker with a hoodie, pixel art"
|
||||
image = pipe(prompt, num_inference_steps=30, generator=torch.manual_seed(0)).images[0]
|
||||
|
||||
# Gets the Unet back to the original state
|
||||
pipe.unfuse_lora()
|
||||
|
||||
# Fuse all adapters
|
||||
pipe.fuse_lora(adapter_names=["pixel", "toy"])
|
||||
|
||||
prompt = "toy_face of a hacker with a hoodie, pixel art"
|
||||
image = pipe(prompt, num_inference_steps=30, generator=torch.manual_seed(0)).images[0]
|
||||
```
|
||||
|
||||
## Saving a pipeline after fusing the adapters
|
||||
|
||||
To properly save a pipeline after it's been loaded with the adapters, it should be serialized like so:
|
||||
|
||||
```python
|
||||
pipe.fuse_lora(lora_scale=1.0)
|
||||
pipe.unload_lora_weights()
|
||||
pipe.save_pretrained("path-to-pipeline")
|
||||
```
|
||||
|
||||
@@ -12,18 +12,13 @@ specific language governing permissions and limitations under the License.
|
||||
|
||||
# Pipeline callbacks
|
||||
|
||||
The denoising loop of a pipeline can be modified with custom defined functions using the `callback_on_step_end` parameter. The callback function is executed at the end of each step, and modifies the pipeline attributes and variables for the next step. This is really useful for *dynamically* adjusting certain pipeline attributes or modifying tensor variables. This versatility allows for interesting use-cases such as changing the prompt embeddings at each timestep, assigning different weights to the prompt embeddings, and editing the guidance scale. With callbacks, you can implement new features without modifying the underlying code!
|
||||
The denoising loop of a pipeline can be modified with custom defined functions using the `callback_on_step_end` parameter. This can be really useful for *dynamically* adjusting certain pipeline attributes, or modifying tensor variables. The flexibility of callbacks opens up some interesting use-cases such as changing the prompt embeddings at each timestep, assigning different weights to the prompt embeddings, and editing the guidance scale.
|
||||
|
||||
> [!TIP]
|
||||
> 🤗 Diffusers currently only supports `callback_on_step_end`, but feel free to open a [feature request](https://github.com/huggingface/diffusers/issues/new/choose) if you have a cool use-case and require a callback function with a different execution point!
|
||||
This guide will show you how to use the `callback_on_step_end` parameter to disable classifier-free guidance (CFG) after 40% of the inference steps to save compute with minimal cost to performance.
|
||||
|
||||
This guide will demonstrate how callbacks work by a few features you can implement with them.
|
||||
The callback function should have the following arguments:
|
||||
|
||||
## Dynamic classifier-free guidance
|
||||
|
||||
Dynamic classifier-free guidance (CFG) is a feature that allows you to disable CFG after a certain number of inference steps which can help you save compute with minimal cost to performance. The callback function for this should have the following arguments:
|
||||
|
||||
* `pipeline` (or the pipeline instance) provides access to important properties such as `num_timesteps` and `guidance_scale`. You can modify these properties by updating the underlying attributes. For this example, you'll disable CFG by setting `pipeline._guidance_scale=0.0`.
|
||||
* `pipe` (or the pipeline instance) provides access to useful properties such as `num_timesteps` and `guidance_scale`. You can modify these properties by updating the underlying attributes. For this example, you'll disable CFG by setting `pipe._guidance_scale=0.0`.
|
||||
* `step_index` and `timestep` tell you where you are in the denoising loop. Use `step_index` to turn off CFG after reaching 40% of `num_timesteps`.
|
||||
* `callback_kwargs` is a dict that contains tensor variables you can modify during the denoising loop. It only includes variables specified in the `callback_on_step_end_tensor_inputs` argument, which is passed to the pipeline's `__call__` method. Different pipelines may use different sets of variables, so please check a pipeline's `_callback_tensor_inputs` attribute for the list of variables you can modify. Some common variables include `latents` and `prompt_embeds`. For this function, change the batch size of `prompt_embeds` after setting `guidance_scale=0.0` in order for it to work properly.
|
||||
|
||||
@@ -32,13 +27,13 @@ Your callback function should look something like this:
|
||||
```python
|
||||
def callback_dynamic_cfg(pipe, step_index, timestep, callback_kwargs):
|
||||
# adjust the batch_size of prompt_embeds according to guidance_scale
|
||||
if step_index == int(pipeline.num_timesteps * 0.4):
|
||||
if step_index == int(pipe.num_timesteps * 0.4):
|
||||
prompt_embeds = callback_kwargs["prompt_embeds"]
|
||||
prompt_embeds = prompt_embeds.chunk(2)[-1]
|
||||
|
||||
# update guidance_scale and prompt_embeds
|
||||
pipeline._guidance_scale = 0.0
|
||||
callback_kwargs["prompt_embeds"] = prompt_embeds
|
||||
# update guidance_scale and prompt_embeds
|
||||
pipe._guidance_scale = 0.0
|
||||
callback_kwargs["prompt_embeds"] = prompt_embeds
|
||||
return callback_kwargs
|
||||
```
|
||||
|
||||
@@ -48,134 +43,58 @@ Now, you can pass the callback function to the `callback_on_step_end` parameter
|
||||
import torch
|
||||
from diffusers import StableDiffusionPipeline
|
||||
|
||||
pipeline = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16)
|
||||
pipeline = pipeline.to("cuda")
|
||||
pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16)
|
||||
pipe = pipe.to("cuda")
|
||||
|
||||
prompt = "a photo of an astronaut riding a horse on mars"
|
||||
|
||||
generator = torch.Generator(device="cuda").manual_seed(1)
|
||||
out = pipeline(
|
||||
prompt,
|
||||
generator=generator,
|
||||
callback_on_step_end=callback_dynamic_cfg,
|
||||
callback_on_step_end_tensor_inputs=['prompt_embeds']
|
||||
)
|
||||
out = pipe(prompt, generator=generator, callback_on_step_end=callback_dynamic_cfg, callback_on_step_end_tensor_inputs=['prompt_embeds'])
|
||||
|
||||
out.images[0].save("out_custom_cfg.png")
|
||||
```
|
||||
|
||||
The callback function is executed at the end of each denoising step, and modifies the pipeline attributes and tensor variables for the next denoising step.
|
||||
|
||||
With callbacks, you can implement features such as dynamic CFG without having to modify the underlying code at all!
|
||||
|
||||
<Tip>
|
||||
|
||||
🤗 Diffusers currently only supports `callback_on_step_end`, but feel free to open a [feature request](https://github.com/huggingface/diffusers/issues/new/choose) if you have a cool use-case and require a callback function with a different execution point!
|
||||
|
||||
</Tip>
|
||||
|
||||
## Interrupt the diffusion process
|
||||
|
||||
> [!TIP]
|
||||
> The interruption callback is supported for text-to-image, image-to-image, and inpainting for the [StableDiffusionPipeline](../api/pipelines/stable_diffusion/overview) and [StableDiffusionXLPipeline](../api/pipelines/stable_diffusion/stable_diffusion_xl).
|
||||
Interrupting the diffusion process is particularly useful when building UIs that work with Diffusers because it allows users to stop the generation process if they're unhappy with the intermediate results. You can incorporate this into your pipeline with a callback.
|
||||
|
||||
Stopping the diffusion process early is useful when building UIs that work with Diffusers because it allows users to stop the generation process if they're unhappy with the intermediate results. You can incorporate this into your pipeline with a callback.
|
||||
<Tip>
|
||||
|
||||
This callback function should take the following arguments: `pipeline`, `i`, `t`, and `callback_kwargs` (this must be returned). Set the pipeline's `_interrupt` attribute to `True` to stop the diffusion process after a certain number of steps. You are also free to implement your own custom stopping logic inside the callback.
|
||||
The interruption callback is supported for text-to-image, image-to-image, and inpainting for the [StableDiffusionPipeline](../api/pipelines/stable_diffusion/overview) and [StableDiffusionXLPipeline](../api/pipelines/stable_diffusion/stable_diffusion_xl).
|
||||
|
||||
</Tip>
|
||||
|
||||
This callback function should take the following arguments: `pipe`, `i`, `t`, and `callback_kwargs` (this must be returned). Set the pipeline's `_interrupt` attribute to `True` to stop the diffusion process after a certain number of steps. You are also free to implement your own custom stopping logic inside the callback.
|
||||
|
||||
In this example, the diffusion process is stopped after 10 steps even though `num_inference_steps` is set to 50.
|
||||
|
||||
```python
|
||||
from diffusers import StableDiffusionPipeline
|
||||
|
||||
pipeline = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
|
||||
pipeline.enable_model_cpu_offload()
|
||||
pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
|
||||
pipe.enable_model_cpu_offload()
|
||||
num_inference_steps = 50
|
||||
|
||||
def interrupt_callback(pipeline, i, t, callback_kwargs):
|
||||
def interrupt_callback(pipe, i, t, callback_kwargs):
|
||||
stop_idx = 10
|
||||
if i == stop_idx:
|
||||
pipeline._interrupt = True
|
||||
pipe._interrupt = True
|
||||
|
||||
return callback_kwargs
|
||||
|
||||
pipeline(
|
||||
pipe(
|
||||
"A photo of a cat",
|
||||
num_inference_steps=num_inference_steps,
|
||||
callback_on_step_end=interrupt_callback,
|
||||
)
|
||||
```
|
||||
|
||||
## Display image after each generation step
|
||||
|
||||
> [!TIP]
|
||||
> This tip was contributed by [asomoza](https://github.com/asomoza).
|
||||
|
||||
Display an image after each generation step by accessing and converting the latents after each step into an image. The latent space is compressed to 128x128, so the images are also 128x128 which is useful for a quick preview.
|
||||
|
||||
1. Use the function below to convert the SDXL latents (4 channels) to RGB tensors (3 channels) as explained in the [Explaining the SDXL latent space](https://huggingface.co/blog/TimothyAlexisVass/explaining-the-sdxl-latent-space) blog post.
|
||||
|
||||
```py
|
||||
def latents_to_rgb(latents):
|
||||
weights = (
|
||||
(60, -60, 25, -70),
|
||||
(60, -5, 15, -50),
|
||||
(60, 10, -5, -35)
|
||||
)
|
||||
|
||||
weights_tensor = torch.t(torch.tensor(weights, dtype=latents.dtype).to(latents.device))
|
||||
biases_tensor = torch.tensor((150, 140, 130), dtype=latents.dtype).to(latents.device)
|
||||
rgb_tensor = torch.einsum("...lxy,lr -> ...rxy", latents, weights_tensor) + biases_tensor.unsqueeze(-1).unsqueeze(-1)
|
||||
image_array = rgb_tensor.clamp(0, 255)[0].byte().cpu().numpy()
|
||||
image_array = image_array.transpose(1, 2, 0)
|
||||
|
||||
return Image.fromarray(image_array)
|
||||
```
|
||||
|
||||
2. Create a function to decode and save the latents into an image.
|
||||
|
||||
```py
|
||||
def decode_tensors(pipe, step, timestep, callback_kwargs):
|
||||
latents = callback_kwargs["latents"]
|
||||
|
||||
image = latents_to_rgb(latents)
|
||||
image.save(f"{step}.png")
|
||||
|
||||
return callback_kwargs
|
||||
```
|
||||
|
||||
3. Pass the `decode_tensors` function to the `callback_on_step_end` parameter to decode the tensors after each step. You also need to specify what you want to modify in the `callback_on_step_end_tensor_inputs` parameter, which in this case are the latents.
|
||||
|
||||
```py
|
||||
from diffusers import AutoPipelineForText2Image
|
||||
import torch
|
||||
from PIL import Image
|
||||
|
||||
pipeline = AutoPipelineForText2Image.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0",
|
||||
torch_dtype=torch.float16,
|
||||
variant="fp16",
|
||||
use_safetensors=True
|
||||
).to("cuda")
|
||||
|
||||
image = pipe(
|
||||
prompt = "A croissant shaped like a cute bear."
|
||||
negative_prompt = "Deformed, ugly, bad anatomy"
|
||||
callback_on_step_end=decode_tensors,
|
||||
callback_on_step_end_tensor_inputs=["latents"],
|
||||
).images[0]
|
||||
```
|
||||
|
||||
<div class="flex gap-4 justify-center">
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/tips_step_0.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">step 0</figcaption>
|
||||
</div>
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/tips_step_19.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">step 19
|
||||
</figcaption>
|
||||
</div>
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/tips_step_29.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">step 29</figcaption>
|
||||
</div>
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/tips_step_39.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">step 39</figcaption>
|
||||
</div>
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/tips_step_49.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">step 49</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
@@ -429,27 +429,6 @@ image = pipe(
|
||||
make_image_grid([original_image, canny_image, image], rows=1, cols=3)
|
||||
```
|
||||
|
||||
<Tip>
|
||||
|
||||
You can use a refiner model with `StableDiffusionXLControlNetPipeline` to improve image quality, just like you can with a regular `StableDiffusionXLPipeline`.
|
||||
See the [Refine image quality](./sdxl#refine-image-quality) section to learn how to use the refiner model.
|
||||
Make sure to use `StableDiffusionXLControlNetPipeline` and pass `image` and `controlnet_conditioning_scale`.
|
||||
|
||||
```py
|
||||
base = StableDiffusionXLControlNetPipeline(...)
|
||||
image = base(
|
||||
prompt=prompt,
|
||||
controlnet_conditioning_scale=0.5,
|
||||
image=canny_image,
|
||||
num_inference_steps=40,
|
||||
denoising_end=0.8,
|
||||
output_type="latent",
|
||||
).images
|
||||
# rest exactly as with StableDiffusionXLPipeline
|
||||
```
|
||||
|
||||
</Tip>
|
||||
|
||||
## MultiControlNet
|
||||
|
||||
<Tip>
|
||||
|
||||
@@ -56,60 +56,6 @@ pipeline = DiffusionPipeline.from_pretrained(
|
||||
)
|
||||
```
|
||||
|
||||
### Load from a local file
|
||||
|
||||
Community pipelines can also be loaded from a local file if you pass a file path instead. The path to the passed directory must contain a `pipeline.py` file that contains the pipeline class in order to successfully load it.
|
||||
|
||||
```py
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
"runwayml/stable-diffusion-v1-5",
|
||||
custom_pipeline="./path/to/pipeline_directory/",
|
||||
clip_model=clip_model,
|
||||
feature_extractor=feature_extractor,
|
||||
use_safetensors=True,
|
||||
)
|
||||
```
|
||||
|
||||
### Load from a specific version
|
||||
|
||||
By default, community pipelines are loaded from the latest stable version of Diffusers. To load a community pipeline from another version, use the `custom_revision` parameter.
|
||||
|
||||
<hfoptions id="version">
|
||||
<hfoption id="main">
|
||||
|
||||
For example, to load from the `main` branch:
|
||||
|
||||
```py
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
"runwayml/stable-diffusion-v1-5",
|
||||
custom_pipeline="clip_guided_stable_diffusion",
|
||||
custom_revision="main",
|
||||
clip_model=clip_model,
|
||||
feature_extractor=feature_extractor,
|
||||
use_safetensors=True,
|
||||
)
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="older version">
|
||||
|
||||
For example, to load from a previous version of Diffusers like `v0.25.0`:
|
||||
|
||||
```py
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
"runwayml/stable-diffusion-v1-5",
|
||||
custom_pipeline="clip_guided_stable_diffusion",
|
||||
custom_revision="v0.25.0",
|
||||
clip_model=clip_model,
|
||||
feature_extractor=feature_extractor,
|
||||
use_safetensors=True,
|
||||
)
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
|
||||
For more information about community pipelines, take a look at the [Community pipelines](custom_pipeline_examples) guide for how to use them and if you're interested in adding a community pipeline check out the [How to contribute a community pipeline](contribute_pipeline) guide!
|
||||
|
||||
## Community components
|
||||
@@ -239,7 +185,5 @@ pipeline.to("cuda")
|
||||
prompt = "柴犬、カラフルアート"
|
||||
|
||||
image = pipeline(prompt=prompt).images[0]
|
||||
```
|
||||
|
||||
> [!TIP]
|
||||
> When using `trust_remote_code=True`, it is also strongly encouraged to pass a commit hash as a `revision` to make sure the author of the models did not update the code with some malicious new lines (unless you fully trust the authors of the models).
|
||||
```
|
||||
@@ -128,7 +128,7 @@ seed = 2023
|
||||
# The values come from
|
||||
# https://github.com/lyn-rgb/FreeU_Diffusers#video-pipelines
|
||||
pipe.enable_freeu(b1=1.2, b2=1.4, s1=0.9, s2=0.2)
|
||||
video_frames = pipe(prompt, height=320, width=576, num_frames=30, generator=torch.manual_seed(seed)).frames[0]
|
||||
video_frames = pipe(prompt, height=320, width=576, num_frames=30, generator=torch.manual_seed(seed)).frames
|
||||
export_to_video(video_frames, "astronaut_rides_horse.mp4")
|
||||
```
|
||||
|
||||
|
||||
@@ -1,438 +0,0 @@
|
||||
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
[[open-in-colab]]
|
||||
|
||||
# Trajectory Consistency Distillation-LoRA
|
||||
|
||||
Trajectory Consistency Distillation (TCD) enables a model to generate higher quality and more detailed images with fewer steps. Moreover, owing to the effective error mitigation during the distillation process, TCD demonstrates superior performance even under conditions of large inference steps.
|
||||
|
||||
The major advantages of TCD are:
|
||||
|
||||
- Better than Teacher: TCD demonstrates superior generative quality at both small and large inference steps and exceeds the performance of [DPM-Solver++(2S)](../../api/schedulers/multistep_dpm_solver) with Stable Diffusion XL (SDXL). There is no additional discriminator or LPIPS supervision included during TCD training.
|
||||
|
||||
- Flexible Inference Steps: The inference steps for TCD sampling can be freely adjusted without adversely affecting the image quality.
|
||||
|
||||
- Freely change detail level: During inference, the level of detail in the image can be adjusted with a single hyperparameter, *gamma*.
|
||||
|
||||
> [!TIP]
|
||||
> For more technical details of TCD, please refer to the [paper](https://arxiv.org/abs/2402.19159) or official [project page](https://mhh0318.github.io/tcd/)).
|
||||
|
||||
For large models like SDXL, TCD is trained with [LoRA](https://huggingface.co/docs/peft/conceptual_guides/adapter#low-rank-adaptation-lora) to reduce memory usage. This is also useful because you can reuse LoRAs between different finetuned models, as long as they share the same base model, without further training.
|
||||
|
||||
|
||||
|
||||
This guide will show you how to perform inference with TCD-LoRAs for a variety of tasks like text-to-image and inpainting, as well as how you can easily combine TCD-LoRAs with other adapters. Choose one of the supported base model and it's corresponding TCD-LoRA checkpoint from the table below to get started.
|
||||
|
||||
| Base model | TCD-LoRA checkpoint |
|
||||
|-------------------------------------------------------------------------------------------------|----------------------------------------------------------------|
|
||||
| [stable-diffusion-v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5) | [TCD-SD15](https://huggingface.co/h1t/TCD-SD15-LoRA) |
|
||||
| [stable-diffusion-2-1-base](https://huggingface.co/stabilityai/stable-diffusion-2-1-base) | [TCD-SD21-base](https://huggingface.co/h1t/TCD-SD21-base-LoRA) |
|
||||
| [stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) | [TCD-SDXL](https://huggingface.co/h1t/TCD-SDXL-LoRA) |
|
||||
|
||||
|
||||
Make sure you have [PEFT](https://github.com/huggingface/peft) installed for better LoRA support.
|
||||
|
||||
```bash
|
||||
pip install -U peft
|
||||
```
|
||||
|
||||
## General tasks
|
||||
|
||||
In this guide, let's use the [`StableDiffusionXLPipeline`] and the [`TCDScheduler`]. Use the [`~StableDiffusionPipeline.load_lora_weights`] method to load the SDXL-compatible TCD-LoRA weights.
|
||||
|
||||
A few tips to keep in mind for TCD-LoRA inference are to:
|
||||
|
||||
- Keep the `num_inference_steps` between 4 and 50
|
||||
- Set `eta` (used to control stochasticity at each step) between 0 and 1. You should use a higher `eta` when increasing the number of inference steps, but the downside is that a larger `eta` in [`TCDScheduler`] leads to blurrier images. A value of 0.3 is recommended to produce good results.
|
||||
|
||||
<hfoptions id="tasks">
|
||||
<hfoption id="text-to-image">
|
||||
|
||||
```python
|
||||
import torch
|
||||
from diffusers import StableDiffusionXLPipeline, TCDScheduler
|
||||
|
||||
device = "cuda"
|
||||
base_model_id = "stabilityai/stable-diffusion-xl-base-1.0"
|
||||
tcd_lora_id = "h1t/TCD-SDXL-LoRA"
|
||||
|
||||
pipe = StableDiffusionXLPipeline.from_pretrained(base_model_id, torch_dtype=torch.float16, variant="fp16").to(device)
|
||||
pipe.scheduler = TCDScheduler.from_config(pipe.scheduler.config)
|
||||
|
||||
pipe.load_lora_weights(tcd_lora_id)
|
||||
pipe.fuse_lora()
|
||||
|
||||
prompt = "Painting of the orange cat Otto von Garfield, Count of Bismarck-Schönhausen, Duke of Lauenburg, Minister-President of Prussia. Depicted wearing a Prussian Pickelhaube and eating his favorite meal - lasagna."
|
||||
|
||||
image = pipe(
|
||||
prompt=prompt,
|
||||
num_inference_steps=4,
|
||||
guidance_scale=0,
|
||||
eta=0.3,
|
||||
generator=torch.Generator(device=device).manual_seed(0),
|
||||
).images[0]
|
||||
```
|
||||
|
||||

|
||||
|
||||
</hfoption>
|
||||
|
||||
<hfoption id="inpainting">
|
||||
|
||||
```python
|
||||
import torch
|
||||
from diffusers import AutoPipelineForInpainting, TCDScheduler
|
||||
from diffusers.utils import load_image, make_image_grid
|
||||
|
||||
device = "cuda"
|
||||
base_model_id = "diffusers/stable-diffusion-xl-1.0-inpainting-0.1"
|
||||
tcd_lora_id = "h1t/TCD-SDXL-LoRA"
|
||||
|
||||
pipe = AutoPipelineForInpainting.from_pretrained(base_model_id, torch_dtype=torch.float16, variant="fp16").to(device)
|
||||
pipe.scheduler = TCDScheduler.from_config(pipe.scheduler.config)
|
||||
|
||||
pipe.load_lora_weights(tcd_lora_id)
|
||||
pipe.fuse_lora()
|
||||
|
||||
img_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png"
|
||||
mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png"
|
||||
|
||||
init_image = load_image(img_url).resize((1024, 1024))
|
||||
mask_image = load_image(mask_url).resize((1024, 1024))
|
||||
|
||||
prompt = "a tiger sitting on a park bench"
|
||||
|
||||
image = pipe(
|
||||
prompt=prompt,
|
||||
image=init_image,
|
||||
mask_image=mask_image,
|
||||
num_inference_steps=8,
|
||||
guidance_scale=0,
|
||||
eta=0.3,
|
||||
strength=0.99, # make sure to use `strength` below 1.0
|
||||
generator=torch.Generator(device=device).manual_seed(0),
|
||||
).images[0]
|
||||
|
||||
grid_image = make_image_grid([init_image, mask_image, image], rows=1, cols=3)
|
||||
```
|
||||
|
||||

|
||||
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
## Community models
|
||||
|
||||
TCD-LoRA also works with many community finetuned models and plugins. For example, load the [animagine-xl-3.0](https://huggingface.co/cagliostrolab/animagine-xl-3.0) checkpoint which is a community finetuned version of SDXL for generating anime images.
|
||||
|
||||
```python
|
||||
import torch
|
||||
from diffusers import StableDiffusionXLPipeline, TCDScheduler
|
||||
|
||||
device = "cuda"
|
||||
base_model_id = "cagliostrolab/animagine-xl-3.0"
|
||||
tcd_lora_id = "h1t/TCD-SDXL-LoRA"
|
||||
|
||||
pipe = StableDiffusionXLPipeline.from_pretrained(base_model_id, torch_dtype=torch.float16, variant="fp16").to(device)
|
||||
pipe.scheduler = TCDScheduler.from_config(pipe.scheduler.config)
|
||||
|
||||
pipe.load_lora_weights(tcd_lora_id)
|
||||
pipe.fuse_lora()
|
||||
|
||||
prompt = "A man, clad in a meticulously tailored military uniform, stands with unwavering resolve. The uniform boasts intricate details, and his eyes gleam with determination. Strands of vibrant, windswept hair peek out from beneath the brim of his cap."
|
||||
|
||||
image = pipe(
|
||||
prompt=prompt,
|
||||
num_inference_steps=8,
|
||||
guidance_scale=0,
|
||||
eta=0.3,
|
||||
generator=torch.Generator(device=device).manual_seed(0),
|
||||
).images[0]
|
||||
```
|
||||
|
||||

|
||||
|
||||
TCD-LoRA also supports other LoRAs trained on different styles. For example, let's load the [TheLastBen/Papercut_SDXL](https://huggingface.co/TheLastBen/Papercut_SDXL) LoRA and fuse it with the TCD-LoRA with the [`~loaders.UNet2DConditionLoadersMixin.set_adapters`] method.
|
||||
|
||||
> [!TIP]
|
||||
> Check out the [Merge LoRAs](merge_loras) guide to learn more about efficient merging methods.
|
||||
|
||||
```python
|
||||
import torch
|
||||
from diffusers import StableDiffusionXLPipeline
|
||||
from scheduling_tcd import TCDScheduler
|
||||
|
||||
device = "cuda"
|
||||
base_model_id = "stabilityai/stable-diffusion-xl-base-1.0"
|
||||
tcd_lora_id = "h1t/TCD-SDXL-LoRA"
|
||||
styled_lora_id = "TheLastBen/Papercut_SDXL"
|
||||
|
||||
pipe = StableDiffusionXLPipeline.from_pretrained(base_model_id, torch_dtype=torch.float16, variant="fp16").to(device)
|
||||
pipe.scheduler = TCDScheduler.from_config(pipe.scheduler.config)
|
||||
|
||||
pipe.load_lora_weights(tcd_lora_id, adapter_name="tcd")
|
||||
pipe.load_lora_weights(styled_lora_id, adapter_name="style")
|
||||
pipe.set_adapters(["tcd", "style"], adapter_weights=[1.0, 1.0])
|
||||
|
||||
prompt = "papercut of a winter mountain, snow"
|
||||
|
||||
image = pipe(
|
||||
prompt=prompt,
|
||||
num_inference_steps=4,
|
||||
guidance_scale=0,
|
||||
eta=0.3,
|
||||
generator=torch.Generator(device=device).manual_seed(0),
|
||||
).images[0]
|
||||
```
|
||||
|
||||

|
||||
|
||||
|
||||
## Adapters
|
||||
|
||||
TCD-LoRA is very versatile, and it can be combined with other adapter types like ControlNets, IP-Adapter, and AnimateDiff.
|
||||
|
||||
<hfoptions id="adapters">
|
||||
<hfoption id="ControlNet">
|
||||
|
||||
### Depth ControlNet
|
||||
|
||||
```python
|
||||
import torch
|
||||
import numpy as np
|
||||
from PIL import Image
|
||||
from transformers import DPTFeatureExtractor, DPTForDepthEstimation
|
||||
from diffusers import ControlNetModel, StableDiffusionXLControlNetPipeline
|
||||
from diffusers.utils import load_image, make_image_grid
|
||||
from scheduling_tcd import TCDScheduler
|
||||
|
||||
device = "cuda"
|
||||
depth_estimator = DPTForDepthEstimation.from_pretrained("Intel/dpt-hybrid-midas").to(device)
|
||||
feature_extractor = DPTFeatureExtractor.from_pretrained("Intel/dpt-hybrid-midas")
|
||||
|
||||
def get_depth_map(image):
|
||||
image = feature_extractor(images=image, return_tensors="pt").pixel_values.to(device)
|
||||
with torch.no_grad(), torch.autocast(device):
|
||||
depth_map = depth_estimator(image).predicted_depth
|
||||
|
||||
depth_map = torch.nn.functional.interpolate(
|
||||
depth_map.unsqueeze(1),
|
||||
size=(1024, 1024),
|
||||
mode="bicubic",
|
||||
align_corners=False,
|
||||
)
|
||||
depth_min = torch.amin(depth_map, dim=[1, 2, 3], keepdim=True)
|
||||
depth_max = torch.amax(depth_map, dim=[1, 2, 3], keepdim=True)
|
||||
depth_map = (depth_map - depth_min) / (depth_max - depth_min)
|
||||
image = torch.cat([depth_map] * 3, dim=1)
|
||||
|
||||
image = image.permute(0, 2, 3, 1).cpu().numpy()[0]
|
||||
image = Image.fromarray((image * 255.0).clip(0, 255).astype(np.uint8))
|
||||
return image
|
||||
|
||||
base_model_id = "stabilityai/stable-diffusion-xl-base-1.0"
|
||||
controlnet_id = "diffusers/controlnet-depth-sdxl-1.0"
|
||||
tcd_lora_id = "h1t/TCD-SDXL-LoRA"
|
||||
|
||||
controlnet = ControlNetModel.from_pretrained(
|
||||
controlnet_id,
|
||||
torch_dtype=torch.float16,
|
||||
variant="fp16",
|
||||
).to(device)
|
||||
pipe = StableDiffusionXLControlNetPipeline.from_pretrained(
|
||||
base_model_id,
|
||||
controlnet=controlnet,
|
||||
torch_dtype=torch.float16,
|
||||
variant="fp16",
|
||||
).to(device)
|
||||
pipe.enable_model_cpu_offload()
|
||||
|
||||
pipe.scheduler = TCDScheduler.from_config(pipe.scheduler.config)
|
||||
|
||||
pipe.load_lora_weights(tcd_lora_id)
|
||||
pipe.fuse_lora()
|
||||
|
||||
prompt = "stormtrooper lecture, photorealistic"
|
||||
|
||||
image = load_image("https://huggingface.co/lllyasviel/sd-controlnet-depth/resolve/main/images/stormtrooper.png")
|
||||
depth_image = get_depth_map(image)
|
||||
|
||||
controlnet_conditioning_scale = 0.5 # recommended for good generalization
|
||||
|
||||
image = pipe(
|
||||
prompt,
|
||||
image=depth_image,
|
||||
num_inference_steps=4,
|
||||
guidance_scale=0,
|
||||
eta=0.3,
|
||||
controlnet_conditioning_scale=controlnet_conditioning_scale,
|
||||
generator=torch.Generator(device=device).manual_seed(0),
|
||||
).images[0]
|
||||
|
||||
grid_image = make_image_grid([depth_image, image], rows=1, cols=2)
|
||||
```
|
||||
|
||||

|
||||
|
||||
### Canny ControlNet
|
||||
```python
|
||||
import torch
|
||||
from diffusers import ControlNetModel, StableDiffusionXLControlNetPipeline
|
||||
from diffusers.utils import load_image, make_image_grid
|
||||
from scheduling_tcd import TCDScheduler
|
||||
|
||||
device = "cuda"
|
||||
base_model_id = "stabilityai/stable-diffusion-xl-base-1.0"
|
||||
controlnet_id = "diffusers/controlnet-canny-sdxl-1.0"
|
||||
tcd_lora_id = "h1t/TCD-SDXL-LoRA"
|
||||
|
||||
controlnet = ControlNetModel.from_pretrained(
|
||||
controlnet_id,
|
||||
torch_dtype=torch.float16,
|
||||
variant="fp16",
|
||||
).to(device)
|
||||
pipe = StableDiffusionXLControlNetPipeline.from_pretrained(
|
||||
base_model_id,
|
||||
controlnet=controlnet,
|
||||
torch_dtype=torch.float16,
|
||||
variant="fp16",
|
||||
).to(device)
|
||||
pipe.enable_model_cpu_offload()
|
||||
|
||||
pipe.scheduler = TCDScheduler.from_config(pipe.scheduler.config)
|
||||
|
||||
pipe.load_lora_weights(tcd_lora_id)
|
||||
pipe.fuse_lora()
|
||||
|
||||
prompt = "ultrarealistic shot of a furry blue bird"
|
||||
|
||||
canny_image = load_image("https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png")
|
||||
|
||||
controlnet_conditioning_scale = 0.5 # recommended for good generalization
|
||||
|
||||
image = pipe(
|
||||
prompt,
|
||||
image=canny_image,
|
||||
num_inference_steps=4,
|
||||
guidance_scale=0,
|
||||
eta=0.3,
|
||||
controlnet_conditioning_scale=controlnet_conditioning_scale,
|
||||
generator=torch.Generator(device=device).manual_seed(0),
|
||||
).images[0]
|
||||
|
||||
grid_image = make_image_grid([canny_image, image], rows=1, cols=2)
|
||||
```
|
||||

|
||||
|
||||
<Tip>
|
||||
The inference parameters in this example might not work for all examples, so we recommend you to try different values for `num_inference_steps`, `guidance_scale`, `controlnet_conditioning_scale` and `cross_attention_kwargs` parameters and choose the best one.
|
||||
</Tip>
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="IP-Adapter">
|
||||
|
||||
This example shows how to use the TCD-LoRA with the [IP-Adapter](https://github.com/tencent-ailab/IP-Adapter/tree/main) and SDXL.
|
||||
|
||||
```python
|
||||
import torch
|
||||
from diffusers import StableDiffusionXLPipeline
|
||||
from diffusers.utils import load_image, make_image_grid
|
||||
|
||||
from ip_adapter import IPAdapterXL
|
||||
from scheduling_tcd import TCDScheduler
|
||||
|
||||
device = "cuda"
|
||||
base_model_path = "stabilityai/stable-diffusion-xl-base-1.0"
|
||||
image_encoder_path = "sdxl_models/image_encoder"
|
||||
ip_ckpt = "sdxl_models/ip-adapter_sdxl.bin"
|
||||
tcd_lora_id = "h1t/TCD-SDXL-LoRA"
|
||||
|
||||
pipe = StableDiffusionXLPipeline.from_pretrained(
|
||||
base_model_path,
|
||||
torch_dtype=torch.float16,
|
||||
variant="fp16"
|
||||
)
|
||||
pipe.scheduler = TCDScheduler.from_config(pipe.scheduler.config)
|
||||
|
||||
pipe.load_lora_weights(tcd_lora_id)
|
||||
pipe.fuse_lora()
|
||||
|
||||
ip_model = IPAdapterXL(pipe, image_encoder_path, ip_ckpt, device)
|
||||
|
||||
ref_image = load_image("https://raw.githubusercontent.com/tencent-ailab/IP-Adapter/main/assets/images/woman.png").resize((512, 512))
|
||||
|
||||
prompt = "best quality, high quality, wearing sunglasses"
|
||||
|
||||
image = ip_model.generate(
|
||||
pil_image=ref_image,
|
||||
prompt=prompt,
|
||||
scale=0.5,
|
||||
num_samples=1,
|
||||
num_inference_steps=4,
|
||||
guidance_scale=0,
|
||||
eta=0.3,
|
||||
seed=0,
|
||||
)[0]
|
||||
|
||||
grid_image = make_image_grid([ref_image, image], rows=1, cols=2)
|
||||
```
|
||||
|
||||

|
||||
|
||||
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="AnimateDiff">
|
||||
|
||||
[`AnimateDiff`] allows animating images using Stable Diffusion models. TCD-LoRA can substantially accelerate the process without degrading image quality. The quality of animation with TCD-LoRA and AnimateDiff has a more lucid outcome.
|
||||
|
||||
```python
|
||||
import torch
|
||||
from diffusers import MotionAdapter, AnimateDiffPipeline, DDIMScheduler
|
||||
from scheduling_tcd import TCDScheduler
|
||||
from diffusers.utils import export_to_gif
|
||||
|
||||
adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5")
|
||||
pipe = AnimateDiffPipeline.from_pretrained(
|
||||
"frankjoshua/toonyou_beta6",
|
||||
motion_adapter=adapter,
|
||||
).to("cuda")
|
||||
|
||||
# set TCDScheduler
|
||||
pipe.scheduler = TCDScheduler.from_config(pipe.scheduler.config)
|
||||
|
||||
# load TCD LoRA
|
||||
pipe.load_lora_weights("h1t/TCD-SD15-LoRA", adapter_name="tcd")
|
||||
pipe.load_lora_weights("guoyww/animatediff-motion-lora-zoom-in", weight_name="diffusion_pytorch_model.safetensors", adapter_name="motion-lora")
|
||||
|
||||
pipe.set_adapters(["tcd", "motion-lora"], adapter_weights=[1.0, 1.2])
|
||||
|
||||
prompt = "best quality, masterpiece, 1girl, looking at viewer, blurry background, upper body, contemporary, dress"
|
||||
generator = torch.manual_seed(0)
|
||||
frames = pipe(
|
||||
prompt=prompt,
|
||||
num_inference_steps=5,
|
||||
guidance_scale=0,
|
||||
cross_attention_kwargs={"scale": 1},
|
||||
num_frames=24,
|
||||
eta=0.3,
|
||||
generator=generator
|
||||
).frames[0]
|
||||
export_to_gif(frames, "animation.gif")
|
||||
```
|
||||
|
||||

|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
@@ -1,594 +0,0 @@
|
||||
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# IP-Adapter
|
||||
|
||||
[IP-Adapter](https://hf.co/papers/2308.06721) is an image prompt adapter that can be plugged into diffusion models to enable image prompting without any changes to the underlying model. Furthermore, this adapter can be reused with other models finetuned from the same base model and it can be combined with other adapters like [ControlNet](../using-diffusers/controlnet). The key idea behind IP-Adapter is the *decoupled cross-attention* mechanism which adds a separate cross-attention layer just for image features instead of using the same cross-attention layer for both text and image features. This allows the model to learn more image-specific features.
|
||||
|
||||
> [!TIP]
|
||||
> Learn how to load an IP-Adapter in the [Load adapters](../using-diffusers/loading_adapters#ip-adapter) guide, and make sure you check out the [IP-Adapter Plus](../using-diffusers/loading_adapters#ip-adapter-plus) section which requires manually loading the image encoder.
|
||||
|
||||
This guide will walk you through using IP-Adapter for various tasks and use cases.
|
||||
|
||||
## General tasks
|
||||
|
||||
Let's take a look at how to use IP-Adapter's image prompting capabilities with the [`StableDiffusionXLPipeline`] for tasks like text-to-image, image-to-image, and inpainting. We also encourage you to try out other pipelines such as Stable Diffusion, LCM-LoRA, ControlNet, T2I-Adapter, or AnimateDiff!
|
||||
|
||||
In all the following examples, you'll see the [`~loaders.IPAdapterMixin.set_ip_adapter_scale`] method. This method controls the amount of text or image conditioning to apply to the model. A value of `1.0` means the model is only conditioned on the image prompt. Lowering this value encourages the model to produce more diverse images, but they may not be as aligned with the image prompt. Typically, a value of `0.5` achieves a good balance between the two prompt types and produces good results.
|
||||
|
||||
> [!TIP]
|
||||
> In the examples below, try adding `low_cpu_mem_usage=True` to the [`~loaders.IPAdapterMixin.load_ip_adapter`] method to speed up the loading time.
|
||||
|
||||
<hfoptions id="tasks">
|
||||
<hfoption id="Text-to-image">
|
||||
|
||||
Crafting the precise text prompt to generate the image you want can be difficult because it may not always capture what you'd like to express. Adding an image alongside the text prompt helps the model better understand what it should generate and can lead to more accurate results.
|
||||
|
||||
Load a Stable Diffusion XL (SDXL) model and insert an IP-Adapter into the model with the [`~loaders.IPAdapterMixin.load_ip_adapter`] method. Use the `subfolder` parameter to load the SDXL model weights.
|
||||
|
||||
```py
|
||||
from diffusers import AutoPipelineForText2Image
|
||||
from diffusers.utils import load_image
|
||||
import torch
|
||||
|
||||
pipeline = AutoPipelineForText2Image.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16).to("cuda")
|
||||
pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="sdxl_models", weight_name="ip-adapter_sdxl.bin")
|
||||
pipeline.set_ip_adapter_scale(0.6)
|
||||
```
|
||||
|
||||
Create a text prompt and load an image prompt before passing them to the pipeline to generate an image.
|
||||
|
||||
```py
|
||||
image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_adapter_diner.png")
|
||||
generator = torch.Generator(device="cpu").manual_seed(0)
|
||||
images = pipeline(
|
||||
prompt="a polar bear sitting in a chair drinking a milkshake",
|
||||
ip_adapter_image=image,
|
||||
negative_prompt="deformed, ugly, wrong proportion, low res, bad anatomy, worst quality, low quality",
|
||||
num_inference_steps=100,
|
||||
generator=generator,
|
||||
).images
|
||||
images[0]
|
||||
```
|
||||
|
||||
<div class="flex flex-row gap-4">
|
||||
<div class="flex-1">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_adapter_diner.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">IP-Adapter image</figcaption>
|
||||
</div>
|
||||
<div class="flex-1">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_adapter_diner_2.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">generated image</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="Image-to-image">
|
||||
|
||||
IP-Adapter can also help with image-to-image by guiding the model to generate an image that resembles the original image and the image prompt.
|
||||
|
||||
Load a Stable Diffusion XL (SDXL) model and insert an IP-Adapter into the model with the [`~loaders.IPAdapterMixin.load_ip_adapter`] method. Use the `subfolder` parameter to load the SDXL model weights.
|
||||
|
||||
```py
|
||||
from diffusers import AutoPipelineForImage2Image
|
||||
from diffusers.utils import load_image
|
||||
import torch
|
||||
|
||||
pipeline = AutoPipelineForImage2Image.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16).to("cuda")
|
||||
pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="sdxl_models", weight_name="ip-adapter_sdxl.bin")
|
||||
pipeline.set_ip_adapter_scale(0.6)
|
||||
```
|
||||
|
||||
Pass the original image and the IP-Adapter image prompt to the pipeline to generate an image. Providing a text prompt to the pipeline is optional, but in this example, a text prompt is used to increase image quality.
|
||||
|
||||
```py
|
||||
image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_adapter_bear_1.png")
|
||||
ip_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_adapter_bear_2.png")
|
||||
|
||||
generator = torch.Generator(device="cpu").manual_seed(4)
|
||||
images = pipeline(
|
||||
prompt="best quality, high quality",
|
||||
image=image,
|
||||
ip_adapter_image=ip_image,
|
||||
generator=generator,
|
||||
strength=0.6,
|
||||
).images
|
||||
images[0]
|
||||
```
|
||||
|
||||
<div class="flex gap-4">
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_adapter_bear_1.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">original image</figcaption>
|
||||
</div>
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_adapter_bear_2.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">IP-Adapter image</figcaption>
|
||||
</div>
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_adapter_bear_3.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">generated image</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="Inpainting">
|
||||
|
||||
IP-Adapter is also useful for inpainting because the image prompt allows you to be much more specific about what you'd like to generate.
|
||||
|
||||
Load a Stable Diffusion XL (SDXL) model and insert an IP-Adapter into the model with the [`~loaders.IPAdapterMixin.load_ip_adapter`] method. Use the `subfolder` parameter to load the SDXL model weights.
|
||||
|
||||
```py
|
||||
from diffusers import AutoPipelineForInpainting
|
||||
from diffusers.utils import load_image
|
||||
import torch
|
||||
|
||||
pipeline = AutoPipelineForInpainting.from_pretrained("diffusers/stable-diffusion-xl-1.0-inpainting-0.1", torch_dtype=torch.float16).to("cuda")
|
||||
pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="sdxl_models", weight_name="ip-adapter_sdxl.bin")
|
||||
pipeline.set_ip_adapter_scale(0.6)
|
||||
```
|
||||
|
||||
Pass a prompt, the original image, mask image, and the IP-Adapter image prompt to the pipeline to generate an image.
|
||||
|
||||
```py
|
||||
mask_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_adapter_mask.png")
|
||||
image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_adapter_bear_1.png")
|
||||
ip_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_adapter_gummy.png")
|
||||
|
||||
generator = torch.Generator(device="cpu").manual_seed(4)
|
||||
images = pipeline(
|
||||
prompt="a cute gummy bear waving",
|
||||
image=image,
|
||||
mask_image=mask_image,
|
||||
ip_adapter_image=ip_image,
|
||||
generator=generator,
|
||||
num_inference_steps=100,
|
||||
).images
|
||||
images[0]
|
||||
```
|
||||
|
||||
<div class="flex gap-4">
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_adapter_bear_1.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">original image</figcaption>
|
||||
</div>
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_adapter_gummy.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">IP-Adapter image</figcaption>
|
||||
</div>
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_adapter_inpaint.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">generated image</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="Video">
|
||||
|
||||
IP-Adapter can also help you generate videos that are more aligned with your text prompt. For example, let's load [AnimateDiff](../api/pipelines/animatediff) with its motion adapter and insert an IP-Adapter into the model with the [`~loaders.IPAdapterMixin.load_ip_adapter`] method.
|
||||
|
||||
> [!WARNING]
|
||||
> If you're planning on offloading the model to the CPU, make sure you run it after you've loaded the IP-Adapter. When you call [`~DiffusionPipeline.enable_model_cpu_offload`] before loading the IP-Adapter, it offloads the image encoder module to the CPU and it'll return an error when you try to run the pipeline.
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import AnimateDiffPipeline, DDIMScheduler, MotionAdapter
|
||||
from diffusers.utils import export_to_gif
|
||||
from diffusers.utils import load_image
|
||||
|
||||
adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2", torch_dtype=torch.float16)
|
||||
pipeline = AnimateDiffPipeline.from_pretrained("emilianJR/epiCRealism", motion_adapter=adapter, torch_dtype=torch.float16)
|
||||
scheduler = DDIMScheduler.from_pretrained(
|
||||
"emilianJR/epiCRealism",
|
||||
subfolder="scheduler",
|
||||
clip_sample=False,
|
||||
timestep_spacing="linspace",
|
||||
beta_schedule="linear",
|
||||
steps_offset=1,
|
||||
)
|
||||
pipeline.scheduler = scheduler
|
||||
pipeline.enable_vae_slicing()
|
||||
|
||||
pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin")
|
||||
pipeline.enable_model_cpu_offload()
|
||||
```
|
||||
|
||||
Pass a prompt and an image prompt to the pipeline to generate a short video.
|
||||
|
||||
```py
|
||||
ip_adapter_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_adapter_inpaint.png")
|
||||
|
||||
output = pipeline(
|
||||
prompt="A cute gummy bear waving",
|
||||
negative_prompt="bad quality, worse quality, low resolution",
|
||||
ip_adapter_image=ip_adapter_image,
|
||||
num_frames=16,
|
||||
guidance_scale=7.5,
|
||||
num_inference_steps=50,
|
||||
generator=torch.Generator(device="cpu").manual_seed(0),
|
||||
)
|
||||
frames = output.frames[0]
|
||||
export_to_gif(frames, "gummy_bear.gif")
|
||||
```
|
||||
|
||||
<div class="flex flex-row gap-4">
|
||||
<div class="flex-1">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_adapter_inpaint.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">IP-Adapter image</figcaption>
|
||||
</div>
|
||||
<div class="flex-1">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/gummy_bear.gif"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">generated video</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
## Configure parameters
|
||||
|
||||
There are a couple of IP-Adapter parameters that are useful to know about and can help you with your image generation tasks. These parameters can make your workflow more efficient or give you more control over image generation.
|
||||
|
||||
### Image embeddings
|
||||
|
||||
IP-Adapter enabled pipelines provide the `ip_adapter_image_embeds` parameter to accept precomputed image embeddings. This is particularly useful in scenarios where you need to run the IP-Adapter pipeline multiple times because you have more than one image. For example, [multi IP-Adapter](#multi-ip-adapter) is a specific use case where you provide multiple styling images to generate a specific image in a specific style. Loading and encoding multiple images each time you use the pipeline would be inefficient. Instead, you can precompute and save the image embeddings to disk (which can save a lot of space if you're using high-quality images) and load them when you need them.
|
||||
|
||||
> [!TIP]
|
||||
> This parameter also gives you the flexibility to load embeddings from other sources. For example, ComfyUI image embeddings for IP-Adapters are compatible with Diffusers and should work ouf-of-the-box!
|
||||
|
||||
Call the [`~StableDiffusionPipeline.prepare_ip_adapter_image_embeds`] method to encode and generate the image embeddings. Then you can save them to disk with `torch.save`.
|
||||
|
||||
> [!TIP]
|
||||
> If you're using IP-Adapter with `ip_adapter_image_embedding` instead of `ip_adapter_image`', you can set `load_ip_adapter(image_encoder_folder=None,...)` because you don't need to load an encoder to generate the image embeddings.
|
||||
|
||||
```py
|
||||
image_embeds = pipeline.prepare_ip_adapter_image_embeds(
|
||||
ip_adapter_image=image,
|
||||
ip_adapter_image_embeds=None,
|
||||
device="cuda",
|
||||
num_images_per_prompt=1,
|
||||
do_classifier_free_guidance=True,
|
||||
)
|
||||
|
||||
torch.save(image_embeds, "image_embeds.ipadpt")
|
||||
```
|
||||
|
||||
Now load the image embeddings by passing them to the `ip_adapter_image_embeds` parameter.
|
||||
|
||||
```py
|
||||
image_embeds = torch.load("image_embeds.ipadpt")
|
||||
images = pipeline(
|
||||
prompt="a polar bear sitting in a chair drinking a milkshake",
|
||||
ip_adapter_image_embeds=image_embeds,
|
||||
negative_prompt="deformed, ugly, wrong proportion, low res, bad anatomy, worst quality, low quality",
|
||||
num_inference_steps=100,
|
||||
generator=generator,
|
||||
).images
|
||||
```
|
||||
|
||||
### IP-Adapter masking
|
||||
|
||||
Binary masks specify which portion of the output image should be assigned to an IP-Adapter. This is useful for composing more than one IP-Adapter image. For each input IP-Adapter image, you must provide a binary mask an an IP-Adapter.
|
||||
|
||||
To start, preprocess the input IP-Adapter images with the [`~image_processor.IPAdapterMaskProcessor.preprocess()`] to generate their masks. For optimal results, provide the output height and width to [`~image_processor.IPAdapterMaskProcessor.preprocess()`]. This ensures masks with different aspect ratios are appropriately stretched. If the input masks already match the aspect ratio of the generated image, you don't have to set the `height` and `width`.
|
||||
|
||||
```py
|
||||
from diffusers.image_processor import IPAdapterMaskProcessor
|
||||
|
||||
mask1 = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_mask_mask1.png")
|
||||
mask2 = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_mask_mask2.png")
|
||||
|
||||
output_height = 1024
|
||||
output_width = 1024
|
||||
|
||||
processor = IPAdapterMaskProcessor()
|
||||
masks = processor.preprocess([mask1, mask2], height=output_height, width=output_width)
|
||||
```
|
||||
|
||||
<div class="flex flex-row gap-4">
|
||||
<div class="flex-1">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/ip_mask_mask1.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">mask one</figcaption>
|
||||
</div>
|
||||
<div class="flex-1">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/ip_mask_mask2.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">mask two</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
When there is more than one input IP-Adapter image, load them as a list to ensure each image is assigned to a different IP-Adapter. Each of the input IP-Adapter images here correspond to the masks generated above.
|
||||
|
||||
```py
|
||||
face_image1 = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_mask_girl1.png")
|
||||
face_image2 = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_mask_girl2.png")
|
||||
|
||||
ip_images = [[face_image1], [face_image2]]
|
||||
```
|
||||
|
||||
<div class="flex flex-row gap-4">
|
||||
<div class="flex-1">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/ip_mask_girl1.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">IP-Adapter image one</figcaption>
|
||||
</div>
|
||||
<div class="flex-1">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/ip_mask_girl2.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">IP-Adapter image two</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
Now pass the preprocessed masks to `cross_attention_kwargs` in the pipeline call.
|
||||
|
||||
```py
|
||||
pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="sdxl_models", weight_name=["ip-adapter-plus-face_sdxl_vit-h.safetensors"] * 2)
|
||||
pipeline.set_ip_adapter_scale([0.7] * 2)
|
||||
generator = torch.Generator(device="cpu").manual_seed(0)
|
||||
num_images = 1
|
||||
|
||||
image = pipeline(
|
||||
prompt="2 girls",
|
||||
ip_adapter_image=ip_images,
|
||||
negative_prompt="monochrome, lowres, bad anatomy, worst quality, low quality",
|
||||
num_inference_steps=20,
|
||||
num_images_per_prompt=num_images,
|
||||
generator=generator,
|
||||
cross_attention_kwargs={"ip_adapter_masks": masks}
|
||||
).images[0]
|
||||
image
|
||||
```
|
||||
|
||||
<div class="flex flex-row gap-4">
|
||||
<div class="flex-1">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_adapter_attention_mask_result_seed_0.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">IP-Adapter masking applied</figcaption>
|
||||
</div>
|
||||
<div class="flex-1">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_adapter_no_attention_mask_result_seed_0.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">no IP-Adapter masking applied</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
## Specific use cases
|
||||
|
||||
IP-Adapter's image prompting and compatibility with other adapters and models makes it a versatile tool for a variety of use cases. This section covers some of the more popular applications of IP-Adapter, and we can't wait to see what you come up with!
|
||||
|
||||
### Face model
|
||||
|
||||
Generating accurate faces is challenging because they are complex and nuanced. Diffusers supports two IP-Adapter checkpoints specifically trained to generate faces:
|
||||
|
||||
* [ip-adapter-full-face_sd15.safetensors](https://huggingface.co/h94/IP-Adapter/blob/main/models/ip-adapter-full-face_sd15.safetensors) is conditioned with images of cropped faces and removed backgrounds
|
||||
* [ip-adapter-plus-face_sd15.safetensors](https://huggingface.co/h94/IP-Adapter/blob/main/models/ip-adapter-plus-face_sd15.safetensors) uses patch embeddings and is conditioned with images of cropped faces
|
||||
|
||||
> [!TIP]
|
||||
>
|
||||
> [IP-Adapter-FaceID](https://huggingface.co/h94/IP-Adapter-FaceID) is a face-specific IP-Adapter trained with face ID embeddings instead of CLIP image embeddings, allowing you to generate more consistent faces in different contexts and styles. Try out this popular [community pipeline](https://github.com/huggingface/diffusers/tree/main/examples/community#ip-adapter-face-id) and see how it compares to the other face IP-Adapters.
|
||||
|
||||
For face models, use the [h94/IP-Adapter](https://huggingface.co/h94/IP-Adapter) checkpoint. It is also recommended to use [`DDIMScheduler`] or [`EulerDiscreteScheduler`] for face models.
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import StableDiffusionPipeline, DDIMScheduler
|
||||
from diffusers.utils import load_image
|
||||
|
||||
pipeline = StableDiffusionPipeline.from_pretrained(
|
||||
"runwayml/stable-diffusion-v1-5",
|
||||
torch_dtype=torch.float16,
|
||||
).to("cuda")
|
||||
pipeline.scheduler = DDIMScheduler.from_config(pipeline.scheduler.config)
|
||||
pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter-full-face_sd15.bin")
|
||||
|
||||
pipeline.set_ip_adapter_scale(0.5)
|
||||
|
||||
image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_adapter_einstein_base.png")
|
||||
generator = torch.Generator(device="cpu").manual_seed(26)
|
||||
|
||||
image = pipeline(
|
||||
prompt="A photo of Einstein as a chef, wearing an apron, cooking in a French restaurant",
|
||||
ip_adapter_image=image,
|
||||
negative_prompt="lowres, bad anatomy, worst quality, low quality",
|
||||
num_inference_steps=100,
|
||||
generator=generator,
|
||||
).images[0]
|
||||
image
|
||||
```
|
||||
|
||||
<div class="flex flex-row gap-4">
|
||||
<div class="flex-1">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_adapter_einstein_base.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">IP-Adapter image</figcaption>
|
||||
</div>
|
||||
<div class="flex-1">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_adapter_einstein.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">generated image</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
### Multi IP-Adapter
|
||||
|
||||
More than one IP-Adapter can be used at the same time to generate specific images in more diverse styles. For example, you can use IP-Adapter-Face to generate consistent faces and characters, and IP-Adapter Plus to generate those faces in a specific style.
|
||||
|
||||
> [!TIP]
|
||||
> Read the [IP-Adapter Plus](../using-diffusers/loading_adapters#ip-adapter-plus) section to learn why you need to manually load the image encoder.
|
||||
|
||||
Load the image encoder with [`~transformers.CLIPVisionModelWithProjection`].
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import AutoPipelineForText2Image, DDIMScheduler
|
||||
from transformers import CLIPVisionModelWithProjection
|
||||
from diffusers.utils import load_image
|
||||
|
||||
image_encoder = CLIPVisionModelWithProjection.from_pretrained(
|
||||
"h94/IP-Adapter",
|
||||
subfolder="models/image_encoder",
|
||||
torch_dtype=torch.float16,
|
||||
)
|
||||
```
|
||||
|
||||
Next, you'll load a base model, scheduler, and the IP-Adapters. The IP-Adapters to use are passed as a list to the `weight_name` parameter:
|
||||
|
||||
* [ip-adapter-plus_sdxl_vit-h](https://huggingface.co/h94/IP-Adapter#ip-adapter-for-sdxl-10) uses patch embeddings and a ViT-H image encoder
|
||||
* [ip-adapter-plus-face_sdxl_vit-h](https://huggingface.co/h94/IP-Adapter#ip-adapter-for-sdxl-10) has the same architecture but it is conditioned with images of cropped faces
|
||||
|
||||
```py
|
||||
pipeline = AutoPipelineForText2Image.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0",
|
||||
torch_dtype=torch.float16,
|
||||
image_encoder=image_encoder,
|
||||
)
|
||||
pipeline.scheduler = DDIMScheduler.from_config(pipeline.scheduler.config)
|
||||
pipeline.load_ip_adapter(
|
||||
"h94/IP-Adapter",
|
||||
subfolder="sdxl_models",
|
||||
weight_name=["ip-adapter-plus_sdxl_vit-h.safetensors", "ip-adapter-plus-face_sdxl_vit-h.safetensors"]
|
||||
)
|
||||
pipeline.set_ip_adapter_scale([0.7, 0.3])
|
||||
pipeline.enable_model_cpu_offload()
|
||||
```
|
||||
|
||||
Load an image prompt and a folder containing images of a certain style you want to use.
|
||||
|
||||
```py
|
||||
face_image = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/women_input.png")
|
||||
style_folder = "https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/style_ziggy"
|
||||
style_images = [load_image(f"{style_folder}/img{i}.png") for i in range(10)]
|
||||
```
|
||||
|
||||
<div class="flex flex-row gap-4">
|
||||
<div class="flex-1">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/women_input.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">IP-Adapter image of face</figcaption>
|
||||
</div>
|
||||
<div class="flex-1">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/ip_style_grid.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">IP-Adapter style images</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
Pass the image prompt and style images as a list to the `ip_adapter_image` parameter, and run the pipeline!
|
||||
|
||||
```py
|
||||
generator = torch.Generator(device="cpu").manual_seed(0)
|
||||
|
||||
image = pipeline(
|
||||
prompt="wonderwoman",
|
||||
ip_adapter_image=[style_images, face_image],
|
||||
negative_prompt="monochrome, lowres, bad anatomy, worst quality, low quality",
|
||||
num_inference_steps=50, num_images_per_prompt=1,
|
||||
generator=generator,
|
||||
).images[0]
|
||||
image
|
||||
```
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/ip_multi_out.png" />
|
||||
</div>
|
||||
|
||||
### Instant generation
|
||||
|
||||
[Latent Consistency Models (LCM)](../using-diffusers/inference_with_lcm_lora) are diffusion models that can generate images in as little as 4 steps compared to other diffusion models like SDXL that typically require way more steps. This is why image generation with an LCM feels "instantaneous". IP-Adapters can be plugged into an LCM-LoRA model to instantly generate images with an image prompt.
|
||||
|
||||
The IP-Adapter weights need to be loaded first, then you can use [`~StableDiffusionPipeline.load_lora_weights`] to load the LoRA style and weight you want to apply to your image.
|
||||
|
||||
```py
|
||||
from diffusers import DiffusionPipeline, LCMScheduler
|
||||
import torch
|
||||
from diffusers.utils import load_image
|
||||
|
||||
model_id = "sd-dreambooth-library/herge-style"
|
||||
lcm_lora_id = "latent-consistency/lcm-lora-sdv1-5"
|
||||
|
||||
pipeline = DiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16)
|
||||
|
||||
pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin")
|
||||
pipeline.load_lora_weights(lcm_lora_id)
|
||||
pipeline.scheduler = LCMScheduler.from_config(pipeline.scheduler.config)
|
||||
pipeline.enable_model_cpu_offload()
|
||||
```
|
||||
|
||||
Try using with a lower IP-Adapter scale to condition image generation more on the [herge_style](https://huggingface.co/sd-dreambooth-library/herge-style) checkpoint, and remember to use the special token `herge_style` in your prompt to trigger and apply the style.
|
||||
|
||||
```py
|
||||
pipeline.set_ip_adapter_scale(0.4)
|
||||
|
||||
prompt = "herge_style woman in armor, best quality, high quality"
|
||||
generator = torch.Generator(device="cpu").manual_seed(0)
|
||||
|
||||
ip_adapter_image = load_image("https://user-images.githubusercontent.com/24734142/266492875-2d50d223-8475-44f0-a7c6-08b51cb53572.png")
|
||||
image = pipeline(
|
||||
prompt=prompt,
|
||||
ip_adapter_image=ip_adapter_image,
|
||||
num_inference_steps=4,
|
||||
guidance_scale=1,
|
||||
).images[0]
|
||||
image
|
||||
```
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_adapter_herge.png" />
|
||||
</div>
|
||||
|
||||
### Structural control
|
||||
|
||||
To control image generation to an even greater degree, you can combine IP-Adapter with a model like [ControlNet](../using-diffusers/controlnet). A ControlNet is also an adapter that can be inserted into a diffusion model to allow for conditioning on an additional control image. The control image can be depth maps, edge maps, pose estimations, and more.
|
||||
|
||||
Load a [`ControlNetModel`] checkpoint conditioned on depth maps, insert it into a diffusion model, and load the IP-Adapter.
|
||||
|
||||
```py
|
||||
from diffusers import StableDiffusionControlNetPipeline, ControlNetModel
|
||||
import torch
|
||||
from diffusers.utils import load_image
|
||||
|
||||
controlnet_model_path = "lllyasviel/control_v11f1p_sd15_depth"
|
||||
controlnet = ControlNetModel.from_pretrained(controlnet_model_path, torch_dtype=torch.float16)
|
||||
|
||||
pipeline = StableDiffusionControlNetPipeline.from_pretrained(
|
||||
"runwayml/stable-diffusion-v1-5", controlnet=controlnet, torch_dtype=torch.float16)
|
||||
pipeline.to("cuda")
|
||||
pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin")
|
||||
```
|
||||
|
||||
Now load the IP-Adapter image and depth map.
|
||||
|
||||
```py
|
||||
ip_adapter_image = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/statue.png")
|
||||
depth_map = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/depth.png")
|
||||
```
|
||||
|
||||
<div class="flex flex-row gap-4">
|
||||
<div class="flex-1">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/statue.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">IP-Adapter image</figcaption>
|
||||
</div>
|
||||
<div class="flex-1">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/depth.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">depth map</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
Pass the depth map and IP-Adapter image to the pipeline to generate an image.
|
||||
|
||||
```py
|
||||
generator = torch.Generator(device="cpu").manual_seed(33)
|
||||
image = pipeline(
|
||||
prompt="best quality, high quality",
|
||||
image=depth_map,
|
||||
ip_adapter_image=ip_adapter_image,
|
||||
negative_prompt="monochrome, lowres, bad anatomy, worst quality, low quality",
|
||||
num_inference_steps=50,
|
||||
generator=generator,
|
||||
).images[0]
|
||||
image
|
||||
```
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/ipa-controlnet-out.png" />
|
||||
</div>
|
||||
@@ -60,23 +60,6 @@ repo_id = "runwayml/stable-diffusion-v1-5"
|
||||
pipe = StableDiffusionImg2ImgPipeline.from_pretrained(repo_id)
|
||||
```
|
||||
|
||||
You can use the Space below to gauge the memory requirements of a pipeline you want to load beforehand without downloading the pipeline checkpoints:
|
||||
|
||||
<div class="block dark:hidden">
|
||||
<iframe
|
||||
src="https://diffusers-compute-pipeline-size.hf.space?__theme=light"
|
||||
width="850"
|
||||
height="1600"
|
||||
></iframe>
|
||||
</div>
|
||||
<div class="hidden dark:block">
|
||||
<iframe
|
||||
src="https://diffusers-compute-pipeline-size.hf.space?__theme=dark"
|
||||
width="850"
|
||||
height="1600"
|
||||
></iframe>
|
||||
</div>
|
||||
|
||||
### Local pipeline
|
||||
|
||||
To load a diffusion pipeline locally, use [`git-lfs`](https://git-lfs.github.com/) to manually download the checkpoint (in this case, [`runwayml/stable-diffusion-v1-5`](https://huggingface.co/runwayml/stable-diffusion-v1-5)) to your local disk. This creates a local folder, `./stable-diffusion-v1-5`, on your disk:
|
||||
|
||||
@@ -103,7 +103,7 @@ image
|
||||
|
||||
<Tip>
|
||||
|
||||
LoRA is a very general training technique that can be used with other training methods. For example, it is common to train a model with DreamBooth and LoRA. It is also increasingly common to load and merge multiple LoRAs to create new and unique images. You can learn more about it in the in-depth [Merge LoRAs](merge_loras) guide since merging is outside the scope of this loading guide.
|
||||
LoRA is a very general training technique that can be used with other training methods. For example, it is common to train a model with DreamBooth and LoRA.
|
||||
|
||||
</Tip>
|
||||
|
||||
@@ -165,14 +165,101 @@ To unload the LoRA weights, use the [`~loaders.LoraLoaderMixin.unload_lora_weigh
|
||||
pipeline.unload_lora_weights()
|
||||
```
|
||||
|
||||
### Load multiple LoRAs
|
||||
|
||||
It can be fun to use multiple LoRAs together to create something entirely new and unique. The [`~loaders.LoraLoaderMixin.fuse_lora`] method allows you to fuse the LoRA weights with the original weights of the underlying model.
|
||||
|
||||
<Tip>
|
||||
|
||||
Fusing the weights can lead to a speedup in inference latency because you don't need to separately load the base model and LoRA! You can save your fused pipeline with [`~DiffusionPipeline.save_pretrained`] to avoid loading and fusing the weights every time you want to use the model.
|
||||
|
||||
</Tip>
|
||||
|
||||
Load an initial model:
|
||||
|
||||
```py
|
||||
from diffusers import StableDiffusionXLPipeline, AutoencoderKL
|
||||
import torch
|
||||
|
||||
vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16)
|
||||
pipeline = StableDiffusionXLPipeline.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0",
|
||||
vae=vae,
|
||||
torch_dtype=torch.float16,
|
||||
).to("cuda")
|
||||
```
|
||||
|
||||
Next, load the LoRA checkpoint and fuse it with the original weights. The `lora_scale` parameter controls how much to scale the output by with the LoRA weights. It is important to make the `lora_scale` adjustments in the [`~loaders.LoraLoaderMixin.fuse_lora`] method because it won't work if you try to pass `scale` to the `cross_attention_kwargs` in the pipeline.
|
||||
|
||||
If you need to reset the original model weights for any reason (use a different `lora_scale`), you should use the [`~loaders.LoraLoaderMixin.unfuse_lora`] method.
|
||||
|
||||
```py
|
||||
pipeline.load_lora_weights("ostris/ikea-instructions-lora-sdxl")
|
||||
pipeline.fuse_lora(lora_scale=0.7)
|
||||
|
||||
# to unfuse the LoRA weights
|
||||
pipeline.unfuse_lora()
|
||||
```
|
||||
|
||||
Then fuse this pipeline with the next set of LoRA weights:
|
||||
|
||||
```py
|
||||
pipeline.load_lora_weights("ostris/super-cereal-sdxl-lora")
|
||||
pipeline.fuse_lora(lora_scale=0.7)
|
||||
```
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
You can't unfuse multiple LoRA checkpoints, so if you need to reset the model to its original weights, you'll need to reload it.
|
||||
|
||||
</Tip>
|
||||
|
||||
Now you can generate an image that uses the weights from both LoRAs:
|
||||
|
||||
```py
|
||||
prompt = "A cute brown bear eating a slice of pizza, stunning color scheme, masterpiece, illustration"
|
||||
image = pipeline(prompt).images[0]
|
||||
image
|
||||
```
|
||||
|
||||
### 🤗 PEFT
|
||||
|
||||
<Tip>
|
||||
|
||||
Read the [Inference with 🤗 PEFT](../tutorials/using_peft_for_inference) tutorial to learn more about its integration with 🤗 Diffusers and how you can easily work with and juggle multiple adapters. You'll need to install 🤗 Diffusers and PEFT from source to run the example in this section.
|
||||
|
||||
</Tip>
|
||||
|
||||
Another way you can load and use multiple LoRAs is to specify the `adapter_name` parameter in [`~loaders.LoraLoaderMixin.load_lora_weights`]. This method takes advantage of the 🤗 PEFT integration. For example, load and name both LoRA weights:
|
||||
|
||||
```py
|
||||
from diffusers import DiffusionPipeline
|
||||
import torch
|
||||
|
||||
pipeline = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16).to("cuda")
|
||||
pipeline.load_lora_weights("ostris/ikea-instructions-lora-sdxl", weight_name="ikea_instructions_xl_v1_5.safetensors", adapter_name="ikea")
|
||||
pipeline.load_lora_weights("ostris/super-cereal-sdxl-lora", weight_name="cereal_box_sdxl_v1.safetensors", adapter_name="cereal")
|
||||
```
|
||||
|
||||
Now use the [`~loaders.UNet2DConditionLoadersMixin.set_adapters`] to activate both LoRAs, and you can configure how much weight each LoRA should have on the output:
|
||||
|
||||
```py
|
||||
pipeline.set_adapters(["ikea", "cereal"], adapter_weights=[0.7, 0.5])
|
||||
```
|
||||
|
||||
Then, generate an image:
|
||||
|
||||
```py
|
||||
prompt = "A cute brown bear eating a slice of pizza, stunning color scheme, masterpiece, illustration"
|
||||
image = pipeline(prompt, num_inference_steps=30, cross_attention_kwargs={"scale": 1.0}).images[0]
|
||||
image
|
||||
```
|
||||
|
||||
### Kohya and TheLastBen
|
||||
|
||||
Other popular LoRA trainers from the community include those by [Kohya](https://github.com/kohya-ss/sd-scripts/) and [TheLastBen](https://github.com/TheLastBen/fast-stable-diffusion). These trainers create different LoRA checkpoints than those trained by 🤗 Diffusers, but they can still be loaded in the same way.
|
||||
|
||||
<hfoptions id="other-trainers">
|
||||
<hfoption id="Kohya">
|
||||
|
||||
To load a Kohya LoRA, let's download the [Blueprintify SD XL 1.0](https://civitai.com/models/150986/blueprintify-sd-xl-10) checkpoint from [Civitai](https://civitai.com/) as an example:
|
||||
Let's download the [Blueprintify SD XL 1.0](https://civitai.com/models/150986/blueprintify-sd-xl-10) checkpoint from [Civitai](https://civitai.com/):
|
||||
|
||||
```sh
|
||||
!wget https://civitai.com/api/download/models/168776 -O blueprintify-sd-xl-10.safetensors
|
||||
@@ -206,9 +293,6 @@ Some limitations of using Kohya LoRAs with 🤗 Diffusers include:
|
||||
|
||||
</Tip>
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="TheLastBen">
|
||||
|
||||
Loading a checkpoint from TheLastBen is very similar. For example, to load the [TheLastBen/William_Eggleston_Style_SDXL](https://huggingface.co/TheLastBen/William_Eggleston_Style_SDXL) checkpoint:
|
||||
|
||||
```py
|
||||
@@ -224,74 +308,443 @@ image = pipeline(prompt=prompt).images[0]
|
||||
image
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
## IP-Adapter
|
||||
|
||||
## IP-Adapter
|
||||
[IP-Adapter](https://ip-adapter.github.io/) is an effective and lightweight adapter that adds image prompting capabilities to a diffusion model. This adapter works by decoupling the cross-attention layers of the image and text features. All the other model components are frozen and only the embedded image features in the UNet are trained. As a result, IP-Adapter files are typically only ~100MBs.
|
||||
|
||||
[IP-Adapter](https://ip-adapter.github.io/) is a lightweight adapter that enables image prompting for any diffusion model. This adapter works by decoupling the cross-attention layers of the image and text features. All the other model components are frozen and only the embedded image features in the UNet are trained. As a result, IP-Adapter files are typically only ~100MBs.
|
||||
IP-Adapter works with most of our pipelines, including Stable Diffusion, Stable Diffusion XL (SDXL), ControlNet, T2I-Adapter, AnimateDiff. And you can use any custom models finetuned from the same base models. It also works with LCM-Lora out of box.
|
||||
|
||||
You can learn more about how to use IP-Adapter for different tasks and specific use cases in the [IP-Adapter](../using-diffusers/ip_adapter) guide.
|
||||
|
||||
> [!TIP]
|
||||
> Diffusers currently only supports IP-Adapter for some of the most popular pipelines. Feel free to open a feature request if you have a cool use case and want to integrate IP-Adapter with an unsupported pipeline!
|
||||
> Official IP-Adapter checkpoints are available from [h94/IP-Adapter](https://huggingface.co/h94/IP-Adapter).
|
||||
<Tip>
|
||||
|
||||
To start, load a Stable Diffusion checkpoint.
|
||||
You can find official IP-Adapter checkpoints in [h94/IP-Adapter](https://huggingface.co/h94/IP-Adapter).
|
||||
|
||||
IP-Adapter was contributed by [okotaku](https://github.com/okotaku).
|
||||
|
||||
</Tip>
|
||||
|
||||
Let's first create a Stable Diffusion Pipeline.
|
||||
|
||||
```py
|
||||
from diffusers import AutoPipelineForText2Image
|
||||
import torch
|
||||
from diffusers.utils import load_image
|
||||
|
||||
|
||||
pipeline = AutoPipelineForText2Image.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16).to("cuda")
|
||||
```
|
||||
|
||||
Then load the IP-Adapter weights and add it to the pipeline with the [`~loaders.IPAdapterMixin.load_ip_adapter`] method.
|
||||
Now load the [h94/IP-Adapter](https://huggingface.co/h94/IP-Adapter) weights with the [`~loaders.IPAdapterMixin.load_ip_adapter`] method.
|
||||
|
||||
```py
|
||||
pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin")
|
||||
```
|
||||
|
||||
Once loaded, you can use the pipeline with an image and text prompt to guide the image generation process.
|
||||
<Tip>
|
||||
IP-Adapter relies on an image encoder to generate the image features, if your IP-Adapter weights folder contains a "image_encoder" subfolder, the image encoder will be automatically loaded and registered to the pipeline. Otherwise you can so load a [`~transformers.CLIPVisionModelWithProjection`] model and pass it to a Stable Diffusion pipeline when you create it.
|
||||
|
||||
```py
|
||||
from diffusers import AutoPipelineForText2Image
|
||||
from transformers import CLIPVisionModelWithProjection
|
||||
import torch
|
||||
|
||||
image_encoder = CLIPVisionModelWithProjection.from_pretrained(
|
||||
"h94/IP-Adapter",
|
||||
subfolder="models/image_encoder",
|
||||
torch_dtype=torch.float16,
|
||||
).to("cuda")
|
||||
|
||||
pipeline = AutoPipelineForText2Image.from_pretrained("runwayml/stable-diffusion-v1-5", image_encoder=image_encoder, torch_dtype=torch.float16).to("cuda")
|
||||
```
|
||||
</Tip>
|
||||
|
||||
IP-Adapter allows you to use both image and text to condition the image generation process. For example, let's use the bear image from the [Textual Inversion](#textual-inversion) section as the image prompt (`ip_adapter_image`) along with a text prompt to add "sunglasses". 😎
|
||||
|
||||
```py
|
||||
pipeline.set_ip_adapter_scale(0.6)
|
||||
image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/load_neg_embed.png")
|
||||
generator = torch.Generator(device="cpu").manual_seed(33)
|
||||
images = pipeline(
|
||||
prompt='best quality, high quality, wearing sunglasses',
|
||||
prompt='best quality, high quality, wearing sunglasses',
|
||||
ip_adapter_image=image,
|
||||
negative_prompt="monochrome, lowres, bad anatomy, worst quality, low quality",
|
||||
negative_prompt="monochrome, lowres, bad anatomy, worst quality, low quality",
|
||||
num_inference_steps=50,
|
||||
generator=generator,
|
||||
).images[0]
|
||||
images
|
||||
).images
|
||||
images[0]
|
||||
```
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/ip-bear.png" />
|
||||
</div>
|
||||
|
||||
### IP-Adapter Plus
|
||||
<Tip>
|
||||
|
||||
IP-Adapter relies on an image encoder to generate image features. If the IP-Adapter repository contains an `image_encoder` subfolder, the image encoder is automatically loaded and registered to the pipeline. Otherwise, you'll need to explicitly load the image encoder with a [`~transformers.CLIPVisionModelWithProjection`] model and pass it to the pipeline.
|
||||
You can use the [`~loaders.IPAdapterMixin.set_ip_adapter_scale`] method to adjust the text prompt and image prompt condition ratio. If you're only using the image prompt, you should set the scale to `1.0`. You can lower the scale to get more generation diversity, but it'll be less aligned with the prompt.
|
||||
`scale=0.5` can achieve good results in most cases when you use both text and image prompts.
|
||||
</Tip>
|
||||
|
||||
This is the case for *IP-Adapter Plus* checkpoints which use the ViT-H image encoder.
|
||||
IP-Adapter also works great with Image-to-Image and Inpainting pipelines. See below examples of how you can use it with Image-to-Image and Inpaint.
|
||||
|
||||
<hfoptions id="tasks">
|
||||
<hfoption id="image-to-image">
|
||||
|
||||
```py
|
||||
from diffusers import AutoPipelineForImage2Image
|
||||
import torch
|
||||
from diffusers.utils import load_image
|
||||
|
||||
pipeline = AutoPipelineForImage2Image.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16).to("cuda")
|
||||
|
||||
image = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/vermeer.jpg")
|
||||
ip_image = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/river.png")
|
||||
|
||||
pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin")
|
||||
generator = torch.Generator(device="cpu").manual_seed(33)
|
||||
images = pipeline(
|
||||
prompt='best quality, high quality',
|
||||
image = image,
|
||||
ip_adapter_image=ip_image,
|
||||
num_inference_steps=50,
|
||||
generator=generator,
|
||||
strength=0.6,
|
||||
).images
|
||||
images[0]
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="inpaint">
|
||||
|
||||
```py
|
||||
from diffusers import AutoPipelineForInpaint
|
||||
import torch
|
||||
from diffusers.utils import load_image
|
||||
|
||||
pipeline = AutoPipelineForInpaint.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float).to("cuda")
|
||||
|
||||
image = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/inpaint_image.png")
|
||||
mask = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/mask.png")
|
||||
ip_image = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/girl.png")
|
||||
|
||||
image = image.resize((512, 768))
|
||||
mask = mask.resize((512, 768))
|
||||
|
||||
pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin")
|
||||
|
||||
generator = torch.Generator(device="cpu").manual_seed(33)
|
||||
images = pipeline(
|
||||
prompt='best quality, high quality',
|
||||
image = image,
|
||||
mask_image = mask,
|
||||
ip_adapter_image=ip_image,
|
||||
negative_prompt="monochrome, lowres, bad anatomy, worst quality, low quality",
|
||||
num_inference_steps=50,
|
||||
generator=generator,
|
||||
strength=0.5,
|
||||
).images
|
||||
images[0]
|
||||
```
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
|
||||
IP-Adapters can also be used with [SDXL](../api/pipelines/stable_diffusion/stable_diffusion_xl.md)
|
||||
|
||||
```python
|
||||
from diffusers import AutoPipelineForText2Image
|
||||
from diffusers.utils import load_image
|
||||
import torch
|
||||
|
||||
pipeline = AutoPipelineForText2Image.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0",
|
||||
torch_dtype=torch.float16
|
||||
).to("cuda")
|
||||
|
||||
image = load_image("https://huggingface.co/datasets/sayakpaul/sample-datasets/resolve/main/watercolor_painting.jpeg")
|
||||
|
||||
pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="sdxl_models", weight_name="ip-adapter_sdxl.bin")
|
||||
|
||||
generator = torch.Generator(device="cpu").manual_seed(33)
|
||||
image = pipeline(
|
||||
prompt="best quality, high quality",
|
||||
ip_adapter_image=image,
|
||||
negative_prompt="monochrome, lowres, bad anatomy, worst quality, low quality",
|
||||
num_inference_steps=25,
|
||||
generator=generator,
|
||||
).images[0]
|
||||
image.save("sdxl_t2i.png")
|
||||
```
|
||||
|
||||
<div class="flex flex-row gap-4">
|
||||
<div class="flex-1">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/sayakpaul/sample-datasets/resolve/main/watercolor_painting.jpeg"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">input image</figcaption>
|
||||
</div>
|
||||
<div class="flex-1">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/sayakpaul/sample-datasets/resolve/main/sdxl_t2i.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">adapted image</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
You can use the IP-Adapter face model to apply specific faces to your images. It is an effective way to maintain consistent characters in your image generations.
|
||||
Weights are loaded with the same method used for the other IP-Adapters.
|
||||
|
||||
```python
|
||||
# Load ip-adapter-full-face_sd15.bin
|
||||
pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter-full-face_sd15.bin")
|
||||
```
|
||||
|
||||
<Tip>
|
||||
|
||||
It is recommended to use `DDIMScheduler` and `EulerDiscreteScheduler` for face model.
|
||||
|
||||
|
||||
</Tip>
|
||||
|
||||
```python
|
||||
import torch
|
||||
from diffusers import StableDiffusionPipeline, DDIMScheduler
|
||||
from diffusers.utils import load_image
|
||||
|
||||
pipeline = StableDiffusionPipeline.from_pretrained(
|
||||
"runwayml/stable-diffusion-v1-5",
|
||||
torch_dtype=torch.float16,
|
||||
).to("cuda")
|
||||
pipeline.scheduler = DDIMScheduler.from_config(pipeline.scheduler.config)
|
||||
pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter-full-face_sd15.bin")
|
||||
|
||||
pipeline.set_ip_adapter_scale(0.7)
|
||||
|
||||
image = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/ai_face2.png")
|
||||
|
||||
generator = torch.Generator(device="cpu").manual_seed(33)
|
||||
|
||||
image = pipeline(
|
||||
prompt="A photo of a girl wearing a black dress, holding red roses in hand, upper body, behind is the Eiffel Tower",
|
||||
ip_adapter_image=image,
|
||||
negative_prompt="monochrome, lowres, bad anatomy, worst quality, low quality",
|
||||
num_inference_steps=50, num_images_per_prompt=1, width=512, height=704,
|
||||
generator=generator,
|
||||
).images[0]
|
||||
```
|
||||
|
||||
<div class="flex flex-row gap-4">
|
||||
<div class="flex-1">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/ai_face2.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">input image</figcaption>
|
||||
</div>
|
||||
<div class="flex-1">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ipadapter_full_face_output.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">output image</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
||||
You can load multiple IP-Adapter models and use multiple reference images at the same time. In this example we use IP-Adapter-Plus face model to create a consistent character and also use IP-Adapter-Plus model along with 10 images to create a coherent style in the image we generate.
|
||||
|
||||
```python
|
||||
import torch
|
||||
from diffusers import AutoPipelineForText2Image, DDIMScheduler
|
||||
from transformers import CLIPVisionModelWithProjection
|
||||
from diffusers.utils import load_image
|
||||
|
||||
image_encoder = CLIPVisionModelWithProjection.from_pretrained(
|
||||
"h94/IP-Adapter",
|
||||
"h94/IP-Adapter",
|
||||
subfolder="models/image_encoder",
|
||||
torch_dtype=torch.float16
|
||||
torch_dtype=torch.float16,
|
||||
)
|
||||
|
||||
pipeline = AutoPipelineForText2Image.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0",
|
||||
torch_dtype=torch.float16,
|
||||
image_encoder=image_encoder,
|
||||
torch_dtype=torch.float16
|
||||
).to("cuda")
|
||||
)
|
||||
pipeline.scheduler = DDIMScheduler.from_config(pipeline.scheduler.config)
|
||||
pipeline.load_ip_adapter(
|
||||
"h94/IP-Adapter",
|
||||
subfolder="sdxl_models",
|
||||
weight_name=["ip-adapter-plus_sdxl_vit-h.safetensors", "ip-adapter-plus-face_sdxl_vit-h.safetensors"]
|
||||
)
|
||||
pipeline.set_ip_adapter_scale([0.7, 0.3])
|
||||
pipeline.enable_model_cpu_offload()
|
||||
|
||||
pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="sdxl_models", weight_name="ip-adapter-plus_sdxl_vit-h.safetensors")
|
||||
face_image = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/women_input.png")
|
||||
style_folder = "https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/style_ziggy"
|
||||
style_images = [load_image(f"{style_folder}/img{i}.png") for i in range(10)]
|
||||
|
||||
generator = torch.Generator(device="cpu").manual_seed(0)
|
||||
|
||||
image = pipeline(
|
||||
prompt="wonderwoman",
|
||||
ip_adapter_image=[style_images, face_image],
|
||||
negative_prompt="monochrome, lowres, bad anatomy, worst quality, low quality",
|
||||
num_inference_steps=50, num_images_per_prompt=1,
|
||||
generator=generator,
|
||||
).images[0]
|
||||
```
|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/ip_style_grid.png" />
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">style input image</figcaption>
|
||||
</div>
|
||||
|
||||
<div class="flex flex-row gap-4">
|
||||
<div class="flex-1">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/women_input.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">face input image</figcaption>
|
||||
</div>
|
||||
<div class="flex-1">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/ip_multi_out.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">output image</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
||||
### LCM-Lora
|
||||
|
||||
You can use IP-Adapter with LCM-Lora to achieve "instant fine-tune" with custom images. Note that you need to load IP-Adapter weights before loading the LCM-Lora weights.
|
||||
|
||||
```py
|
||||
from diffusers import DiffusionPipeline, LCMScheduler
|
||||
import torch
|
||||
from diffusers.utils import load_image
|
||||
|
||||
model_id = "sd-dreambooth-library/herge-style"
|
||||
lcm_lora_id = "latent-consistency/lcm-lora-sdv1-5"
|
||||
|
||||
pipe = DiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16)
|
||||
|
||||
pipe.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin")
|
||||
pipe.load_lora_weights(lcm_lora_id)
|
||||
pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)
|
||||
pipe.enable_model_cpu_offload()
|
||||
|
||||
prompt = "best quality, high quality"
|
||||
image = load_image("https://user-images.githubusercontent.com/24734142/266492875-2d50d223-8475-44f0-a7c6-08b51cb53572.png")
|
||||
images = pipe(
|
||||
prompt=prompt,
|
||||
ip_adapter_image=image,
|
||||
num_inference_steps=4,
|
||||
guidance_scale=1,
|
||||
).images[0]
|
||||
```
|
||||
|
||||
### Other pipelines
|
||||
|
||||
IP-Adapter is compatible with any pipeline that (1) uses a text prompt and (2) uses Stable Diffusion or Stable Diffusion XL checkpoint. To use IP-Adapter with a different pipeline, all you need to do is to run `load_ip_adapter()` method after you create the pipeline, and then pass your image to the pipeline as `ip_adapter_image`
|
||||
|
||||
<Tip>
|
||||
|
||||
🤗 Diffusers currently only supports using IP-Adapter with some of the most popular pipelines, feel free to open a [feature request](https://github.com/huggingface/diffusers/issues/new/choose) if you have a cool use-case and require integrating IP-adapters with a pipeline that does not support it yet!
|
||||
|
||||
</Tip>
|
||||
|
||||
You can find below examples on how to use IP-Adapter with ControlNet and AnimateDiff.
|
||||
|
||||
<hfoptions id="model">
|
||||
<hfoption id="ControlNet">
|
||||
|
||||
```
|
||||
from diffusers import StableDiffusionControlNetPipeline, ControlNetModel
|
||||
import torch
|
||||
from diffusers.utils import load_image
|
||||
|
||||
controlnet_model_path = "lllyasviel/control_v11f1p_sd15_depth"
|
||||
controlnet = ControlNetModel.from_pretrained(controlnet_model_path, torch_dtype=torch.float16)
|
||||
|
||||
pipeline = StableDiffusionControlNetPipeline.from_pretrained(
|
||||
"runwayml/stable-diffusion-v1-5", controlnet=controlnet, torch_dtype=torch.float16)
|
||||
pipeline.to("cuda")
|
||||
|
||||
image = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/statue.png")
|
||||
depth_map = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/depth.png")
|
||||
|
||||
pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin")
|
||||
|
||||
generator = torch.Generator(device="cpu").manual_seed(33)
|
||||
images = pipeline(
|
||||
prompt='best quality, high quality',
|
||||
image=depth_map,
|
||||
ip_adapter_image=image,
|
||||
negative_prompt="monochrome, lowres, bad anatomy, worst quality, low quality",
|
||||
num_inference_steps=50,
|
||||
generator=generator,
|
||||
).images
|
||||
images[0]
|
||||
```
|
||||
<div class="flex flex-row gap-4">
|
||||
<div class="flex-1">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/statue.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">input image</figcaption>
|
||||
</div>
|
||||
<div class="flex-1">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/ipa-controlnet-out.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">adapted image</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="AnimateDiff">
|
||||
|
||||
```py
|
||||
# animate diff + ip adapter
|
||||
import torch
|
||||
from diffusers import MotionAdapter, AnimateDiffPipeline, DDIMScheduler
|
||||
from diffusers.utils import export_to_gif, load_image
|
||||
|
||||
# Load the motion adapter
|
||||
adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2", torch_dtype=torch.float16)
|
||||
# load SD 1.5 based finetuned model
|
||||
model_id = "Lykon/DreamShaper"
|
||||
pipe = AnimateDiffPipeline.from_pretrained(model_id, motion_adapter=adapter, torch_dtype=torch.float16)
|
||||
|
||||
# scheduler
|
||||
scheduler = DDIMScheduler(
|
||||
clip_sample=False,
|
||||
beta_start=0.00085,
|
||||
beta_end=0.012,
|
||||
beta_schedule="linear",
|
||||
timestep_spacing="trailing",
|
||||
steps_offset=1
|
||||
)
|
||||
pipe.scheduler = scheduler
|
||||
|
||||
# enable memory savings
|
||||
pipe.enable_vae_slicing()
|
||||
pipe.enable_model_cpu_offload()
|
||||
|
||||
# load ip_adapter
|
||||
pipe.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin")
|
||||
|
||||
# load motion adapters
|
||||
pipe.load_lora_weights("guoyww/animatediff-motion-lora-zoom-out", adapter_name="zoom-out")
|
||||
pipe.load_lora_weights("guoyww/animatediff-motion-lora-tilt-up", adapter_name="tilt-up")
|
||||
pipe.load_lora_weights("guoyww/animatediff-motion-lora-pan-left", adapter_name="pan-left")
|
||||
|
||||
seed = 42
|
||||
image = load_image("https://user-images.githubusercontent.com/24734142/266492875-2d50d223-8475-44f0-a7c6-08b51cb53572.png")
|
||||
images = [image] * 3
|
||||
prompts = ["best quality, high quality"] * 3
|
||||
negative_prompt = "bad quality, worst quality"
|
||||
adapter_weights = [[0.75, 0.0, 0.0], [0.0, 0.0, 0.75], [0.0, 0.75, 0.75]]
|
||||
|
||||
# generate
|
||||
output_frames = []
|
||||
for prompt, image, adapter_weight in zip(prompts, images, adapter_weights):
|
||||
pipe.set_adapters(["zoom-out", "tilt-up", "pan-left"], adapter_weights=adapter_weight)
|
||||
output = pipe(
|
||||
prompt= prompt,
|
||||
num_frames=16,
|
||||
guidance_scale=7.5,
|
||||
num_inference_steps=30,
|
||||
ip_adapter_image = image,
|
||||
generator=torch.Generator("cpu").manual_seed(seed),
|
||||
)
|
||||
frames = output.frames[0]
|
||||
output_frames.extend(frames)
|
||||
|
||||
export_to_gif(output_frames, "test_out_animation.gif")
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
|
||||
@@ -1,266 +0,0 @@
|
||||
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Merge LoRAs
|
||||
|
||||
It can be fun and creative to use multiple [LoRAs]((https://huggingface.co/docs/peft/conceptual_guides/adapter#low-rank-adaptation-lora)) together to generate something entirely new and unique. This works by merging multiple LoRA weights together to produce images that are a blend of different styles. Diffusers provides a few methods to merge LoRAs depending on *how* you want to merge their weights, which can affect image quality.
|
||||
|
||||
This guide will show you how to merge LoRAs using the [`~loaders.UNet2DConditionLoadersMixin.set_adapters`] and [`~peft.LoraModel.add_weighted_adapter`] methods. To improve inference speed and reduce memory-usage of merged LoRAs, you'll also see how to use the [`~loaders.LoraLoaderMixin.fuse_lora`] method to fuse the LoRA weights with the original weights of the underlying model.
|
||||
|
||||
For this guide, load a Stable Diffusion XL (SDXL) checkpoint and the [KappaNeuro/studio-ghibli-style]() and [Norod78/sdxl-chalkboarddrawing-lora]() LoRAs with the [`~loaders.LoraLoaderMixin.load_lora_weights`] method. You'll need to assign each LoRA an `adapter_name` to combine them later.
|
||||
|
||||
```py
|
||||
from diffusers import DiffusionPipeline
|
||||
import torch
|
||||
|
||||
pipeline = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16).to("cuda")
|
||||
pipeline.load_lora_weights("ostris/ikea-instructions-lora-sdxl", weight_name="ikea_instructions_xl_v1_5.safetensors", adapter_name="ikea")
|
||||
pipeline.load_lora_weights("lordjia/by-feng-zikai", weight_name="fengzikai_v1.0_XL.safetensors", adapter_name="feng")
|
||||
```
|
||||
|
||||
## set_adapters
|
||||
|
||||
The [`~loaders.UNet2DConditionLoadersMixin.set_adapters`] method merges LoRA adapters by concatenating their weighted matrices. Use the adapter name to specify which LoRAs to merge, and the `adapter_weights` parameter to control the scaling for each LoRA. For example, if `adapter_weights=[0.5, 0.5]`, then the merged LoRA output is an average of both LoRAs. Try adjusting the adapter weights to see how it affects the generated image!
|
||||
|
||||
```py
|
||||
pipeline.set_adapters(["ikea", "feng"], adapter_weights=[0.7, 0.8])
|
||||
|
||||
generator = torch.manual_seed(0)
|
||||
prompt = "A bowl of ramen shaped like a cute kawaii bear, by Feng Zikai"
|
||||
image = pipeline(prompt, generator=generator, cross_attention_kwargs={"scale": 1.0}).images[0]
|
||||
image
|
||||
```
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lora_merge_set_adapters.png"/>
|
||||
</div>
|
||||
|
||||
## add_weighted_adapter
|
||||
|
||||
> [!WARNING]
|
||||
> This is an experimental method that adds PEFTs [`~peft.LoraModel.add_weighted_adapter`] method to Diffusers to enable more efficient merging methods. Check out this [issue](https://github.com/huggingface/diffusers/issues/6892) if you're interested in learning more about the motivation and design behind this integration.
|
||||
|
||||
The [`~peft.LoraModel.add_weighted_adapter`] method provides access to more efficient merging method such as [TIES and DARE](https://huggingface.co/docs/peft/developer_guides/model_merging). To use these merging methods, make sure you have the latest stable version of Diffusers and PEFT installed.
|
||||
|
||||
```bash
|
||||
pip install -U diffusers peft
|
||||
```
|
||||
|
||||
There are three steps to merge LoRAs with the [`~peft.LoraModel.add_weighted_adapter`] method:
|
||||
|
||||
1. Create a [`~peft.PeftModel`] from the underlying model and LoRA checkpoint.
|
||||
2. Load a base UNet model and the LoRA adapters.
|
||||
3. Merge the adapters using the [`~peft.LoraModel.add_weighted_adapter`] method and the merging method of your choice.
|
||||
|
||||
Let's dive deeper into what these steps entail.
|
||||
|
||||
1. Load a UNet that corresponds to the UNet in the LoRA checkpoint. In this case, both LoRAs use the SDXL UNet as their base model.
|
||||
|
||||
```python
|
||||
from diffusers import UNet2DConditionModel
|
||||
import torch
|
||||
|
||||
unet = UNet2DConditionModel.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0",
|
||||
torch_dtype=torch.float16,
|
||||
use_safetensors=True,
|
||||
variant="fp16",
|
||||
subfolder="unet",
|
||||
).to("cuda")
|
||||
```
|
||||
|
||||
Load the SDXL pipeline and the LoRA checkpoints, starting with the [ostris/ikea-instructions-lora-sdxl](https://huggingface.co/ostris/ikea-instructions-lora-sdxl) LoRA.
|
||||
|
||||
```python
|
||||
from diffusers import DiffusionPipeline
|
||||
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0",
|
||||
variant="fp16",
|
||||
torch_dtype=torch.float16,
|
||||
unet=unet
|
||||
).to("cuda")
|
||||
pipeline.load_lora_weights("ostris/ikea-instructions-lora-sdxl", weight_name="ikea_instructions_xl_v1_5.safetensors", adapter_name="ikea")
|
||||
```
|
||||
|
||||
Now you'll create a [`~peft.PeftModel`] from the loaded LoRA checkpoint by combining the SDXL UNet and the LoRA UNet from the pipeline.
|
||||
|
||||
```python
|
||||
from peft import get_peft_model, LoraConfig
|
||||
import copy
|
||||
|
||||
sdxl_unet = copy.deepcopy(unet)
|
||||
ikea_peft_model = get_peft_model(
|
||||
sdxl_unet,
|
||||
pipeline.unet.peft_config["ikea"],
|
||||
adapter_name="ikea"
|
||||
)
|
||||
|
||||
original_state_dict = {f"base_model.model.{k}": v for k, v in pipeline.unet.state_dict().items()}
|
||||
ikea_peft_model.load_state_dict(original_state_dict, strict=True)
|
||||
```
|
||||
|
||||
> [!TIP]
|
||||
> You can optionally push the ikea_peft_model to the Hub by calling `ikea_peft_model.push_to_hub("ikea_peft_model", token=TOKEN)`.
|
||||
|
||||
Repeat this process to create a [`~peft.PeftModel`] from the [lordjia/by-feng-zikai](https://huggingface.co/lordjia/by-feng-zikai) LoRA.
|
||||
|
||||
```python
|
||||
pipeline.delete_adapters("ikea")
|
||||
sdxl_unet.delete_adapters("ikea")
|
||||
|
||||
pipeline.load_lora_weights("lordjia/by-feng-zikai", weight_name="fengzikai_v1.0_XL.safetensors", adapter_name="feng")
|
||||
pipeline.set_adapters(adapter_names="feng")
|
||||
|
||||
feng_peft_model = get_peft_model(
|
||||
sdxl_unet,
|
||||
pipeline.unet.peft_config["feng"],
|
||||
adapter_name="feng"
|
||||
)
|
||||
|
||||
original_state_dict = {f"base_model.model.{k}": v for k, v in pipe.unet.state_dict().items()}
|
||||
feng_peft_model.load_state_dict(original_state_dict, strict=True)
|
||||
```
|
||||
|
||||
2. Load a base UNet model and then load the adapters onto it.
|
||||
|
||||
```python
|
||||
from peft import PeftModel
|
||||
|
||||
base_unet = UNet2DConditionModel.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0",
|
||||
torch_dtype=torch.float16,
|
||||
use_safetensors=True,
|
||||
variant="fp16",
|
||||
subfolder="unet",
|
||||
).to("cuda")
|
||||
|
||||
model = PeftModel.from_pretrained(base_unet, "stevhliu/ikea_peft_model", use_safetensors=True, subfolder="ikea", adapter_name="ikea")
|
||||
model.load_adapter("stevhliu/feng_peft_model", use_safetensors=True, subfolder="feng", adapter_name="feng")
|
||||
```
|
||||
|
||||
3. Merge the adapters using the [`~peft.LoraModel.add_weighted_adapter`] method and the merging method of your choice (learn more about other merging methods in this [blog post](https://huggingface.co/blog/peft_merging)). For this example, let's use the `"dare_linear"` method to merge the LoRAs.
|
||||
|
||||
> [!WARNING]
|
||||
> Keep in mind the LoRAs need to have the same rank to be merged!
|
||||
|
||||
```python
|
||||
model.add_weighted_adapter(
|
||||
adapters=["ikea", "feng"],
|
||||
weights=[1.0, 1.0],
|
||||
combination_type="dare_linear",
|
||||
adapter_name="ikea-feng"
|
||||
)
|
||||
model.set_adapters("ikea-feng")
|
||||
```
|
||||
|
||||
Now you can generate an image with the merged LoRA.
|
||||
|
||||
```python
|
||||
model = model.to(dtype=torch.float16, device="cuda")
|
||||
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0", unet=model, variant="fp16", torch_dtype=torch.float16,
|
||||
).to("cuda")
|
||||
|
||||
image = pipeline("A bowl of ramen shaped like a cute kawaii bear, by Feng Zikai", generator=torch.manual_seed(0)).images[0]
|
||||
image
|
||||
```
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ikea-feng-dare-linear.png"/>
|
||||
</div>
|
||||
|
||||
## fuse_lora
|
||||
|
||||
Both the [`~loaders.UNet2DConditionLoadersMixin.set_adapters`] and [`~peft.LoraModel.add_weighted_adapter`] methods require loading the base model and the LoRA adapters separately which incurs some overhead. The [`~loaders.LoraLoaderMixin.fuse_lora`] method allows you to fuse the LoRA weights directly with the original weights of the underlying model. This way, you're only loading the model once which can increase inference and lower memory-usage.
|
||||
|
||||
You can use PEFT to easily fuse/unfuse multiple adapters directly into the model weights (both UNet and text encoder) using the [`~loaders.LoraLoaderMixin.fuse_lora`] method, which can lead to a speed-up in inference and lower VRAM usage.
|
||||
|
||||
For example, if you have a base model and adapters loaded and set as active with the following adapter weights:
|
||||
|
||||
```py
|
||||
from diffusers import DiffusionPipeline
|
||||
import torch
|
||||
|
||||
pipeline = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16).to("cuda")
|
||||
pipeline.load_lora_weights("ostris/ikea-instructions-lora-sdxl", weight_name="ikea_instructions_xl_v1_5.safetensors", adapter_name="ikea")
|
||||
pipeline.load_lora_weights("lordjia/by-feng-zikai", weight_name="fengzikai_v1.0_XL.safetensors", adapter_name="feng")
|
||||
|
||||
pipeline.set_adapters(["ikea", "feng"], adapter_weights=[0.7, 0.8])
|
||||
```
|
||||
|
||||
Fuse these LoRAs into the UNet with the [`~loaders.LoraLoaderMixin.fuse_lora`] method. The `lora_scale` parameter controls how much to scale the output by with the LoRA weights. It is important to make the `lora_scale` adjustments in the [`~loaders.LoraLoaderMixin.fuse_lora`] method because it won’t work if you try to pass `scale` to the `cross_attention_kwargs` in the pipeline.
|
||||
|
||||
```py
|
||||
pipeline.fuse_lora(adapter_names=["ikea", "feng"], lora_scale=1.0)
|
||||
```
|
||||
|
||||
Then you should use [`~loaders.LoraLoaderMixin.unload_lora_weights`] to unload the LoRA weights since they've already been fused with the underlying base model. Finally, call [`~DiffusionPipeline.save_pretrained`] to save the fused pipeline locally or you could call [`~DiffusionPipeline.push_to_hub`] to push the fused pipeline to the Hub.
|
||||
|
||||
```py
|
||||
pipeline.unload_lora_weights()
|
||||
# save locally
|
||||
pipeline.save_pretrained("path/to/fused-pipeline")
|
||||
# save to the Hub
|
||||
pipeline.push_to_hub("fused-ikea-feng")
|
||||
```
|
||||
|
||||
Now you can quickly load the fused pipeline and use it for inference without needing to separately load the LoRA adapters.
|
||||
|
||||
```py
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
"username/fused-ikea-feng", torch_dtype=torch.float16,
|
||||
).to("cuda")
|
||||
|
||||
image = pipeline("A bowl of ramen shaped like a cute kawaii bear, by Feng Zikai", generator=torch.manual_seed(0)).images[0]
|
||||
image
|
||||
```
|
||||
|
||||
You can call [`~loaders.LoraLoaderMixin.unfuse_lora`] to restore the original model's weights (for example, if you want to use a different `lora_scale` value). However, this only works if you've only fused one LoRA adapter to the original model. If you've fused multiple LoRAs, you'll need to reload the model.
|
||||
|
||||
```py
|
||||
pipeline.unfuse_lora()
|
||||
```
|
||||
|
||||
### torch.compile
|
||||
|
||||
[torch.compile](../optimization/torch2.0#torchcompile) can speed up your pipeline even more, but the LoRA weights must be fused first and then unloaded. Typically, the UNet is compiled because it is such a computationally intensive component of the pipeline.
|
||||
|
||||
```py
|
||||
from diffusers import DiffusionPipeline
|
||||
import torch
|
||||
|
||||
# load base model and LoRAs
|
||||
pipeline = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16).to("cuda")
|
||||
pipeline.load_lora_weights("ostris/ikea-instructions-lora-sdxl", weight_name="ikea_instructions_xl_v1_5.safetensors", adapter_name="ikea")
|
||||
pipeline.load_lora_weights("lordjia/by-feng-zikai", weight_name="fengzikai_v1.0_XL.safetensors", adapter_name="feng")
|
||||
|
||||
# activate both LoRAs and set adapter weights
|
||||
pipeline.set_adapters(["ikea", "feng"], adapter_weights=[0.7, 0.8])
|
||||
|
||||
# fuse LoRAs and unload weights
|
||||
pipeline.fuse_lora(adapter_names=["ikea", "feng"], lora_scale=1.0)
|
||||
pipeline.unload_lora_weights()
|
||||
|
||||
# torch.compile
|
||||
pipeline.unet.to(memory_format=torch.channels_last)
|
||||
pipeline.unet = torch.compile(pipeline.unet, mode="reduce-overhead", fullgraph=True)
|
||||
|
||||
image = pipeline("A bowl of ramen shaped like a cute kawaii bear, by Feng Zikai", generator=torch.manual_seed(0)).images[0]
|
||||
```
|
||||
|
||||
Learn more about torch.compile in the [Accelerate inference of text-to-image diffusion models](../tutorials/fast_diffusion#torchcompile) guide.
|
||||
|
||||
## Next steps
|
||||
|
||||
For more conceptual details about how each merging method works, take a look at the [🤗 PEFT welcomes new merging methods](https://huggingface.co/blog/peft_merging#concatenation-cat) blog post!
|
||||
@@ -63,12 +63,11 @@ from diffusers import StableDiffusionXLPipeline, StableDiffusionXLImg2ImgPipelin
|
||||
import torch
|
||||
|
||||
pipeline = StableDiffusionXLPipeline.from_single_file(
|
||||
"https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/blob/main/sd_xl_base_1.0.safetensors",
|
||||
torch_dtype=torch.float16
|
||||
"https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/blob/main/sd_xl_base_1.0.safetensors", torch_dtype=torch.float16, variant="fp16", use_safetensors=True
|
||||
).to("cuda")
|
||||
|
||||
refiner = StableDiffusionXLImg2ImgPipeline.from_single_file(
|
||||
"https://huggingface.co/stabilityai/stable-diffusion-xl-refiner-1.0/blob/main/sd_xl_refiner_1.0.safetensors", torch_dtype=torch.float16
|
||||
"https://huggingface.co/stabilityai/stable-diffusion-xl-refiner-1.0/blob/main/sd_xl_refiner_1.0.safetensors", torch_dtype=torch.float16, use_safetensors=True, variant="fp16"
|
||||
).to("cuda")
|
||||
```
|
||||
|
||||
|
||||
@@ -31,31 +31,29 @@ Before you begin, make sure you have the following libraries installed:
|
||||
Model weights may be stored in separate subfolders on the Hub or locally, in which case, you should use the [`~StableDiffusionXLPipeline.from_pretrained`] method:
|
||||
|
||||
```py
|
||||
from diffusers import AutoPipelineForText2Image
|
||||
from diffusers import AutoPipelineForText2Image, AutoPipelineForImage2Image
|
||||
import torch
|
||||
|
||||
pipeline = AutoPipelineForText2Image.from_pretrained("stabilityai/sdxl-turbo", torch_dtype=torch.float16, variant="fp16")
|
||||
pipeline = pipeline.to("cuda")
|
||||
```
|
||||
|
||||
You can also use the [`~StableDiffusionXLPipeline.from_single_file`] method to load a model checkpoint stored in a single file format (`.ckpt` or `.safetensors`) from the Hub or locally. For this loading method, you need to set `timestep_spacing="trailing"` (feel free to experiment with the other scheduler config values to get better results):
|
||||
You can also use the [`~StableDiffusionXLPipeline.from_single_file`] method to load a model checkpoint stored in a single file format (`.ckpt` or `.safetensors`) from the Hub or locally:
|
||||
|
||||
```py
|
||||
from diffusers import StableDiffusionXLPipeline, EulerAncestralDiscreteScheduler
|
||||
from diffusers import StableDiffusionXLPipeline
|
||||
import torch
|
||||
|
||||
pipeline = StableDiffusionXLPipeline.from_single_file(
|
||||
"https://huggingface.co/stabilityai/sdxl-turbo/blob/main/sd_xl_turbo_1.0_fp16.safetensors",
|
||||
torch_dtype=torch.float16, variant="fp16")
|
||||
"https://huggingface.co/stabilityai/sdxl-turbo/blob/main/sd_xl_turbo_1.0_fp16.safetensors", torch_dtype=torch.float16)
|
||||
pipeline = pipeline.to("cuda")
|
||||
pipeline.scheduler = EulerAncestralDiscreteScheduler.from_config(pipeline.scheduler.config, timestep_spacing="trailing")
|
||||
```
|
||||
|
||||
## Text-to-image
|
||||
|
||||
For text-to-image, pass a text prompt. By default, SDXL Turbo generates a 512x512 image, and that resolution gives the best results. You can try setting the `height` and `width` parameters to 768x768 or 1024x1024, but you should expect quality degradations when doing so.
|
||||
|
||||
Make sure to set `guidance_scale` to 0.0 to disable, as the model was trained without it. A single inference step is enough to generate high quality images.
|
||||
Make sure to set `guidance_scale` to 0.0 to disable, as the model was trained without it. A single inference step is enough to generate high quality images.
|
||||
Increasing the number of steps to 2, 3 or 4 should improve image quality.
|
||||
|
||||
```py
|
||||
@@ -77,7 +75,7 @@ image
|
||||
|
||||
## Image-to-image
|
||||
|
||||
For image-to-image generation, make sure that `num_inference_steps * strength` is larger or equal to 1.
|
||||
For image-to-image generation, make sure that `num_inference_steps * strength` is larger or equal to 1.
|
||||
The image-to-image pipeline will run for `int(num_inference_steps * strength)` steps, e.g. `0.5 * 2.0 = 1` step in
|
||||
our example below.
|
||||
|
||||
@@ -86,14 +84,14 @@ from diffusers import AutoPipelineForImage2Image
|
||||
from diffusers.utils import load_image, make_image_grid
|
||||
|
||||
# use from_pipe to avoid consuming additional memory when loading a checkpoint
|
||||
pipeline_image2image = AutoPipelineForImage2Image.from_pipe(pipeline_text2image).to("cuda")
|
||||
pipeline = AutoPipelineForImage2Image.from_pipe(pipeline_text2image).to("cuda")
|
||||
|
||||
init_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cat.png")
|
||||
init_image = init_image.resize((512, 512))
|
||||
|
||||
prompt = "cat wizard, gandalf, lord of the rings, detailed, fantasy, cute, adorable, Pixar, Disney, 8k"
|
||||
|
||||
image = pipeline_image2image(prompt, image=init_image, strength=0.5, guidance_scale=0.0, num_inference_steps=2).images[0]
|
||||
image = pipeline(prompt, image=init_image, strength=0.5, guidance_scale=0.0, num_inference_steps=2).images[0]
|
||||
make_image_grid([init_image, image], rows=1, cols=2)
|
||||
```
|
||||
|
||||
@@ -103,7 +101,7 @@ make_image_grid([init_image, image], rows=1, cols=2)
|
||||
|
||||
## Speed-up SDXL Turbo even more
|
||||
|
||||
- Compile the UNet if you are using PyTorch version 2.0 or higher. The first inference run will be very slow, but subsequent ones will be much faster.
|
||||
- Compile the UNet if you are using PyTorch version 2 or better. The first inference run will be very slow, but subsequent ones will be much faster.
|
||||
|
||||
```py
|
||||
pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
|
||||
|
||||
@@ -217,9 +217,3 @@ Check your image dimensions to see if they're correct:
|
||||
images.shape
|
||||
# (8, 1, 512, 512, 3)
|
||||
```
|
||||
|
||||
## Resources
|
||||
|
||||
To learn more about how JAX works with Stable Diffusion, you may be interested in reading:
|
||||
|
||||
* [Accelerating Stable Diffusion XL Inference with JAX on Cloud TPU v5e](https://hf.co/blog/sdxl_jax)
|
||||
|
||||
@@ -21,7 +21,7 @@ This guide will show you how to use SVD to generate short videos from images.
|
||||
Before you begin, make sure you have the following libraries installed:
|
||||
|
||||
```py
|
||||
!pip install -q -U diffusers transformers accelerate
|
||||
!pip install -q -U diffusers transformers accelerate
|
||||
```
|
||||
|
||||
The are two variants of this model, [SVD](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid) and [SVD-XT](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid-xt). The SVD checkpoint is trained to generate 14 frames and the SVD-XT checkpoint is further finetuned to generate 25 frames.
|
||||
@@ -86,7 +86,7 @@ Video generation is very memory intensive because you're essentially generating
|
||||
+ frames = pipe(image, decode_chunk_size=2, generator=generator, num_frames=25).frames[0]
|
||||
```
|
||||
|
||||
Using all these tricks together should lower the memory requirement to less than 8GB VRAM.
|
||||
Using all these tricks togethere should lower the memory requirement to less than 8GB VRAM.
|
||||
|
||||
## Micro-conditioning
|
||||
|
||||
|
||||
@@ -1,497 +0,0 @@
|
||||
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Text or image-to-video
|
||||
|
||||
Driven by the success of text-to-image diffusion models, generative video models are able to generate short clips of video from a text prompt or an initial image. These models extend a pretrained diffusion model to generate videos by adding some type of temporal and/or spatial convolution layer to the architecture. A mixed dataset of images and videos are used to train the model which learns to output a series of video frames based on the text or image conditioning.
|
||||
|
||||
This guide will show you how to generate videos, how to configure video model parameters, and how to control video generation.
|
||||
|
||||
## Popular models
|
||||
|
||||
> [!TIP]
|
||||
> Discover other cool and trending video generation models on the Hub [here](https://huggingface.co/models?pipeline_tag=text-to-video&sort=trending)!
|
||||
|
||||
[Stable Video Diffusions (SVD)](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid), [I2VGen-XL](https://huggingface.co/ali-vilab/i2vgen-xl/), [AnimateDiff](https://huggingface.co/guoyww/animatediff), and [ModelScopeT2V](https://huggingface.co/ali-vilab/text-to-video-ms-1.7b) are popular models used for video diffusion. Each model is distinct. For example, AnimateDiff inserts a motion modeling module into a frozen text-to-image model to generate personalized animated images, whereas SVD is entirely pretrained from scratch with a three-stage training process to generate short high-quality videos.
|
||||
|
||||
### Stable Video Diffusion
|
||||
|
||||
[SVD](../api/pipelines/svd) is based on the Stable Diffusion 2.1 model and it is trained on images, then low-resolution videos, and finally a smaller dataset of high-resolution videos. This model generates a short 2-4 second video from an initial image. You can learn more details about model, like micro-conditioning, in the [Stable Video Diffusion](../using-diffusers/svd) guide.
|
||||
|
||||
Begin by loading the [`StableVideoDiffusionPipeline`] and passing an initial image to generate a video from.
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import StableVideoDiffusionPipeline
|
||||
from diffusers.utils import load_image, export_to_video
|
||||
|
||||
pipeline = StableVideoDiffusionPipeline.from_pretrained(
|
||||
"stabilityai/stable-video-diffusion-img2vid-xt", torch_dtype=torch.float16, variant="fp16"
|
||||
)
|
||||
pipeline.enable_model_cpu_offload()
|
||||
|
||||
image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/svd/rocket.png")
|
||||
image = image.resize((1024, 576))
|
||||
|
||||
generator = torch.manual_seed(42)
|
||||
frames = pipeline(image, decode_chunk_size=8, generator=generator).frames[0]
|
||||
export_to_video(frames, "generated.mp4", fps=7)
|
||||
```
|
||||
|
||||
<div class="flex gap-4">
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/svd/rocket.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">initial image</figcaption>
|
||||
</div>
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/svd/output_rocket.gif"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">generated video</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
### I2VGen-XL
|
||||
|
||||
[I2VGen-XL](../api/pipelines/i2vgenxl) is a diffusion model that can generate higher resolution videos than SVD and it is also capable of accepting text prompts in addition to images. The model is trained with two hierarchical encoders (detail and global encoder) to better capture low and high-level details in images. These learned details are used to train a video diffusion model which refines the video resolution and details in the generated video.
|
||||
|
||||
You can use I2VGen-XL by loading the [`I2VGenXLPipeline`], and passing a text and image prompt to generate a video.
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import I2VGenXLPipeline
|
||||
from diffusers.utils import export_to_gif, load_image
|
||||
|
||||
pipeline = I2VGenXLPipeline.from_pretrained("ali-vilab/i2vgen-xl", torch_dtype=torch.float16, variant="fp16")
|
||||
pipeline.enable_model_cpu_offload()
|
||||
|
||||
image_url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/i2vgen_xl_images/img_0009.png"
|
||||
image = load_image(image_url).convert("RGB")
|
||||
|
||||
prompt = "Papers were floating in the air on a table in the library"
|
||||
negative_prompt = "Distorted, discontinuous, Ugly, blurry, low resolution, motionless, static, disfigured, disconnected limbs, Ugly faces, incomplete arms"
|
||||
generator = torch.manual_seed(8888)
|
||||
|
||||
frames = pipeline(
|
||||
prompt=prompt,
|
||||
image=image,
|
||||
num_inference_steps=50,
|
||||
negative_prompt=negative_prompt,
|
||||
guidance_scale=9.0,
|
||||
generator=generator
|
||||
).frames[0]
|
||||
export_to_gif(frames, "i2v.gif")
|
||||
```
|
||||
|
||||
<div class="flex gap-4">
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/i2vgen_xl_images/img_0009.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">initial image</figcaption>
|
||||
</div>
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/i2vgen-xl-example.gif"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">generated video</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
### AnimateDiff
|
||||
|
||||
[AnimateDiff](../api/pipelines/animatediff) is an adapter model that inserts a motion module into a pretrained diffusion model to animate an image. The adapter is trained on video clips to learn motion which is used to condition the generation process to create a video. It is faster and easier to only train the adapter and it can be loaded into most diffusion models, effectively turning them into "video models".
|
||||
|
||||
Start by loading a [`MotionAdapter`].
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import AnimateDiffPipeline, DDIMScheduler, MotionAdapter
|
||||
from diffusers.utils import export_to_gif
|
||||
|
||||
adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2", torch_dtype=torch.float16)
|
||||
```
|
||||
|
||||
Then load a finetuned Stable Diffusion model with the [`AnimateDiffPipeline`].
|
||||
|
||||
```py
|
||||
pipeline = AnimateDiffPipeline.from_pretrained("emilianJR/epiCRealism", motion_adapter=adapter, torch_dtype=torch.float16)
|
||||
scheduler = DDIMScheduler.from_pretrained(
|
||||
"emilianJR/epiCRealism",
|
||||
subfolder="scheduler",
|
||||
clip_sample=False,
|
||||
timestep_spacing="linspace",
|
||||
beta_schedule="linear",
|
||||
steps_offset=1,
|
||||
)
|
||||
pipeline.scheduler = scheduler
|
||||
pipeline.enable_vae_slicing()
|
||||
pipeline.enable_model_cpu_offload()
|
||||
```
|
||||
|
||||
Create a prompt and generate the video.
|
||||
|
||||
```py
|
||||
output = pipeline(
|
||||
prompt="A space rocket with trails of smoke behind it launching into space from the desert, 4k, high resolution",
|
||||
negative_prompt="bad quality, worse quality, low resolution",
|
||||
num_frames=16,
|
||||
guidance_scale=7.5,
|
||||
num_inference_steps=50,
|
||||
generator=torch.Generator("cpu").manual_seed(49),
|
||||
)
|
||||
frames = output.frames[0]
|
||||
export_to_gif(frames, "animation.gif")
|
||||
```
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff.gif"/>
|
||||
</div>
|
||||
|
||||
### ModelscopeT2V
|
||||
|
||||
[ModelscopeT2V](../api/pipelines/text_to_video) adds spatial and temporal convolutions and attention to a UNet, and it is trained on image-text and video-text datasets to enhance what it learns during training. The model takes a prompt, encodes it and creates text embeddings which are denoised by the UNet, and then decoded by a VQGAN into a video.
|
||||
|
||||
<Tip>
|
||||
|
||||
ModelScopeT2V generates watermarked videos due to the datasets it was trained on. To use a watermark-free model, try the [cerspense/zeroscope_v2_76w](https://huggingface.co/cerspense/zeroscope_v2_576w) model with the [`TextToVideoSDPipeline`] first, and then upscale it's output with the [cerspense/zeroscope_v2_XL](https://huggingface.co/cerspense/zeroscope_v2_XL) checkpoint using the [`VideoToVideoSDPipeline`].
|
||||
|
||||
</Tip>
|
||||
|
||||
Load a ModelScopeT2V checkpoint into the [`DiffusionPipeline`] along with a prompt to generate a video.
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import DiffusionPipeline
|
||||
from diffusers.utils import export_to_video
|
||||
|
||||
pipeline = DiffusionPipeline.from_pretrained("damo-vilab/text-to-video-ms-1.7b", torch_dtype=torch.float16, variant="fp16")
|
||||
pipeline.enable_model_cpu_offload()
|
||||
pipeline.enable_vae_slicing()
|
||||
|
||||
prompt = "Confident teddy bear surfer rides the wave in the tropics"
|
||||
video_frames = pipeline(prompt).frames[0]
|
||||
export_to_video(video_frames, "modelscopet2v.mp4", fps=10)
|
||||
```
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/modelscopet2v.gif" />
|
||||
</div>
|
||||
|
||||
## Configure model parameters
|
||||
|
||||
There are a few important parameters you can configure in the pipeline that'll affect the video generation process and quality. Let's take a closer look at what these parameters do and how changing them affects the output.
|
||||
|
||||
### Number of frames
|
||||
|
||||
The `num_frames` parameter determines how many video frames are generated per second. A frame is an image that is played in a sequence of other frames to create motion or a video. This affects video length because the pipeline generates a certain number of frames per second (check a pipeline's API reference for the default value). To increase the video duration, you'll need to increase the `num_frames` parameter.
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import StableVideoDiffusionPipeline
|
||||
from diffusers.utils import load_image, export_to_video
|
||||
|
||||
pipeline = StableVideoDiffusionPipeline.from_pretrained(
|
||||
"stabilityai/stable-video-diffusion-img2vid", torch_dtype=torch.float16, variant="fp16"
|
||||
)
|
||||
pipeline.enable_model_cpu_offload()
|
||||
|
||||
image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/svd/rocket.png")
|
||||
image = image.resize((1024, 576))
|
||||
|
||||
generator = torch.manual_seed(42)
|
||||
frames = pipeline(image, decode_chunk_size=8, generator=generator, num_frames=25).frames[0]
|
||||
export_to_video(frames, "generated.mp4", fps=7)
|
||||
```
|
||||
|
||||
<div class="flex gap-4">
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/num_frames_14.gif"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">num_frames=14</figcaption>
|
||||
</div>
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/num_frames_25.gif"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">num_frames=25</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
### Guidance scale
|
||||
|
||||
The `guidance_scale` parameter controls how closely aligned the generated video and text prompt or initial image is. A higher `guidance_scale` value means your generated video is more aligned with the text prompt or initial image, while a lower `guidance_scale` value means your generated video is less aligned which could give the model more "creativity" to interpret the conditioning input.
|
||||
|
||||
<Tip>
|
||||
|
||||
SVD uses the `min_guidance_scale` and `max_guidance_scale` parameters for applying guidance to the first and last frames respectively.
|
||||
|
||||
</Tip>
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import I2VGenXLPipeline
|
||||
from diffusers.utils import export_to_gif, load_image
|
||||
|
||||
pipeline = I2VGenXLPipeline.from_pretrained("ali-vilab/i2vgen-xl", torch_dtype=torch.float16, variant="fp16")
|
||||
pipeline.enable_model_cpu_offload()
|
||||
|
||||
image_url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/i2vgen_xl_images/img_0009.png"
|
||||
image = load_image(image_url).convert("RGB")
|
||||
|
||||
prompt = "Papers were floating in the air on a table in the library"
|
||||
negative_prompt = "Distorted, discontinuous, Ugly, blurry, low resolution, motionless, static, disfigured, disconnected limbs, Ugly faces, incomplete arms"
|
||||
generator = torch.manual_seed(0)
|
||||
|
||||
frames = pipeline(
|
||||
prompt=prompt,
|
||||
image=image,
|
||||
num_inference_steps=50,
|
||||
negative_prompt=negative_prompt,
|
||||
guidance_scale=1.0,
|
||||
generator=generator
|
||||
).frames[0]
|
||||
export_to_gif(frames, "i2v.gif")
|
||||
```
|
||||
|
||||
<div class="flex gap-4">
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/i2vgen-xl-example.gif"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">guidance_scale=9.0</figcaption>
|
||||
</div>
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/guidance_scale_1.0.gif"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">guidance_scale=1.0</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
### Negative prompt
|
||||
|
||||
A negative prompt deters the model from generating things you don’t want it to. This parameter is commonly used to improve overall generation quality by removing poor or bad features such as “low resolution” or “bad details”.
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import AnimateDiffPipeline, DDIMScheduler, MotionAdapter
|
||||
from diffusers.utils import export_to_gif
|
||||
|
||||
adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2", torch_dtype=torch.float16)
|
||||
|
||||
pipeline = AnimateDiffPipeline.from_pretrained("emilianJR/epiCRealism", motion_adapter=adapter, torch_dtype=torch.float16)
|
||||
scheduler = DDIMScheduler.from_pretrained(
|
||||
"emilianJR/epiCRealism",
|
||||
subfolder="scheduler",
|
||||
clip_sample=False,
|
||||
timestep_spacing="linspace",
|
||||
beta_schedule="linear",
|
||||
steps_offset=1,
|
||||
)
|
||||
pipeline.scheduler = scheduler
|
||||
pipeline.enable_vae_slicing()
|
||||
pipeline.enable_model_cpu_offload()
|
||||
|
||||
output = pipeline(
|
||||
prompt="360 camera shot of a sushi roll in a restaurant",
|
||||
negative_prompt="Distorted, discontinuous, ugly, blurry, low resolution, motionless, static",
|
||||
num_frames=16,
|
||||
guidance_scale=7.5,
|
||||
num_inference_steps=50,
|
||||
generator=torch.Generator("cpu").manual_seed(0),
|
||||
)
|
||||
frames = output.frames[0]
|
||||
export_to_gif(frames, "animation.gif")
|
||||
```
|
||||
|
||||
<div class="flex gap-4">
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff_no_neg.gif"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">no negative prompt</figcaption>
|
||||
</div>
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff_neg.gif"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">negative prompt applied</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
### Model-specific parameters
|
||||
|
||||
There are some pipeline parameters that are unique to each model such as adjusting the motion in a video or adding noise to the initial image.
|
||||
|
||||
<hfoptions id="special-parameters">
|
||||
<hfoption id="Stable Video Diffusion">
|
||||
|
||||
Stable Video Diffusion provides additional micro-conditioning for the frame rate with the `fps` parameter and for motion with the `motion_bucket_id` parameter. Together, these parameters allow for adjusting the amount of motion in the generated video.
|
||||
|
||||
There is also a `noise_aug_strength` parameter that increases the amount of noise added to the initial image. Varying this parameter affects how similar the generated video and initial image are. A higher `noise_aug_strength` also increases the amount of motion. To learn more, read the [Micro-conditioning](../using-diffusers/svd#micro-conditioning) guide.
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="Text2Video-Zero">
|
||||
|
||||
Text2Video-Zero computes the amount of motion to apply to each frame from randomly sampled latents. You can use the `motion_field_strength_x` and `motion_field_strength_y` parameters to control the amount of motion to apply to the x and y-axes of the video. The parameters `t0` and `t1` are the timesteps to apply motion to the latents.
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
## Control video generation
|
||||
|
||||
Video generation can be controlled similar to how text-to-image, image-to-image, and inpainting can be controlled with a [`ControlNetModel`]. The only difference is you need to use the [`~pipelines.text_to_video_synthesis.pipeline_text_to_video_zero.CrossFrameAttnProcessor`] so each frame attends to the first frame.
|
||||
|
||||
### Text2Video-Zero
|
||||
|
||||
Text2Video-Zero video generation can be conditioned on pose and edge images for even greater control over a subject's motion in the generated video or to preserve the identity of a subject/object in the video. You can also use Text2Video-Zero with [InstructPix2Pix](../api/pipelines/pix2pix) for editing videos with text.
|
||||
|
||||
<hfoptions id="t2v-zero">
|
||||
<hfoption id="pose control">
|
||||
|
||||
Start by downloading a video and extracting the pose images from it.
|
||||
|
||||
```py
|
||||
from huggingface_hub import hf_hub_download
|
||||
from PIL import Image
|
||||
import imageio
|
||||
|
||||
filename = "__assets__/poses_skeleton_gifs/dance1_corr.mp4"
|
||||
repo_id = "PAIR/Text2Video-Zero"
|
||||
video_path = hf_hub_download(repo_type="space", repo_id=repo_id, filename=filename)
|
||||
|
||||
reader = imageio.get_reader(video_path, "ffmpeg")
|
||||
frame_count = 8
|
||||
pose_images = [Image.fromarray(reader.get_data(i)) for i in range(frame_count)]
|
||||
```
|
||||
|
||||
Load a [`ControlNetModel`] for pose estimation and a checkpoint into the [`StableDiffusionControlNetPipeline`]. Then you'll use the [`~pipelines.text_to_video_synthesis.pipeline_text_to_video_zero.CrossFrameAttnProcessor`] for the UNet and ControlNet.
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import StableDiffusionControlNetPipeline, ControlNetModel
|
||||
from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_zero import CrossFrameAttnProcessor
|
||||
|
||||
model_id = "runwayml/stable-diffusion-v1-5"
|
||||
controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-openpose", torch_dtype=torch.float16)
|
||||
pipeline = StableDiffusionControlNetPipeline.from_pretrained(
|
||||
model_id, controlnet=controlnet, torch_dtype=torch.float16
|
||||
).to("cuda")
|
||||
|
||||
pipeline.unet.set_attn_processor(CrossFrameAttnProcessor(batch_size=2))
|
||||
pipeline.controlnet.set_attn_processor(CrossFrameAttnProcessor(batch_size=2))
|
||||
```
|
||||
|
||||
Fix the latents for all the frames, and then pass your prompt and extracted pose images to the model to generate a video.
|
||||
|
||||
```py
|
||||
latents = torch.randn((1, 4, 64, 64), device="cuda", dtype=torch.float16).repeat(len(pose_images), 1, 1, 1)
|
||||
|
||||
prompt = "Darth Vader dancing in a desert"
|
||||
result = pipeline(prompt=[prompt] * len(pose_images), image=pose_images, latents=latents).images
|
||||
imageio.mimsave("video.mp4", result, fps=4)
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="edge control">
|
||||
|
||||
Download a video and extract the edges from it.
|
||||
|
||||
```py
|
||||
from huggingface_hub import hf_hub_download
|
||||
from PIL import Image
|
||||
import imageio
|
||||
|
||||
filename = "__assets__/poses_skeleton_gifs/dance1_corr.mp4"
|
||||
repo_id = "PAIR/Text2Video-Zero"
|
||||
video_path = hf_hub_download(repo_type="space", repo_id=repo_id, filename=filename)
|
||||
|
||||
reader = imageio.get_reader(video_path, "ffmpeg")
|
||||
frame_count = 8
|
||||
pose_images = [Image.fromarray(reader.get_data(i)) for i in range(frame_count)]
|
||||
```
|
||||
|
||||
Load a [`ControlNetModel`] for canny edge and a checkpoint into the [`StableDiffusionControlNetPipeline`]. Then you'll use the [`~pipelines.text_to_video_synthesis.pipeline_text_to_video_zero.CrossFrameAttnProcessor`] for the UNet and ControlNet.
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import StableDiffusionControlNetPipeline, ControlNetModel
|
||||
from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_zero import CrossFrameAttnProcessor
|
||||
|
||||
model_id = "runwayml/stable-diffusion-v1-5"
|
||||
controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16)
|
||||
pipeline = StableDiffusionControlNetPipeline.from_pretrained(
|
||||
model_id, controlnet=controlnet, torch_dtype=torch.float16
|
||||
).to("cuda")
|
||||
|
||||
pipeline.unet.set_attn_processor(CrossFrameAttnProcessor(batch_size=2))
|
||||
pipeline.controlnet.set_attn_processor(CrossFrameAttnProcessor(batch_size=2))
|
||||
```
|
||||
|
||||
Fix the latents for all the frames, and then pass your prompt and extracted edge images to the model to generate a video.
|
||||
|
||||
```py
|
||||
latents = torch.randn((1, 4, 64, 64), device="cuda", dtype=torch.float16).repeat(len(pose_images), 1, 1, 1)
|
||||
|
||||
prompt = "Darth Vader dancing in a desert"
|
||||
result = pipeline(prompt=[prompt] * len(pose_images), image=pose_images, latents=latents).images
|
||||
imageio.mimsave("video.mp4", result, fps=4)
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="InstructPix2Pix">
|
||||
|
||||
InstructPix2Pix allows you to use text to describe the changes you want to make to the video. Start by downloading and reading a video.
|
||||
|
||||
```py
|
||||
from huggingface_hub import hf_hub_download
|
||||
from PIL import Image
|
||||
import imageio
|
||||
|
||||
filename = "__assets__/pix2pix video/camel.mp4"
|
||||
repo_id = "PAIR/Text2Video-Zero"
|
||||
video_path = hf_hub_download(repo_type="space", repo_id=repo_id, filename=filename)
|
||||
|
||||
reader = imageio.get_reader(video_path, "ffmpeg")
|
||||
frame_count = 8
|
||||
video = [Image.fromarray(reader.get_data(i)) for i in range(frame_count)]
|
||||
```
|
||||
|
||||
Load the [`StableDiffusionInstructPix2PixPipeline`] and set the [`~pipelines.text_to_video_synthesis.pipeline_text_to_video_zero.CrossFrameAttnProcessor`] for the UNet.
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import StableDiffusionInstructPix2PixPipeline
|
||||
from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_zero import CrossFrameAttnProcessor
|
||||
|
||||
pipeline = StableDiffusionInstructPix2PixPipeline.from_pretrained("timbrooks/instruct-pix2pix", torch_dtype=torch.float16).to("cuda")
|
||||
pipeline.unet.set_attn_processor(CrossFrameAttnProcessor(batch_size=3))
|
||||
```
|
||||
|
||||
Pass a prompt describing the change you want to apply to the video.
|
||||
|
||||
```py
|
||||
prompt = "make it Van Gogh Starry Night style"
|
||||
result = pipeline(prompt=[prompt] * len(video), image=video).images
|
||||
imageio.mimsave("edited_video.mp4", result, fps=4)
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
## Optimize
|
||||
|
||||
Video generation requires a lot of memory because you're generating many video frames at once. You can reduce your memory requirements at the expense of some inference speed. Try:
|
||||
|
||||
1. offloading pipeline components that are no longer needed to the CPU
|
||||
2. feed-forward chunking runs the feed-forward layer in a loop instead of all at once
|
||||
3. break up the number of frames the VAE has to decode into chunks instead of decoding them all at once
|
||||
|
||||
```diff
|
||||
- pipeline.enable_model_cpu_offload()
|
||||
- frames = pipeline(image, decode_chunk_size=8, generator=generator).frames[0]
|
||||
+ pipeline.enable_model_cpu_offload()
|
||||
+ pipeline.unet.enable_forward_chunking()
|
||||
+ frames = pipeline(image, decode_chunk_size=2, generator=generator, num_frames=25).frames[0]
|
||||
```
|
||||
|
||||
If memory is not an issue and you want to optimize for speed, try wrapping the UNet with [`torch.compile`](../optimization/torch2.0#torchcompile).
|
||||
|
||||
```diff
|
||||
- pipeline.enable_model_cpu_offload()
|
||||
+ pipeline.to("cuda")
|
||||
+ pipeline.unet = torch.compile(pipeline.unet, mode="reduce-overhead", fullgraph=True)
|
||||
```
|
||||
@@ -273,6 +273,7 @@ Lastly, convert the image to a `PIL.Image` to see your generated image!
|
||||
```py
|
||||
>>> image = (image / 2 + 0.5).clamp(0, 1).squeeze()
|
||||
>>> image = (image.permute(1, 2, 0) * 255).to(torch.uint8).cpu().numpy()
|
||||
>>> images = (image * 255).round().astype("uint8")
|
||||
>>> image = Image.fromarray(image)
|
||||
>>> image
|
||||
```
|
||||
|
||||
@@ -313,12 +313,12 @@ from diffusers import StableDiffusionXLPipeline, StableDiffusionXLImg2ImgPipelin
|
||||
import torch
|
||||
|
||||
pipe = StableDiffusionXLPipeline.from_single_file(
|
||||
"./sd_xl_base_1.0.safetensors", torch_dtype=torch.float16
|
||||
"./sd_xl_base_1.0.safetensors", torch_dtype=torch.float16, variant="fp16", use_safetensors=True
|
||||
)
|
||||
pipe.to("cuda")
|
||||
|
||||
refiner = StableDiffusionXLImg2ImgPipeline.from_single_file(
|
||||
"./sd_xl_refiner_1.0.safetensors", torch_dtype=torch.float16
|
||||
"./sd_xl_refiner_1.0.safetensors", torch_dtype=torch.float16, use_safetensors=True, variant="fp16"
|
||||
)
|
||||
refiner.to("cuda")
|
||||
```
|
||||
|
||||
@@ -12,7 +12,7 @@ specific language governing permissions and limitations under the License.
|
||||
|
||||
# 메모리와 속도
|
||||
|
||||
메모리 또는 속도에 대해 🤗 Diffusers *추론*을 최적화하기 위한 몇 가지 기술과 아이디어를 제시합니다.
|
||||
메모리 또는 속도에 대해 🤗 Diffusers *추론*을 최적화하기 위한 몇 가지 기술과 아이디어를 제시합니다.
|
||||
일반적으로, memory-efficient attention을 위해 [xFormers](https://github.com/facebookresearch/xformers) 사용을 추천하기 때문에, 추천하는 [설치 방법](xformers)을 보고 설치해 보세요.
|
||||
|
||||
다음 설정이 성능과 메모리에 미치는 영향에 대해 설명합니다.
|
||||
@@ -27,7 +27,7 @@ specific language governing permissions and limitations under the License.
|
||||
| memory-efficient attention | 2.63s | x3.61 |
|
||||
|
||||
<em>
|
||||
NVIDIA TITAN RTX에서 50 DDIM 스텝의 "a photo of an astronaut riding a horse on mars" 프롬프트로 512x512 크기의 단일 이미지를 생성하였습니다.
|
||||
NVIDIA TITAN RTX에서 50 DDIM 스텝의 "a photo of an astronaut riding a horse on mars" 프롬프트로 512x512 크기의 단일 이미지를 생성하였습니다.
|
||||
</em>
|
||||
|
||||
## cuDNN auto-tuner 활성화하기
|
||||
@@ -44,11 +44,11 @@ torch.backends.cudnn.benchmark = True
|
||||
|
||||
### fp32 대신 tf32 사용하기 (Ampere 및 이후 CUDA 장치들에서)
|
||||
|
||||
Ampere 및 이후 CUDA 장치에서 행렬곱 및 컨볼루션은 TensorFloat32(TF32) 모드를 사용하여 더 빠르지만 약간 덜 정확할 수 있습니다.
|
||||
기본적으로 PyTorch는 컨볼루션에 대해 TF32 모드를 활성화하지만 행렬 곱셈은 활성화하지 않습니다.
|
||||
네트워크에 완전한 float32 정밀도가 필요한 경우가 아니면 행렬 곱셈에 대해서도 이 설정을 활성화하는 것이 좋습니다.
|
||||
이는 일반적으로 무시할 수 있는 수치의 정확도 손실이 있지만, 계산 속도를 크게 높일 수 있습니다.
|
||||
그것에 대해 [여기](https://huggingface.co/docs/transformers/v4.18.0/en/performance#tf32)서 더 읽을 수 있습니다.
|
||||
Ampere 및 이후 CUDA 장치에서 행렬곱 및 컨볼루션은 TensorFloat32(TF32) 모드를 사용하여 더 빠르지만 약간 덜 정확할 수 있습니다.
|
||||
기본적으로 PyTorch는 컨볼루션에 대해 TF32 모드를 활성화하지만 행렬 곱셈은 활성화하지 않습니다.
|
||||
네트워크에 완전한 float32 정밀도가 필요한 경우가 아니면 행렬 곱셈에 대해서도 이 설정을 활성화하는 것이 좋습니다.
|
||||
이는 일반적으로 무시할 수 있는 수치의 정확도 손실이 있지만, 계산 속도를 크게 높일 수 있습니다.
|
||||
그것에 대해 [여기](https://huggingface.co/docs/transformers/v4.18.0/en/performance#tf32)서 더 읽을 수 있습니다.
|
||||
추론하기 전에 다음을 추가하기만 하면 됩니다:
|
||||
|
||||
```python
|
||||
@@ -59,13 +59,13 @@ torch.backends.cuda.matmul.allow_tf32 = True
|
||||
|
||||
## 반정밀도 가중치
|
||||
|
||||
더 많은 GPU 메모리를 절약하고 더 빠른 속도를 얻기 위해 모델 가중치를 반정밀도(half precision)로 직접 불러오고 실행할 수 있습니다.
|
||||
더 많은 GPU 메모리를 절약하고 더 빠른 속도를 얻기 위해 모델 가중치를 반정밀도(half precision)로 직접 불러오고 실행할 수 있습니다.
|
||||
여기에는 `fp16`이라는 브랜치에 저장된 float16 버전의 가중치를 불러오고, 그 때 `float16` 유형을 사용하도록 PyTorch에 지시하는 작업이 포함됩니다.
|
||||
|
||||
```Python
|
||||
pipe = StableDiffusionPipeline.from_pretrained(
|
||||
"runwayml/stable-diffusion-v1-5",
|
||||
|
||||
|
||||
torch_dtype=torch.float16,
|
||||
)
|
||||
pipe = pipe.to("cuda")
|
||||
@@ -75,7 +75,7 @@ image = pipe(prompt).images[0]
|
||||
```
|
||||
|
||||
<Tip warning={true}>
|
||||
어떤 파이프라인에서도 [`torch.autocast`](https://pytorch.org/docs/stable/amp.html#torch.autocast) 를 사용하는 것은 검은색 이미지를 생성할 수 있고, 순수한 float16 정밀도를 사용하는 것보다 항상 느리기 때문에 사용하지 않는 것이 좋습니다.
|
||||
어떤 파이프라인에서도 [`torch.autocast`](https://pytorch.org/docs/stable/amp.html#torch.autocast) 를 사용하는 것은 검은색 이미지를 생성할 수 있고, 순수한 float16 정밀도를 사용하는 것보다 항상 느리기 때문에 사용하지 않는 것이 좋습니다.
|
||||
</Tip>
|
||||
|
||||
## 추가 메모리 절약을 위한 슬라이스 어텐션
|
||||
@@ -95,7 +95,7 @@ from diffusers import StableDiffusionPipeline
|
||||
|
||||
pipe = StableDiffusionPipeline.from_pretrained(
|
||||
"runwayml/stable-diffusion-v1-5",
|
||||
|
||||
|
||||
torch_dtype=torch.float16,
|
||||
)
|
||||
pipe = pipe.to("cuda")
|
||||
@@ -122,7 +122,7 @@ from diffusers import StableDiffusionPipeline
|
||||
|
||||
pipe = StableDiffusionPipeline.from_pretrained(
|
||||
"runwayml/stable-diffusion-v1-5",
|
||||
|
||||
|
||||
torch_dtype=torch.float16,
|
||||
)
|
||||
pipe = pipe.to("cuda")
|
||||
@@ -148,7 +148,7 @@ from diffusers import StableDiffusionPipeline
|
||||
|
||||
pipe = StableDiffusionPipeline.from_pretrained(
|
||||
"runwayml/stable-diffusion-v1-5",
|
||||
|
||||
|
||||
torch_dtype=torch.float16,
|
||||
)
|
||||
|
||||
@@ -165,7 +165,7 @@ image = pipe(prompt).images[0]
|
||||
또 다른 최적화 방법인 <a href="#model_offloading">모델 오프로딩</a>을 사용하는 것을 고려하십시오. 이는 훨씬 빠르지만 메모리 절약이 크지는 않습니다.
|
||||
</Tip>
|
||||
|
||||
또한 ttention slicing과 연결해서 최소 메모리(< 2GB)로도 동작할 수 있습니다.
|
||||
또한 ttention slicing과 연결해서 최소 메모리(< 2GB)로도 동작할 수 있습니다.
|
||||
|
||||
|
||||
```Python
|
||||
@@ -174,7 +174,7 @@ from diffusers import StableDiffusionPipeline
|
||||
|
||||
pipe = StableDiffusionPipeline.from_pretrained(
|
||||
"runwayml/stable-diffusion-v1-5",
|
||||
|
||||
|
||||
torch_dtype=torch.float16,
|
||||
)
|
||||
|
||||
@@ -204,7 +204,7 @@ import torch
|
||||
from diffusers import StableDiffusionPipeline
|
||||
|
||||
pipe = StableDiffusionPipeline.from_pretrained(
|
||||
"runwayml/stable-diffusion-v1-5",
|
||||
"runwayml/stable-diffusion-v1-5",
|
||||
torch_dtype=torch.float16,
|
||||
)
|
||||
|
||||
@@ -355,7 +355,7 @@ unet_traced = torch.jit.load("unet_traced.pt")
|
||||
class TracedUNet(torch.nn.Module):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.in_channels = pipe.unet.config.in_channels
|
||||
self.in_channels = pipe.unet.in_channels
|
||||
self.device = pipe.unet.device
|
||||
|
||||
def forward(self, latent_model_input, t, encoder_hidden_states):
|
||||
@@ -387,7 +387,7 @@ with torch.inference_mode():
|
||||
| A100-SXM4-40GB | 18.6it/s | 29.it/s |
|
||||
| A100-SXM-80GB | 18.7it/s | 29.5it/s |
|
||||
|
||||
이를 활용하려면 다음을 만족해야 합니다:
|
||||
이를 활용하려면 다음을 만족해야 합니다:
|
||||
- PyTorch > 1.12
|
||||
- Cuda 사용 가능
|
||||
- [xformers 라이브러리를 설치함](xformers)
|
||||
|
||||
@@ -14,7 +14,7 @@ specific language governing permissions and limitations under the License.
|
||||
|
||||
[[open-in-colab]]
|
||||
|
||||
🧨 Diffusers는 사용자 친화적이며 유연한 도구 상자로, 사용사례에 맞게 diffusion 시스템을 구축 할 수 있도록 설계되었습니다. 이 도구 상자의 핵심은 모델과 스케줄러입니다. [`DiffusionPipeline`]은 편의를 위해 이러한 구성 요소를 번들로 제공하지만, 파이프라인을 분리하고 모델과 스케줄러를 개별적으로 사용해 새로운 diffusion 시스템을 만들 수도 있습니다.
|
||||
🧨 Diffusers는 사용자 친화적이며 유연한 도구 상자로, 사용사례에 맞게 diffusion 시스템을 구축 할 수 있도록 설계되었습니다. 이 도구 상자의 핵심은 모델과 스케줄러입니다. [`DiffusionPipeline`]은 편의를 위해 이러한 구성 요소를 번들로 제공하지만, 파이프라인을 분리하고 모델과 스케줄러를 개별적으로 사용해 새로운 diffusion 시스템을 만들 수도 있습니다.
|
||||
|
||||
이 튜토리얼에서는 기본 파이프라인부터 시작해 Stable Diffusion 파이프라인까지 진행하며 모델과 스케줄러를 사용해 추론을 위한 diffusion 시스템을 조립하는 방법을 배웁니다.
|
||||
|
||||
@@ -36,7 +36,7 @@ specific language governing permissions and limitations under the License.
|
||||
|
||||
정말 쉽습니다. 그런데 파이프라인은 어떻게 이렇게 할 수 있었을까요? 파이프라인을 세분화하여 내부에서 어떤 일이 일어나고 있는지 살펴보겠습니다.
|
||||
|
||||
위 예시에서 파이프라인에는 [`UNet2DModel`] 모델과 [`DDPMScheduler`]가 포함되어 있습니다. 파이프라인은 원하는 출력 크기의 랜덤 노이즈를 받아 모델을 여러번 통과시켜 이미지의 노이즈를 제거합니다. 각 timestep에서 모델은 *noise residual*을 예측하고 스케줄러는 이를 사용하여 노이즈가 적은 이미지를 예측합니다. 파이프라인은 지정된 추론 스텝수에 도달할 때까지 이 과정을 반복합니다.
|
||||
위 예시에서 파이프라인에는 [`UNet2DModel`] 모델과 [`DDPMScheduler`]가 포함되어 있습니다. 파이프라인은 원하는 출력 크기의 랜덤 노이즈를 받아 모델을 여러번 통과시켜 이미지의 노이즈를 제거합니다. 각 timestep에서 모델은 *noise residual*을 예측하고 스케줄러는 이를 사용하여 노이즈가 적은 이미지를 예측합니다. 파이프라인은 지정된 추론 스텝수에 도달할 때까지 이 과정을 반복합니다.
|
||||
|
||||
모델과 스케줄러를 별도로 사용하여 파이프라인을 다시 생성하기 위해 자체적인 노이즈 제거 프로세스를 작성해 보겠습니다.
|
||||
|
||||
@@ -210,7 +210,7 @@ Stable Diffusion 은 text-to-image *latent diffusion* 모델입니다. latent di
|
||||
|
||||
```py
|
||||
>>> latents = torch.randn(
|
||||
... (batch_size, unet.config.in_channels, height // 8, width // 8),
|
||||
... (batch_size, unet.in_channels, height // 8, width // 8),
|
||||
... generator=generator,
|
||||
... device=torch_device,
|
||||
... )
|
||||
|
||||
@@ -80,7 +80,8 @@ To do so, just specify `--train_text_encoder_ti` while launching training (for r
|
||||
Please keep the following points in mind:
|
||||
|
||||
* SDXL has two text encoders. So, we fine-tune both using LoRA.
|
||||
* When not fine-tuning the text encoders, we ALWAYS precompute the text embeddings to save memory.
|
||||
* When not fine-tuning the text encoders, we ALWAYS precompute the text embeddings to save memoםהקרry.
|
||||
|
||||
|
||||
### 3D icon example
|
||||
|
||||
@@ -233,76 +234,6 @@ In ComfyUI we will load a LoRA and a textual embedding at the same time.
|
||||
|
||||
SDXL's VAE is known to suffer from numerical instability issues. This is why we also expose a CLI argument namely `--pretrained_vae_model_name_or_path` that lets you specify the location of a better VAE (such as [this one](https://huggingface.co/madebyollin/sdxl-vae-fp16-fix)).
|
||||
|
||||
### DoRA training
|
||||
The advanced script now supports DoRA training too!
|
||||
> Proposed in [DoRA: Weight-Decomposed Low-Rank Adaptation](https://arxiv.org/abs/2402.09353),
|
||||
**DoRA** is very similar to LoRA, except it decomposes the pre-trained weight into two components, **magnitude** and **direction** and employs LoRA for _directional_ updates to efficiently minimize the number of trainable parameters.
|
||||
The authors found that by using DoRA, both the learning capacity and training stability of LoRA are enhanced without any additional overhead during inference.
|
||||
|
||||
> [!NOTE]
|
||||
> 💡DoRA training is still _experimental_
|
||||
> and is likely to require different hyperparameter values to perform best compared to a LoRA.
|
||||
> Specifically, we've noticed 2 differences to take into account your training:
|
||||
> 1. **LoRA seem to converge faster than DoRA** (so a set of parameters that may lead to overfitting when training a LoRA may be working well for a DoRA)
|
||||
> 2. **DoRA quality superior to LoRA especially in lower ranks** the difference in quality of DoRA of rank 8 and LoRA of rank 8 appears to be more significant than when training ranks of 32 or 64 for example.
|
||||
> This is also aligned with some of the quantitative analysis shown in the paper.
|
||||
|
||||
**Usage**
|
||||
1. To use DoRA you need to install `peft` from main:
|
||||
```bash
|
||||
pip install git+https://github.com/huggingface/peft.git
|
||||
```
|
||||
2. Enable DoRA training by adding this flag
|
||||
```bash
|
||||
--use_dora
|
||||
```
|
||||
**Inference**
|
||||
The inference is the same as if you train a regular LoRA 🤗
|
||||
|
||||
## Conducting EDM-style training
|
||||
|
||||
It's now possible to perform EDM-style training as proposed in [Elucidating the Design Space of Diffusion-Based Generative Models](https://arxiv.org/abs/2206.00364).
|
||||
|
||||
simply set:
|
||||
|
||||
```diff
|
||||
+ --do_edm_style_training \
|
||||
```
|
||||
|
||||
Other SDXL-like models that use the EDM formulation, such as [playgroundai/playground-v2.5-1024px-aesthetic](https://huggingface.co/playgroundai/playground-v2.5-1024px-aesthetic), can also be DreamBooth'd with the script. Below is an example command:
|
||||
|
||||
```bash
|
||||
accelerate launch train_dreambooth_lora_sdxl_advanced.py \
|
||||
--pretrained_model_name_or_path="playgroundai/playground-v2.5-1024px-aesthetic" \
|
||||
--dataset_name="linoyts/3d_icon" \
|
||||
--instance_prompt="3d icon in the style of TOK" \
|
||||
--validation_prompt="a TOK icon of an astronaut riding a horse, in the style of TOK" \
|
||||
--output_dir="3d-icon-SDXL-LoRA" \
|
||||
--do_edm_style_training \
|
||||
--caption_column="prompt" \
|
||||
--mixed_precision="bf16" \
|
||||
--resolution=1024 \
|
||||
--train_batch_size=3 \
|
||||
--repeats=1 \
|
||||
--report_to="wandb"\
|
||||
--gradient_accumulation_steps=1 \
|
||||
--gradient_checkpointing \
|
||||
--learning_rate=1.0 \
|
||||
--text_encoder_lr=1.0 \
|
||||
--optimizer="prodigy"\
|
||||
--train_text_encoder_ti\
|
||||
--train_text_encoder_ti_frac=0.5\
|
||||
--lr_scheduler="constant" \
|
||||
--lr_warmup_steps=0 \
|
||||
--rank=8 \
|
||||
--max_train_steps=1000 \
|
||||
--checkpointing_steps=2000 \
|
||||
--seed="0" \
|
||||
--push_to_hub
|
||||
```
|
||||
|
||||
> [!CAUTION]
|
||||
> Min-SNR gamma is not supported with the EDM-style training yet. When training with the PlaygroundAI model, it's recommended to not pass any "variant".
|
||||
|
||||
### Tips and Tricks
|
||||
Check out [these recommended practices](https://huggingface.co/blog/sdxl_lora_advanced_script#additional-good-practices)
|
||||
|
||||
@@ -70,14 +70,13 @@ from diffusers.utils.import_utils import is_xformers_available
|
||||
|
||||
|
||||
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
|
||||
check_min_version("0.28.0.dev0")
|
||||
check_min_version("0.27.0.dev0")
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
def save_model_card(
|
||||
repo_id: str,
|
||||
use_dora: bool,
|
||||
images=None,
|
||||
base_model=str,
|
||||
train_text_encoder=False,
|
||||
@@ -89,7 +88,6 @@ def save_model_card(
|
||||
vae_path=None,
|
||||
):
|
||||
img_str = "widget:\n"
|
||||
lora = "lora" if not use_dora else "dora"
|
||||
for i, image in enumerate(images):
|
||||
image.save(os.path.join(repo_folder, f"image_{i}.png"))
|
||||
img_str += f"""
|
||||
@@ -141,10 +139,9 @@ to trigger concept `{key}` → use `{tokens}` in your prompt \n
|
||||
tags:
|
||||
- stable-diffusion
|
||||
- stable-diffusion-diffusers
|
||||
- diffusers-training
|
||||
- text-to-image
|
||||
- diffusers
|
||||
- {lora}
|
||||
- lora
|
||||
- template:sd-lora
|
||||
{img_str}
|
||||
base_model: {base_model}
|
||||
@@ -654,16 +651,6 @@ def parse_args(input_args=None):
|
||||
default=4,
|
||||
help=("The dimension of the LoRA update matrices."),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--use_dora",
|
||||
type=bool,
|
||||
action="store_true",
|
||||
default=False,
|
||||
help=(
|
||||
"Wether to train a DoRA as proposed in- DoRA: Weight-Decomposed Low-Rank Adaptation https://arxiv.org/abs/2402.09353. "
|
||||
"Note: to use DoRA you need to install peft from main, `pip install git+https://github.com/huggingface/peft.git`"
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--cache_latents",
|
||||
action="store_true",
|
||||
@@ -1215,7 +1202,7 @@ def main(args):
|
||||
|
||||
xformers_version = version.parse(xformers.__version__)
|
||||
if xformers_version == version.parse("0.0.16"):
|
||||
logger.warning(
|
||||
logger.warn(
|
||||
"xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, "
|
||||
"please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
|
||||
)
|
||||
@@ -1232,7 +1219,6 @@ def main(args):
|
||||
unet_lora_config = LoraConfig(
|
||||
r=args.rank,
|
||||
lora_alpha=args.rank,
|
||||
use_dora=args.use_dora,
|
||||
init_lora_weights="gaussian",
|
||||
target_modules=["to_k", "to_q", "to_v", "to_out.0"],
|
||||
)
|
||||
@@ -1244,7 +1230,6 @@ def main(args):
|
||||
text_lora_config = LoraConfig(
|
||||
r=args.rank,
|
||||
lora_alpha=args.rank,
|
||||
use_dora=args.use_dora,
|
||||
init_lora_weights="gaussian",
|
||||
target_modules=["q_proj", "k_proj", "v_proj", "out_proj"],
|
||||
)
|
||||
@@ -1366,14 +1351,14 @@ def main(args):
|
||||
|
||||
# Optimizer creation
|
||||
if not (args.optimizer.lower() == "prodigy" or args.optimizer.lower() == "adamw"):
|
||||
logger.warning(
|
||||
logger.warn(
|
||||
f"Unsupported choice of optimizer: {args.optimizer}.Supported optimizers include [adamW, prodigy]."
|
||||
"Defaulting to adamW"
|
||||
)
|
||||
args.optimizer = "adamw"
|
||||
|
||||
if args.use_8bit_adam and not args.optimizer.lower() == "adamw":
|
||||
logger.warning(
|
||||
logger.warn(
|
||||
f"use_8bit_adam is ignored when optimizer is not set to 'AdamW'. Optimizer was "
|
||||
f"set to {args.optimizer.lower()}"
|
||||
)
|
||||
@@ -1407,11 +1392,11 @@ def main(args):
|
||||
optimizer_class = prodigyopt.Prodigy
|
||||
|
||||
if args.learning_rate <= 0.1:
|
||||
logger.warning(
|
||||
logger.warn(
|
||||
"Learning rate is too low. When using prodigy, it's generally better to set learning rate around 1.0"
|
||||
)
|
||||
if args.train_text_encoder and args.text_encoder_lr:
|
||||
logger.warning(
|
||||
logger.warn(
|
||||
f"Learning rates were provided both for the unet and the text encoder- e.g. text_encoder_lr:"
|
||||
f" {args.text_encoder_lr} and learning_rate: {args.learning_rate}. "
|
||||
f"When using prodigy only learning_rate is used as the initial learning rate."
|
||||
@@ -1970,7 +1955,6 @@ def main(args):
|
||||
|
||||
save_model_card(
|
||||
model_id if not args.push_to_hub else repo_id,
|
||||
use_dora=args.use_dora,
|
||||
images=images,
|
||||
base_model=args.pretrained_model_name_or_path,
|
||||
train_text_encoder=args.train_text_encoder,
|
||||
|
||||
@@ -14,11 +14,9 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
|
||||
import argparse
|
||||
import contextlib
|
||||
import gc
|
||||
import hashlib
|
||||
import itertools
|
||||
import json
|
||||
import logging
|
||||
import math
|
||||
import os
|
||||
@@ -39,7 +37,7 @@ import transformers
|
||||
from accelerate import Accelerator
|
||||
from accelerate.logging import get_logger
|
||||
from accelerate.utils import DistributedDataParallelKwargs, ProjectConfiguration, set_seed
|
||||
from huggingface_hub import create_repo, hf_hub_download, upload_folder
|
||||
from huggingface_hub import create_repo, upload_folder
|
||||
from packaging import version
|
||||
from peft import LoraConfig, set_peft_model_state_dict
|
||||
from peft.utils import get_peft_model_state_dict
|
||||
@@ -57,8 +55,6 @@ from diffusers import (
|
||||
AutoencoderKL,
|
||||
DDPMScheduler,
|
||||
DPMSolverMultistepScheduler,
|
||||
EDMEulerScheduler,
|
||||
EulerDiscreteScheduler,
|
||||
StableDiffusionXLPipeline,
|
||||
UNet2DConditionModel,
|
||||
)
|
||||
@@ -78,28 +74,13 @@ from diffusers.utils.torch_utils import is_compiled_module
|
||||
|
||||
|
||||
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
|
||||
check_min_version("0.28.0.dev0")
|
||||
check_min_version("0.27.0.dev0")
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
def determine_scheduler_type(pretrained_model_name_or_path, revision):
|
||||
model_index_filename = "model_index.json"
|
||||
if os.path.isdir(pretrained_model_name_or_path):
|
||||
model_index = os.path.join(pretrained_model_name_or_path, model_index_filename)
|
||||
else:
|
||||
model_index = hf_hub_download(
|
||||
repo_id=pretrained_model_name_or_path, filename=model_index_filename, revision=revision
|
||||
)
|
||||
|
||||
with open(model_index, "r") as f:
|
||||
scheduler_type = json.load(f)["scheduler"][1]
|
||||
return scheduler_type
|
||||
|
||||
|
||||
def save_model_card(
|
||||
repo_id: str,
|
||||
use_dora: bool,
|
||||
images=None,
|
||||
base_model=str,
|
||||
train_text_encoder=False,
|
||||
@@ -111,7 +92,6 @@ def save_model_card(
|
||||
vae_path=None,
|
||||
):
|
||||
img_str = "widget:\n"
|
||||
lora = "lora" if not use_dora else "dora"
|
||||
for i, image in enumerate(images):
|
||||
image.save(os.path.join(repo_folder, f"image_{i}.png"))
|
||||
img_str += f"""
|
||||
@@ -164,10 +144,9 @@ to trigger concept `{key}` → use `{tokens}` in your prompt \n
|
||||
tags:
|
||||
- stable-diffusion-xl
|
||||
- stable-diffusion-xl-diffusers
|
||||
- diffusers-training
|
||||
- text-to-image
|
||||
- diffusers
|
||||
- {lora}
|
||||
- lora
|
||||
- template:sd-lora
|
||||
{img_str}
|
||||
base_model: {base_model}
|
||||
@@ -388,11 +367,6 @@ def parse_args(input_args=None):
|
||||
" `args.validation_prompt` multiple times: `args.num_validation_images`."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--do_edm_style_training",
|
||||
action="store_true",
|
||||
help="Flag to conduct training using the EDM formulation as introduced in https://arxiv.org/abs/2206.00364.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--with_prior_preservation",
|
||||
default=False,
|
||||
@@ -687,15 +661,6 @@ def parse_args(input_args=None):
|
||||
default=4,
|
||||
help=("The dimension of the LoRA update matrices."),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--use_dora",
|
||||
action="store_true",
|
||||
default=False,
|
||||
help=(
|
||||
"Wether to train a DoRA as proposed in- DoRA: Weight-Decomposed Low-Rank Adaptation https://arxiv.org/abs/2402.09353. "
|
||||
"Note: to use DoRA you need to install peft from main, `pip install git+https://github.com/huggingface/peft.git`"
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--cache_latents",
|
||||
action="store_true",
|
||||
@@ -974,32 +939,6 @@ class DreamBoothDataset(Dataset):
|
||||
self.class_data_root = Path(class_data_root)
|
||||
self.class_data_root.mkdir(parents=True, exist_ok=True)
|
||||
self.class_images_path = list(self.class_data_root.iterdir())
|
||||
|
||||
self.original_sizes_class_imgs = []
|
||||
self.crop_top_lefts_class_imgs = []
|
||||
self.pixel_values_class_imgs = []
|
||||
self.class_images = [Image.open(path) for path in self.class_images_path]
|
||||
for image in self.class_images:
|
||||
image = exif_transpose(image)
|
||||
if not image.mode == "RGB":
|
||||
image = image.convert("RGB")
|
||||
self.original_sizes_class_imgs.append((image.height, image.width))
|
||||
image = train_resize(image)
|
||||
if args.random_flip and random.random() < 0.5:
|
||||
# flip
|
||||
image = train_flip(image)
|
||||
if args.center_crop:
|
||||
y1 = max(0, int(round((image.height - args.resolution) / 2.0)))
|
||||
x1 = max(0, int(round((image.width - args.resolution) / 2.0)))
|
||||
image = train_crop(image)
|
||||
else:
|
||||
y1, x1, h, w = train_crop.get_params(image, (args.resolution, args.resolution))
|
||||
image = crop(image, y1, x1, h, w)
|
||||
crop_top_left = (y1, x1)
|
||||
self.crop_top_lefts_class_imgs.append(crop_top_left)
|
||||
image = train_transforms(image)
|
||||
self.pixel_values_class_imgs.append(image)
|
||||
|
||||
if class_num is not None:
|
||||
self.num_class_images = min(len(self.class_images_path), class_num)
|
||||
else:
|
||||
@@ -1022,9 +961,12 @@ class DreamBoothDataset(Dataset):
|
||||
|
||||
def __getitem__(self, index):
|
||||
example = {}
|
||||
example["instance_images"] = self.pixel_values[index % self.num_instance_images]
|
||||
example["original_size"] = self.original_sizes[index % self.num_instance_images]
|
||||
example["crop_top_left"] = self.crop_top_lefts[index % self.num_instance_images]
|
||||
instance_image = self.pixel_values[index % self.num_instance_images]
|
||||
original_size = self.original_sizes[index % self.num_instance_images]
|
||||
crop_top_left = self.crop_top_lefts[index % self.num_instance_images]
|
||||
example["instance_images"] = instance_image
|
||||
example["original_size"] = original_size
|
||||
example["crop_top_left"] = crop_top_left
|
||||
|
||||
if self.custom_instance_prompts:
|
||||
caption = self.custom_instance_prompts[index % self.num_instance_images]
|
||||
@@ -1041,10 +983,13 @@ class DreamBoothDataset(Dataset):
|
||||
example["instance_prompt"] = self.instance_prompt
|
||||
|
||||
if self.class_data_root:
|
||||
class_image = Image.open(self.class_images_path[index % self.num_class_images])
|
||||
class_image = exif_transpose(class_image)
|
||||
|
||||
if not class_image.mode == "RGB":
|
||||
class_image = class_image.convert("RGB")
|
||||
example["class_images"] = self.image_transforms(class_image)
|
||||
example["class_prompt"] = self.class_prompt
|
||||
example["class_images"] = self.pixel_values_class_imgs[index % self.num_class_images]
|
||||
example["class_original_size"] = self.original_sizes_class_imgs[index % self.num_class_images]
|
||||
example["class_crop_top_left"] = self.crop_top_lefts_class_imgs[index % self.num_class_images]
|
||||
|
||||
return example
|
||||
|
||||
@@ -1060,8 +1005,6 @@ def collate_fn(examples, with_prior_preservation=False):
|
||||
if with_prior_preservation:
|
||||
pixel_values += [example["class_images"] for example in examples]
|
||||
prompts += [example["class_prompt"] for example in examples]
|
||||
original_sizes += [example["class_original_size"] for example in examples]
|
||||
crop_top_lefts += [example["class_crop_top_left"] for example in examples]
|
||||
|
||||
pixel_values = torch.stack(pixel_values)
|
||||
pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float()
|
||||
@@ -1140,8 +1083,6 @@ def main(args):
|
||||
"You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
|
||||
" Please use `huggingface-cli login` to authenticate with the Hub."
|
||||
)
|
||||
if args.do_edm_style_training and args.snr_gamma is not None:
|
||||
raise ValueError("Min-SNR formulation is not supported when conducting EDM-style training.")
|
||||
|
||||
logging_dir = Path(args.output_dir, args.logging_dir)
|
||||
|
||||
@@ -1259,19 +1200,7 @@ def main(args):
|
||||
)
|
||||
|
||||
# Load scheduler and models
|
||||
scheduler_type = determine_scheduler_type(args.pretrained_model_name_or_path, args.revision)
|
||||
if "EDM" in scheduler_type:
|
||||
args.do_edm_style_training = True
|
||||
noise_scheduler = EDMEulerScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
|
||||
logger.info("Performing EDM-style training!")
|
||||
elif args.do_edm_style_training:
|
||||
noise_scheduler = EulerDiscreteScheduler.from_pretrained(
|
||||
args.pretrained_model_name_or_path, subfolder="scheduler"
|
||||
)
|
||||
logger.info("Performing EDM-style training!")
|
||||
else:
|
||||
noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
|
||||
|
||||
noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
|
||||
text_encoder_one = text_encoder_cls_one.from_pretrained(
|
||||
args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision, variant=args.variant
|
||||
)
|
||||
@@ -1289,12 +1218,7 @@ def main(args):
|
||||
revision=args.revision,
|
||||
variant=args.variant,
|
||||
)
|
||||
latents_mean = latents_std = None
|
||||
if hasattr(vae.config, "latents_mean") and vae.config.latents_mean is not None:
|
||||
latents_mean = torch.tensor(vae.config.latents_mean).view(1, 4, 1, 1)
|
||||
if hasattr(vae.config, "latents_std") and vae.config.latents_std is not None:
|
||||
latents_std = torch.tensor(vae.config.latents_std).view(1, 4, 1, 1)
|
||||
|
||||
vae_scaling_factor = vae.config.scaling_factor
|
||||
unet = UNet2DConditionModel.from_pretrained(
|
||||
args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision, variant=args.variant
|
||||
)
|
||||
@@ -1359,7 +1283,7 @@ def main(args):
|
||||
|
||||
xformers_version = version.parse(xformers.__version__)
|
||||
if xformers_version == version.parse("0.0.16"):
|
||||
logger.warning(
|
||||
logger.warn(
|
||||
"xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, "
|
||||
"please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
|
||||
)
|
||||
@@ -1377,7 +1301,6 @@ def main(args):
|
||||
unet_lora_config = LoraConfig(
|
||||
r=args.rank,
|
||||
lora_alpha=args.rank,
|
||||
use_dora=args.use_dora,
|
||||
init_lora_weights="gaussian",
|
||||
target_modules=["to_k", "to_q", "to_v", "to_out.0"],
|
||||
)
|
||||
@@ -1389,7 +1312,6 @@ def main(args):
|
||||
text_lora_config = LoraConfig(
|
||||
r=args.rank,
|
||||
lora_alpha=args.rank,
|
||||
use_dora=args.use_dora,
|
||||
init_lora_weights="gaussian",
|
||||
target_modules=["q_proj", "k_proj", "v_proj", "out_proj"],
|
||||
)
|
||||
@@ -1564,14 +1486,14 @@ def main(args):
|
||||
|
||||
# Optimizer creation
|
||||
if not (args.optimizer.lower() == "prodigy" or args.optimizer.lower() == "adamw"):
|
||||
logger.warning(
|
||||
logger.warn(
|
||||
f"Unsupported choice of optimizer: {args.optimizer}.Supported optimizers include [adamW, prodigy]."
|
||||
"Defaulting to adamW"
|
||||
)
|
||||
args.optimizer = "adamw"
|
||||
|
||||
if args.use_8bit_adam and not args.optimizer.lower() == "adamw":
|
||||
logger.warning(
|
||||
logger.warn(
|
||||
f"use_8bit_adam is ignored when optimizer is not set to 'AdamW'. Optimizer was "
|
||||
f"set to {args.optimizer.lower()}"
|
||||
)
|
||||
@@ -1605,11 +1527,11 @@ def main(args):
|
||||
optimizer_class = prodigyopt.Prodigy
|
||||
|
||||
if args.learning_rate <= 0.1:
|
||||
logger.warning(
|
||||
logger.warn(
|
||||
"Learning rate is too low. When using prodigy, it's generally better to set learning rate around 1.0"
|
||||
)
|
||||
if args.train_text_encoder and args.text_encoder_lr:
|
||||
logger.warning(
|
||||
logger.warn(
|
||||
f"Learning rates were provided both for the unet and the text encoder- e.g. text_encoder_lr:"
|
||||
f" {args.text_encoder_lr} and learning_rate: {args.learning_rate}. "
|
||||
f"When using prodigy only learning_rate is used as the initial learning rate."
|
||||
@@ -1832,19 +1754,6 @@ def main(args):
|
||||
disable=not accelerator.is_local_main_process,
|
||||
)
|
||||
|
||||
def get_sigmas(timesteps, n_dim=4, dtype=torch.float32):
|
||||
# TODO: revisit other sampling algorithms
|
||||
sigmas = noise_scheduler.sigmas.to(device=accelerator.device, dtype=dtype)
|
||||
schedule_timesteps = noise_scheduler.timesteps.to(accelerator.device)
|
||||
timesteps = timesteps.to(accelerator.device)
|
||||
|
||||
step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps]
|
||||
|
||||
sigma = sigmas[step_indices].flatten()
|
||||
while len(sigma.shape) < n_dim:
|
||||
sigma = sigma.unsqueeze(-1)
|
||||
return sigma
|
||||
|
||||
if args.train_text_encoder:
|
||||
num_train_epochs_text_encoder = int(args.train_text_encoder_frac * args.num_train_epochs)
|
||||
elif args.train_text_encoder_ti: # args.train_text_encoder_ti
|
||||
@@ -1896,15 +1805,9 @@ def main(args):
|
||||
pixel_values = batch["pixel_values"].to(dtype=vae.dtype)
|
||||
model_input = vae.encode(pixel_values).latent_dist.sample()
|
||||
|
||||
if latents_mean is None and latents_std is None:
|
||||
model_input = model_input * vae.config.scaling_factor
|
||||
if args.pretrained_vae_model_name_or_path is None:
|
||||
model_input = model_input.to(weight_dtype)
|
||||
else:
|
||||
latents_mean = latents_mean.to(device=model_input.device, dtype=model_input.dtype)
|
||||
latents_std = latents_std.to(device=model_input.device, dtype=model_input.dtype)
|
||||
model_input = (model_input - latents_mean) * vae.config.scaling_factor / latents_std
|
||||
model_input = model_input.to(dtype=weight_dtype)
|
||||
model_input = model_input * vae_scaling_factor
|
||||
if args.pretrained_vae_model_name_or_path is None:
|
||||
model_input = model_input.to(weight_dtype)
|
||||
|
||||
# Sample noise that we'll add to the latents
|
||||
noise = torch.randn_like(model_input)
|
||||
@@ -1915,32 +1818,15 @@ def main(args):
|
||||
)
|
||||
|
||||
bsz = model_input.shape[0]
|
||||
|
||||
# Sample a random timestep for each image
|
||||
if not args.do_edm_style_training:
|
||||
timesteps = torch.randint(
|
||||
0, noise_scheduler.config.num_train_timesteps, (bsz,), device=model_input.device
|
||||
)
|
||||
timesteps = timesteps.long()
|
||||
else:
|
||||
# in EDM formulation, the model is conditioned on the pre-conditioned noise levels
|
||||
# instead of discrete timesteps, so here we sample indices to get the noise levels
|
||||
# from `scheduler.timesteps`
|
||||
indices = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,))
|
||||
timesteps = noise_scheduler.timesteps[indices].to(device=model_input.device)
|
||||
timesteps = torch.randint(
|
||||
0, noise_scheduler.config.num_train_timesteps, (bsz,), device=model_input.device
|
||||
)
|
||||
timesteps = timesteps.long()
|
||||
|
||||
# Add noise to the model input according to the noise magnitude at each timestep
|
||||
# (this is the forward diffusion process)
|
||||
noisy_model_input = noise_scheduler.add_noise(model_input, noise, timesteps)
|
||||
# For EDM-style training, we first obtain the sigmas based on the continuous timesteps.
|
||||
# We then precondition the final model inputs based on these sigmas instead of the timesteps.
|
||||
# Follow: Section 5 of https://arxiv.org/abs/2206.00364.
|
||||
if args.do_edm_style_training:
|
||||
sigmas = get_sigmas(timesteps, len(noisy_model_input.shape), noisy_model_input.dtype)
|
||||
if "EDM" in scheduler_type:
|
||||
inp_noisy_latents = noise_scheduler.precondition_inputs(noisy_model_input, sigmas)
|
||||
else:
|
||||
inp_noisy_latents = noisy_model_input / ((sigmas**2 + 1) ** 0.5)
|
||||
|
||||
# time ids
|
||||
add_time_ids = torch.cat(
|
||||
@@ -1966,7 +1852,7 @@ def main(args):
|
||||
}
|
||||
prompt_embeds_input = prompt_embeds.repeat(elems_to_repeat_text_embeds, 1, 1)
|
||||
model_pred = unet(
|
||||
inp_noisy_latents if args.do_edm_style_training else noisy_model_input,
|
||||
noisy_model_input,
|
||||
timesteps,
|
||||
prompt_embeds_input,
|
||||
added_cond_kwargs=unet_added_conditions,
|
||||
@@ -1984,42 +1870,14 @@ def main(args):
|
||||
)
|
||||
prompt_embeds_input = prompt_embeds.repeat(elems_to_repeat_text_embeds, 1, 1)
|
||||
model_pred = unet(
|
||||
inp_noisy_latents if args.do_edm_style_training else noisy_model_input,
|
||||
timesteps,
|
||||
prompt_embeds_input,
|
||||
added_cond_kwargs=unet_added_conditions,
|
||||
noisy_model_input, timesteps, prompt_embeds_input, added_cond_kwargs=unet_added_conditions
|
||||
).sample
|
||||
|
||||
weighting = None
|
||||
if args.do_edm_style_training:
|
||||
# Similar to the input preconditioning, the model predictions are also preconditioned
|
||||
# on noised model inputs (before preconditioning) and the sigmas.
|
||||
# Follow: Section 5 of https://arxiv.org/abs/2206.00364.
|
||||
if "EDM" in scheduler_type:
|
||||
model_pred = noise_scheduler.precondition_outputs(noisy_model_input, model_pred, sigmas)
|
||||
else:
|
||||
if noise_scheduler.config.prediction_type == "epsilon":
|
||||
model_pred = model_pred * (-sigmas) + noisy_model_input
|
||||
elif noise_scheduler.config.prediction_type == "v_prediction":
|
||||
model_pred = model_pred * (-sigmas / (sigmas**2 + 1) ** 0.5) + (
|
||||
noisy_model_input / (sigmas**2 + 1)
|
||||
)
|
||||
# We are not doing weighting here because it tends result in numerical problems.
|
||||
# See: https://github.com/huggingface/diffusers/pull/7126#issuecomment-1968523051
|
||||
# There might be other alternatives for weighting as well:
|
||||
# https://github.com/huggingface/diffusers/pull/7126#discussion_r1505404686
|
||||
if "EDM" not in scheduler_type:
|
||||
weighting = (sigmas**-2.0).float()
|
||||
|
||||
# Get the target for loss depending on the prediction type
|
||||
if noise_scheduler.config.prediction_type == "epsilon":
|
||||
target = model_input if args.do_edm_style_training else noise
|
||||
target = noise
|
||||
elif noise_scheduler.config.prediction_type == "v_prediction":
|
||||
target = (
|
||||
model_input
|
||||
if args.do_edm_style_training
|
||||
else noise_scheduler.get_velocity(model_input, noise, timesteps)
|
||||
)
|
||||
target = noise_scheduler.get_velocity(model_input, noise, timesteps)
|
||||
else:
|
||||
raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
|
||||
|
||||
@@ -2029,28 +1887,10 @@ def main(args):
|
||||
target, target_prior = torch.chunk(target, 2, dim=0)
|
||||
|
||||
# Compute prior loss
|
||||
if weighting is not None:
|
||||
prior_loss = torch.mean(
|
||||
(weighting.float() * (model_pred_prior.float() - target_prior.float()) ** 2).reshape(
|
||||
target_prior.shape[0], -1
|
||||
),
|
||||
1,
|
||||
)
|
||||
prior_loss = prior_loss.mean()
|
||||
else:
|
||||
prior_loss = F.mse_loss(model_pred_prior.float(), target_prior.float(), reduction="mean")
|
||||
prior_loss = F.mse_loss(model_pred_prior.float(), target_prior.float(), reduction="mean")
|
||||
|
||||
if args.snr_gamma is None:
|
||||
if weighting is not None:
|
||||
loss = torch.mean(
|
||||
(weighting.float() * (model_pred.float() - target.float()) ** 2).reshape(
|
||||
target.shape[0], -1
|
||||
),
|
||||
1,
|
||||
)
|
||||
loss = loss.mean()
|
||||
else:
|
||||
loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
|
||||
loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
|
||||
else:
|
||||
# Compute loss-weights as per Section 3.4 of https://arxiv.org/abs/2303.09556.
|
||||
# Since we predict the noise instead of x_0, the original formulation is slightly changed.
|
||||
@@ -2173,18 +2013,17 @@ def main(args):
|
||||
# We train on the simplified learning objective. If we were previously predicting a variance, we need the scheduler to ignore it
|
||||
scheduler_args = {}
|
||||
|
||||
if not args.do_edm_style_training:
|
||||
if "variance_type" in pipeline.scheduler.config:
|
||||
variance_type = pipeline.scheduler.config.variance_type
|
||||
if "variance_type" in pipeline.scheduler.config:
|
||||
variance_type = pipeline.scheduler.config.variance_type
|
||||
|
||||
if variance_type in ["learned", "learned_range"]:
|
||||
variance_type = "fixed_small"
|
||||
if variance_type in ["learned", "learned_range"]:
|
||||
variance_type = "fixed_small"
|
||||
|
||||
scheduler_args["variance_type"] = variance_type
|
||||
scheduler_args["variance_type"] = variance_type
|
||||
|
||||
pipeline.scheduler = DPMSolverMultistepScheduler.from_config(
|
||||
pipeline.scheduler.config, **scheduler_args
|
||||
)
|
||||
pipeline.scheduler = DPMSolverMultistepScheduler.from_config(
|
||||
pipeline.scheduler.config, **scheduler_args
|
||||
)
|
||||
|
||||
pipeline = pipeline.to(accelerator.device)
|
||||
pipeline.set_progress_bar_config(disable=True)
|
||||
@@ -2192,13 +2031,8 @@ def main(args):
|
||||
# run inference
|
||||
generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) if args.seed else None
|
||||
pipeline_args = {"prompt": args.validation_prompt}
|
||||
inference_ctx = (
|
||||
contextlib.nullcontext()
|
||||
if "playground" in args.pretrained_model_name_or_path
|
||||
else torch.cuda.amp.autocast()
|
||||
)
|
||||
|
||||
with inference_ctx:
|
||||
with torch.cuda.amp.autocast():
|
||||
images = [
|
||||
pipeline(**pipeline_args, generator=generator).images[0]
|
||||
for _ in range(args.num_validation_images)
|
||||
@@ -2274,18 +2108,15 @@ def main(args):
|
||||
# We train on the simplified learning objective. If we were previously predicting a variance, we need the scheduler to ignore it
|
||||
scheduler_args = {}
|
||||
|
||||
if not args.do_edm_style_training:
|
||||
if "variance_type" in pipeline.scheduler.config:
|
||||
variance_type = pipeline.scheduler.config.variance_type
|
||||
if "variance_type" in pipeline.scheduler.config:
|
||||
variance_type = pipeline.scheduler.config.variance_type
|
||||
|
||||
if variance_type in ["learned", "learned_range"]:
|
||||
variance_type = "fixed_small"
|
||||
if variance_type in ["learned", "learned_range"]:
|
||||
variance_type = "fixed_small"
|
||||
|
||||
scheduler_args["variance_type"] = variance_type
|
||||
scheduler_args["variance_type"] = variance_type
|
||||
|
||||
pipeline.scheduler = DPMSolverMultistepScheduler.from_config(
|
||||
pipeline.scheduler.config, **scheduler_args
|
||||
)
|
||||
pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config, **scheduler_args)
|
||||
|
||||
# load attention processors
|
||||
pipeline.load_lora_weights(args.output_dir)
|
||||
@@ -2339,7 +2170,6 @@ def main(args):
|
||||
|
||||
save_model_card(
|
||||
model_id if not args.push_to_hub else repo_id,
|
||||
use_dora=args.use_dora,
|
||||
images=images,
|
||||
base_model=args.pretrained_model_name_or_path,
|
||||
train_text_encoder=args.train_text_encoder,
|
||||
|
||||
@@ -1,12 +1,10 @@
|
||||
# Community Pipeline Examples
|
||||
# Community Examples
|
||||
|
||||
> **For more information about community pipelines, please have a look at [this issue](https://github.com/huggingface/diffusers/issues/841).**
|
||||
|
||||
**Community pipeline** examples consist pipelines that have been added by the community.
|
||||
Please have a look at the following tables to get an overview of all community examples. Click on the **Code Example** to get a copy-and-paste ready code example that you can try out.
|
||||
If a community pipeline doesn't work as expected, please open an issue and ping the author on it.
|
||||
|
||||
Please also check out our [Community Scripts](https://github.com/huggingface/diffusers/blob/main/examples/community/README_community_scripts.md) examples for tips and tricks that you can use with diffusers without having to run a community pipeline.
|
||||
**Community** examples consist of both inference and training examples that have been added by the community.
|
||||
Please have a look at the following table to get an overview of all community examples. Click on the **Code Example** to get a copy-and-paste ready code example that you can try out.
|
||||
If a community doesn't work as expected, please open an issue and ping the author on it.
|
||||
|
||||
| Example | Description | Code Example | Colab | Author |
|
||||
|:--------------------------------------------------------------------------------------------------------------------------------------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:------------------------------------------------------------------------------------------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------:|
|
||||
@@ -59,13 +57,12 @@ Please also check out our [Community Scripts](https://github.com/huggingface/dif
|
||||
| DemoFusion Pipeline | Implementation of [DemoFusion: Democratising High-Resolution Image Generation With No $$$](https://arxiv.org/abs/2311.16973) | [DemoFusion Pipeline](#DemoFusion) | - | [Ruoyi Du](https://github.com/RuoyiDu) |
|
||||
| Instaflow Pipeline | Implementation of [InstaFlow! One-Step Stable Diffusion with Rectified Flow](https://arxiv.org/abs/2309.06380) | [Instaflow Pipeline](#instaflow-pipeline) | - | [Ayush Mangal](https://github.com/ayushtues) |
|
||||
| Null-Text Inversion Pipeline | Implement [Null-text Inversion for Editing Real Images using Guided Diffusion Models](https://arxiv.org/abs/2211.09794) as a pipeline. | [Null-Text Inversion](https://github.com/google/prompt-to-prompt/) | - | [Junsheng Luan](https://github.com/Junsheng121) |
|
||||
| Rerender A Video Pipeline | Implementation of [[SIGGRAPH Asia 2023] Rerender A Video: Zero-Shot Text-Guided Video-to-Video Translation](https://arxiv.org/abs/2306.07954) | [Rerender A Video Pipeline](#Rerender-A-Video) | - | [Yifan Zhou](https://github.com/SingleZombie) |
|
||||
| Rerender A Video Pipeline | Implementation of [[SIGGRAPH Asia 2023] Rerender A Video: Zero-Shot Text-Guided Video-to-Video Translation](https://arxiv.org/abs/2306.07954) | [Rerender A Video Pipeline](#Rerender_A_Video) | - | [Yifan Zhou](https://github.com/SingleZombie) |
|
||||
| StyleAligned Pipeline | Implementation of [Style Aligned Image Generation via Shared Attention](https://arxiv.org/abs/2312.02133) | [StyleAligned Pipeline](#stylealigned-pipeline) | [](https://drive.google.com/file/d/15X2E0jFPTajUIjS0FzX50OaHsCbP2lQ0/view?usp=sharing) | [Aryan V S](https://github.com/a-r-r-o-w) |
|
||||
| AnimateDiff Image-To-Video Pipeline | Experimental Image-To-Video support for AnimateDiff (open to improvements) | [AnimateDiff Image To Video Pipeline](#animatediff-image-to-video-pipeline) | [](https://drive.google.com/file/d/1TvzCDPHhfFtdcJZe4RLloAwyoLKuttWK/view?usp=sharing) | [Aryan V S](https://github.com/a-r-r-o-w) |
|
||||
| IP Adapter FaceID Stable Diffusion | Stable Diffusion Pipeline that supports IP Adapter Face ID | [IP Adapter Face ID](#ip-adapter-face-id) | - | [Fabio Rigano](https://github.com/fabiorigano) |
|
||||
| InstantID Pipeline | Stable Diffusion XL Pipeline that supports InstantID | [InstantID Pipeline](#instantid-pipeline) | [](https://huggingface.co/spaces/InstantX/InstantID) | [Haofan Wang](https://github.com/haofanwang) |
|
||||
| UFOGen Scheduler | Scheduler for UFOGen Model (compatible with Stable Diffusion pipelines) | [UFOGen Scheduler](#ufogen-scheduler) | - | [dg845](https://github.com/dg845) |
|
||||
| Stable Diffusion XL IPEX Pipeline | Accelerate Stable Diffusion XL inference pipeline with BF16/FP32 precision on Intel Xeon CPUs with [IPEX](https://github.com/intel/intel-extension-for-pytorch) | [Stable Diffusion XL on IPEX](#stable-diffusion-xl-on-ipex) | - | [Dan Li](https://github.com/ustcuna/) |
|
||||
|
||||
To load a custom pipeline you just need to pass the `custom_pipeline` argument to `DiffusionPipeline`, as one of the files in `diffusers/examples/community`. Feel free to send a PR with your own pipelines, we will merge them quickly.
|
||||
|
||||
@@ -107,7 +104,7 @@ pipeline_output = pipe(
|
||||
# processing_res=768, # (optional) Maximum resolution of processing. If set to 0: will not resize at all. Defaults to 768.
|
||||
# match_input_res=True, # (optional) Resize depth prediction to match input resolution.
|
||||
# batch_size=0, # (optional) Inference batch size, no bigger than `num_ensemble`. If set to 0, the script will automatically decide the proper batch size. Defaults to 0.
|
||||
# color_map="Spectral", # (optional) Colormap used to colorize the depth map. Defaults to "Spectral". Set to `None` to skip colormap generation.
|
||||
# color_map="Spectral", # (optional) Colormap used to colorize the depth map. Defaults to "Spectral".
|
||||
# show_progress_bar=True, # (optional) If true, will show progress bars of the inference progress.
|
||||
)
|
||||
|
||||
@@ -752,7 +749,7 @@ This example produces the following images:
|
||||

|
||||
|
||||
### GlueGen Stable Diffusion Pipeline
|
||||
GlueGen is a minimal adapter that allow alignment between any encoder (Text Encoder of different language, Multilingual Roberta, AudioClip) and CLIP text encoder used in standard Stable Diffusion model. This method allows easy language adaptation to available english Stable Diffusion checkpoints without the need of an image captioning dataset as well as long training hours.
|
||||
GlueGen is a minimal adapter that allow alignment between any encoder (Text Encoder of different language, Multilingual Roberta, AudioClip) and CLIP text encoder used in standard Stable Diffusion model. This method allows easy language adaptation to available english Stable Diffusion checkpoints without the need of an image captioning dataset as well as long training hours.
|
||||
|
||||
Make sure you downloaded `gluenet_French_clip_overnorm_over3_noln.ckpt` for French (there are also pre-trained weights for Chinese, Italian, Japanese, Spanish or train your own) at [GlueGen's official repo](https://github.com/salesforce/GlueGen/tree/main)
|
||||
|
||||
@@ -784,9 +781,9 @@ if __name__ == "__main__":
|
||||
).to(device)
|
||||
pipeline.load_language_adapter("gluenet_French_clip_overnorm_over3_noln.ckpt", num_token=token_max_length, dim=1024, dim_out=768, tensor_norm=tensor_norm)
|
||||
|
||||
prompt = "une voiture sur la plage"
|
||||
prompt = "une voiture sur la plage"
|
||||
|
||||
generator = torch.Generator(device=device).manual_seed(42)
|
||||
generator = torch.Generator(device=device).manual_seed(42)
|
||||
image = pipeline(prompt, generator=generator).images[0]
|
||||
image.save("gluegen_output_fr.png")
|
||||
```
|
||||
@@ -1710,111 +1707,6 @@ print("Latency of StableDiffusionPipeline--fp32",latency)
|
||||
|
||||
```
|
||||
|
||||
### Stable Diffusion XL on IPEX
|
||||
|
||||
This diffusion pipeline aims to accelarate the inference of Stable-Diffusion XL on Intel Xeon CPUs with BF16/FP32 precision using [IPEX](https://github.com/intel/intel-extension-for-pytorch).
|
||||
|
||||
To use this pipeline, you need to:
|
||||
1. Install [IPEX](https://github.com/intel/intel-extension-for-pytorch)
|
||||
|
||||
**Note:** For each PyTorch release, there is a corresponding release of IPEX. Here is the mapping relationship. It is recommended to install Pytorch/IPEX2.0 to get the best performance.
|
||||
|
||||
|PyTorch Version|IPEX Version|
|
||||
|--|--|
|
||||
|[v2.0.\*](https://github.com/pytorch/pytorch/tree/v2.0.1 "v2.0.1")|[v2.0.\*](https://github.com/intel/intel-extension-for-pytorch/tree/v2.0.100+cpu)|
|
||||
|[v1.13.\*](https://github.com/pytorch/pytorch/tree/v1.13.0 "v1.13.0")|[v1.13.\*](https://github.com/intel/intel-extension-for-pytorch/tree/v1.13.100+cpu)|
|
||||
|
||||
You can simply use pip to install IPEX with the latest version.
|
||||
```python
|
||||
python -m pip install intel_extension_for_pytorch
|
||||
```
|
||||
**Note:** To install a specific version, run with the following command:
|
||||
```
|
||||
python -m pip install intel_extension_for_pytorch==<version_name> -f https://developer.intel.com/ipex-whl-stable-cpu
|
||||
```
|
||||
|
||||
2. After pipeline initialization, `prepare_for_ipex()` should be called to enable IPEX accelaration. Supported inference datatypes are Float32 and BFloat16.
|
||||
|
||||
**Note:** The values of `height` and `width` used during preparation with `prepare_for_ipex()` should be the same when running inference with the prepared pipeline.
|
||||
|
||||
```python
|
||||
pipe = StableDiffusionXLPipelineIpex.from_pretrained("stabilityai/sdxl-turbo", low_cpu_mem_usage=True, use_safetensors=True)
|
||||
# value of image height/width should be consistent with the pipeline inference
|
||||
# For Float32
|
||||
pipe.prepare_for_ipex(torch.float32, prompt, height=512, width=512)
|
||||
# For BFloat16
|
||||
pipe.prepare_for_ipex(torch.bfloat16, prompt, height=512, width=512)
|
||||
```
|
||||
|
||||
Then you can use the ipex pipeline in a similar way to the default stable diffusion xl pipeline.
|
||||
```python
|
||||
# value of image height/width should be consistent with 'prepare_for_ipex()'
|
||||
# For Float32
|
||||
image = pipe(prompt, num_inference_steps=num_inference_steps, height=512, width=512, guidance_scale=guidance_scale).images[0]
|
||||
# For BFloat16
|
||||
with torch.cpu.amp.autocast(enabled=True, dtype=torch.bfloat16):
|
||||
image = pipe(prompt, num_inference_steps=num_inference_steps, height=512, width=512, guidance_scale=guidance_scale).images[0]
|
||||
```
|
||||
|
||||
The following code compares the performance of the original stable diffusion xl pipeline with the ipex-optimized pipeline.
|
||||
By using this optimized pipeline, we can get about 1.4-2 times performance boost with BFloat16 on fourth generation of Intel Xeon CPUs,
|
||||
code-named Sapphire Rapids.
|
||||
|
||||
```python
|
||||
import torch
|
||||
from diffusers import StableDiffusionXLPipeline
|
||||
from pipeline_stable_diffusion_xl_ipex import StableDiffusionXLPipelineIpex
|
||||
import time
|
||||
|
||||
prompt = "sailing ship in storm by Rembrandt"
|
||||
model_id = "stabilityai/sdxl-turbo"
|
||||
steps = 4
|
||||
|
||||
# Helper function for time evaluation
|
||||
def elapsed_time(pipeline, nb_pass=3, num_inference_steps=1):
|
||||
# warmup
|
||||
for _ in range(2):
|
||||
images = pipeline(prompt, num_inference_steps=num_inference_steps, height=512, width=512, guidance_scale=0.0).images
|
||||
#time evaluation
|
||||
start = time.time()
|
||||
for _ in range(nb_pass):
|
||||
pipeline(prompt, num_inference_steps=num_inference_steps, height=512, width=512, guidance_scale=0.0)
|
||||
end = time.time()
|
||||
return (end - start) / nb_pass
|
||||
|
||||
############## bf16 inference performance ###############
|
||||
|
||||
# 1. IPEX Pipeline initialization
|
||||
pipe = StableDiffusionXLPipelineIpex.from_pretrained(model_id, low_cpu_mem_usage=True, use_safetensors=True)
|
||||
pipe.prepare_for_ipex(torch.bfloat16, prompt, height=512, width=512)
|
||||
|
||||
# 2. Original Pipeline initialization
|
||||
pipe2 = StableDiffusionXLPipeline.from_pretrained(model_id, low_cpu_mem_usage=True, use_safetensors=True)
|
||||
|
||||
# 3. Compare performance between Original Pipeline and IPEX Pipeline
|
||||
with torch.cpu.amp.autocast(enabled=True, dtype=torch.bfloat16):
|
||||
latency = elapsed_time(pipe, num_inference_steps=steps)
|
||||
print("Latency of StableDiffusionXLPipelineIpex--bf16", latency, "s for total", steps, "steps")
|
||||
latency = elapsed_time(pipe2, num_inference_steps=steps)
|
||||
print("Latency of StableDiffusionXLPipeline--bf16", latency, "s for total", steps, "steps")
|
||||
|
||||
############## fp32 inference performance ###############
|
||||
|
||||
# 1. IPEX Pipeline initialization
|
||||
pipe3 = StableDiffusionXLPipelineIpex.from_pretrained(model_id, low_cpu_mem_usage=True, use_safetensors=True)
|
||||
pipe3.prepare_for_ipex(torch.float32, prompt, height=512, width=512)
|
||||
|
||||
# 2. Original Pipeline initialization
|
||||
pipe4 = StableDiffusionXLPipeline.from_pretrained(model_id, low_cpu_mem_usage=True, use_safetensors=True)
|
||||
|
||||
# 3. Compare performance between Original Pipeline and IPEX Pipeline
|
||||
latency = elapsed_time(pipe3, num_inference_steps=steps)
|
||||
print("Latency of StableDiffusionXLPipelineIpex--fp32", latency, "s for total", steps, "steps")
|
||||
latency = elapsed_time(pipe4, num_inference_steps=steps)
|
||||
print("Latency of StableDiffusionXLPipeline--fp32",latency, "s for total", steps, "steps")
|
||||
|
||||
```
|
||||
|
||||
### CLIP Guided Images Mixing With Stable Diffusion
|
||||
|
||||

|
||||
@@ -1828,7 +1720,7 @@ This approach is using (optional) CoCa model to avoid writing image description.
|
||||
|
||||
This SDXL pipeline support unlimited length prompt and negative prompt, compatible with A1111 prompt weighted style.
|
||||
|
||||
You can provide both `prompt` and `prompt_2`. If only one prompt is provided, `prompt_2` will be a copy of the provided `prompt`. Here is a sample code to use this pipeline.
|
||||
You can provide both `prompt` and `prompt_2`. If only one prompt is provided, `prompt_2` will be a copy of the provided `prompt`. Here is a sample code to use this pipeline.
|
||||
|
||||
```python
|
||||
from diffusers import DiffusionPipeline
|
||||
@@ -1889,7 +1781,7 @@ In the above code, the `prompt2` is appended to the `prompt`, which is more than
|
||||
|
||||
For more results, checkout [PR #6114](https://github.com/huggingface/diffusers/pull/6114).
|
||||
|
||||
### Example Images Mixing (with CoCa)
|
||||
## Example Images Mixing (with CoCa)
|
||||
```python
|
||||
import requests
|
||||
from io import BytesIO
|
||||
@@ -2395,9 +2287,9 @@ Here's a full example for `ReplaceEdit``:
|
||||
import torch
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
from diffusers import DiffusionPipeline
|
||||
from diffusers.pipelines import Prompt2PromptPipeline
|
||||
|
||||
pipe = DiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", custom_pipeline="pipeline_prompt2prompt").to("cuda")
|
||||
pipe = Prompt2PromptPipeline.from_pretrained("CompVis/stable-diffusion-v1-4").to("cuda")
|
||||
|
||||
prompts = ["A turtle playing with a ball",
|
||||
"A monkey playing with a ball"]
|
||||
@@ -2936,7 +2828,7 @@ pipe(prompt =prompt, rp_args = rp_args)
|
||||
|
||||
The Pipeline supports `compel` syntax. Input prompts using the `compel` structure will be automatically applied and processed.
|
||||
|
||||
### Diffusion Posterior Sampling Pipeline
|
||||
## Diffusion Posterior Sampling Pipeline
|
||||
* Reference paper
|
||||
```
|
||||
@article{chung2022diffusion,
|
||||
@@ -3399,7 +3291,7 @@ invert_prompt = "A lying cat"
|
||||
input_image = "siamese.jpg"
|
||||
steps = 50
|
||||
|
||||
# Provide prompt used for generation. Same if reconstruction
|
||||
# Provide prompt used for generation. Same if reconstruction
|
||||
prompt = "A lying cat"
|
||||
# or different if editing.
|
||||
prompt = "A lying dog"
|
||||
@@ -3414,15 +3306,18 @@ inverted_latent, uncond = pipeline.invert(input_image, invert_prompt, num_inner_
|
||||
pipeline(prompt, uncond, inverted_latent, guidance_scale=7.5, num_inference_steps=steps).images[0].save(input_image+".output.jpg")
|
||||
```
|
||||
|
||||
### Rerender A Video
|
||||
### Rerender_A_Video
|
||||
|
||||
This is the Diffusers implementation of zero-shot video-to-video translation pipeline [Rerender A Video](https://github.com/williamyang1991/Rerender_A_Video) (without Ebsynth postprocessing). To run the code, please install gmflow. Then modify the path in `gmflow_dir`. After that, you can run the pipeline with:
|
||||
```
|
||||
This is the Diffusers implementation of zero-shot video-to-video translation pipeline [Rerender_A_Video](https://github.com/williamyang1991/Rerender_A_Video) (without Ebsynth postprocessing). To run the code, please install gmflow. Then modify the path in `examples/community/rerender_a_video.py`:
|
||||
|
||||
```py
|
||||
import sys
|
||||
gmflow_dir = "/path/to/gmflow"
|
||||
sys.path.insert(0, gmflow_dir)
|
||||
```
|
||||
|
||||
After that, you can run the pipeline with:
|
||||
|
||||
```py
|
||||
from diffusers import ControlNetModel, AutoencoderKL, DDIMScheduler
|
||||
from diffusers.utils import export_to_video
|
||||
import numpy as np
|
||||
@@ -3493,7 +3388,7 @@ output_frames = pipe(
|
||||
mask_end=0.8,
|
||||
mask_strength=0.5,
|
||||
negative_prompt='longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality'
|
||||
).frames[0]
|
||||
).frames
|
||||
|
||||
export_to_video(
|
||||
output_frames, "/path/to/video.mp4", 5)
|
||||
@@ -3561,17 +3456,14 @@ pipe.disable_style_aligned()
|
||||
|
||||
This pipeline adds experimental support for the image-to-video task using AnimateDiff. Refer to [this](https://github.com/huggingface/diffusers/pull/6328) PR for more examples and results.
|
||||
|
||||
This pipeline relies on a "hack" discovered by the community that allows the generation of videos given an input image with AnimateDiff. It works by creating a copy of the image `num_frames` times and progressively adding more noise to the image based on the strength and latent interpolation method.
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import MotionAdapter, DiffusionPipeline, DDIMScheduler
|
||||
from diffusers.utils import export_to_gif, load_image
|
||||
|
||||
model_id = "SG161222/Realistic_Vision_V5.1_noVAE"
|
||||
adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2")
|
||||
pipe = DiffusionPipeline.from_pretrained(model_id, motion_adapter=adapter, custom_pipeline="pipeline_animatediff_img2video").to("cuda")
|
||||
pipe.scheduler = DDIMScheduler.from_pretrained(model_id, subfolder="scheduler", clip_sample=False, timestep_spacing="linspace", beta_schedule="linear", steps_offset=1)
|
||||
pipe = DiffusionPipeline.from_pretrained("SG161222/Realistic_Vision_V5.1_noVAE", motion_adapter=adapter, custom_pipeline="pipeline_animatediff_img2video").to("cuda")
|
||||
pipe.scheduler = DDIMScheduler(beta_schedule="linear", steps_offset=1, clip_sample=False, timespace_spacing="linspace")
|
||||
|
||||
image = load_image("snail.png")
|
||||
output = pipe(
|
||||
@@ -3636,8 +3528,8 @@ image = torch.from_numpy(faces[0].normed_embedding).unsqueeze(0)
|
||||
images = pipeline(
|
||||
prompt="A photo of a girl wearing a black dress, holding red roses in hand, upper body, behind is the Eiffel Tower",
|
||||
image_embeds=image,
|
||||
negative_prompt="monochrome, lowres, bad anatomy, worst quality, low quality",
|
||||
num_inference_steps=20, num_images_per_prompt=num_images, width=512, height=704,
|
||||
negative_prompt="monochrome, lowres, bad anatomy, worst quality, low quality",
|
||||
num_inference_steps=20, num_images_per_prompt=num_images, width=512, height=704,
|
||||
generator=generator
|
||||
).images
|
||||
|
||||
|
||||
@@ -1,232 +0,0 @@
|
||||
# Community Scripts
|
||||
|
||||
**Community scripts** consist of inference examples using Diffusers pipelines that have been added by the community.
|
||||
Please have a look at the following table to get an overview of all community examples. Click on the **Code Example** to get a copy-and-paste code example that you can try out.
|
||||
If a community script doesn't work as expected, please open an issue and ping the author on it.
|
||||
|
||||
| Example | Description | Code Example | Colab | Author |
|
||||
|:--------------------------------------------------------------------------------------------------------------------------------------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:------------------------------------------------------------------------------------------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------:|
|
||||
| Using IP-Adapter with negative noise | Using negative noise with IP-adapter to better control the generation (see the [original post](https://github.com/huggingface/diffusers/discussions/7167) on the forum for more details) | [IP-Adapter Negative Noise](#ip-adapter-negative-noise) | | [Álvaro Somoza](https://github.com/asomoza)|
|
||||
| asymmetric tiling |configure seamless image tiling independently for the X and Y axes | [Asymmetric Tiling](#asymmetric-tiling ) | | [alexisrolland](https://github.com/alexisrolland)|
|
||||
|
||||
|
||||
## Example usages
|
||||
|
||||
### IP Adapter Negative Noise
|
||||
|
||||
Diffusers pipelines are fully integrated with IP-Adapter, which allows you to prompt the diffusion model with an image. However, it does not support negative image prompts (there is no `negative_ip_adapter_image` argument) the same way it supports negative text prompts. When you pass an `ip_adapter_image,` it will create a zero-filled tensor as a negative image. This script shows you how to create a negative noise from `ip_adapter_image` and use it to significantly improve the generation quality while preserving the composition of images.
|
||||
|
||||
[cubiq](https://github.com/cubiq) initially developed this feature in his [repository](https://github.com/cubiq/ComfyUI_IPAdapter_plus). The community script was contributed by [asomoza](https://github.com/Somoza). You can find more details about this experimentation [this discussion](https://github.com/huggingface/diffusers/discussions/7167)
|
||||
|
||||
IP-Adapter without negative noise
|
||||
|source|result|
|
||||
|---|---|
|
||||
|||
|
||||
|
||||
IP-Adapter with negative noise
|
||||
|source|result|
|
||||
|---|---|
|
||||
|||
|
||||
|
||||
```python
|
||||
import torch
|
||||
|
||||
from diffusers import AutoencoderKL, DPMSolverMultistepScheduler, StableDiffusionXLPipeline
|
||||
from diffusers.models import ImageProjection
|
||||
from diffusers.utils import load_image
|
||||
|
||||
|
||||
def encode_image(
|
||||
image_encoder,
|
||||
feature_extractor,
|
||||
image,
|
||||
device,
|
||||
num_images_per_prompt,
|
||||
output_hidden_states=None,
|
||||
negative_image=None,
|
||||
):
|
||||
dtype = next(image_encoder.parameters()).dtype
|
||||
|
||||
if not isinstance(image, torch.Tensor):
|
||||
image = feature_extractor(image, return_tensors="pt").pixel_values
|
||||
|
||||
image = image.to(device=device, dtype=dtype)
|
||||
if output_hidden_states:
|
||||
image_enc_hidden_states = image_encoder(image, output_hidden_states=True).hidden_states[-2]
|
||||
image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
|
||||
|
||||
if negative_image is None:
|
||||
uncond_image_enc_hidden_states = image_encoder(
|
||||
torch.zeros_like(image), output_hidden_states=True
|
||||
).hidden_states[-2]
|
||||
else:
|
||||
if not isinstance(negative_image, torch.Tensor):
|
||||
negative_image = feature_extractor(negative_image, return_tensors="pt").pixel_values
|
||||
negative_image = negative_image.to(device=device, dtype=dtype)
|
||||
uncond_image_enc_hidden_states = image_encoder(negative_image, output_hidden_states=True).hidden_states[-2]
|
||||
|
||||
uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
|
||||
return image_enc_hidden_states, uncond_image_enc_hidden_states
|
||||
else:
|
||||
image_embeds = image_encoder(image).image_embeds
|
||||
image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
|
||||
uncond_image_embeds = torch.zeros_like(image_embeds)
|
||||
|
||||
return image_embeds, uncond_image_embeds
|
||||
|
||||
|
||||
@torch.no_grad()
|
||||
def prepare_ip_adapter_image_embeds(
|
||||
unet,
|
||||
image_encoder,
|
||||
feature_extractor,
|
||||
ip_adapter_image,
|
||||
do_classifier_free_guidance,
|
||||
device,
|
||||
num_images_per_prompt,
|
||||
ip_adapter_negative_image=None,
|
||||
):
|
||||
if not isinstance(ip_adapter_image, list):
|
||||
ip_adapter_image = [ip_adapter_image]
|
||||
|
||||
if len(ip_adapter_image) != len(unet.encoder_hid_proj.image_projection_layers):
|
||||
raise ValueError(
|
||||
f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
|
||||
)
|
||||
|
||||
image_embeds = []
|
||||
for single_ip_adapter_image, image_proj_layer in zip(
|
||||
ip_adapter_image, unet.encoder_hid_proj.image_projection_layers
|
||||
):
|
||||
output_hidden_state = not isinstance(image_proj_layer, ImageProjection)
|
||||
single_image_embeds, single_negative_image_embeds = encode_image(
|
||||
image_encoder,
|
||||
feature_extractor,
|
||||
single_ip_adapter_image,
|
||||
device,
|
||||
1,
|
||||
output_hidden_state,
|
||||
negative_image=ip_adapter_negative_image,
|
||||
)
|
||||
single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0)
|
||||
single_negative_image_embeds = torch.stack([single_negative_image_embeds] * num_images_per_prompt, dim=0)
|
||||
|
||||
if do_classifier_free_guidance:
|
||||
single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
|
||||
single_image_embeds = single_image_embeds.to(device)
|
||||
|
||||
image_embeds.append(single_image_embeds)
|
||||
|
||||
return image_embeds
|
||||
|
||||
|
||||
vae = AutoencoderKL.from_pretrained(
|
||||
"madebyollin/sdxl-vae-fp16-fix",
|
||||
torch_dtype=torch.float16,
|
||||
).to("cuda")
|
||||
|
||||
pipeline = StableDiffusionXLPipeline.from_pretrained(
|
||||
"RunDiffusion/Juggernaut-XL-v9",
|
||||
torch_dtype=torch.float16,
|
||||
vae=vae,
|
||||
variant="fp16",
|
||||
).to("cuda")
|
||||
|
||||
pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config)
|
||||
pipeline.scheduler.config.use_karras_sigmas = True
|
||||
|
||||
pipeline.load_ip_adapter(
|
||||
"h94/IP-Adapter",
|
||||
subfolder="sdxl_models",
|
||||
weight_name="ip-adapter-plus_sdxl_vit-h.safetensors",
|
||||
image_encoder_folder="models/image_encoder",
|
||||
)
|
||||
pipeline.set_ip_adapter_scale(0.7)
|
||||
|
||||
ip_image = load_image("source.png")
|
||||
negative_ip_image = load_image("noise.png")
|
||||
|
||||
image_embeds = prepare_ip_adapter_image_embeds(
|
||||
unet=pipeline.unet,
|
||||
image_encoder=pipeline.image_encoder,
|
||||
feature_extractor=pipeline.feature_extractor,
|
||||
ip_adapter_image=[[ip_image]],
|
||||
do_classifier_free_guidance=True,
|
||||
device="cuda",
|
||||
num_images_per_prompt=1,
|
||||
ip_adapter_negative_image=negative_ip_image,
|
||||
)
|
||||
|
||||
|
||||
prompt = "cinematic photo of a cyborg in the city, 4k, high quality, intricate, highly detailed"
|
||||
negative_prompt = "blurry, smooth, plastic"
|
||||
|
||||
image = pipeline(
|
||||
prompt=prompt,
|
||||
negative_prompt=negative_prompt,
|
||||
ip_adapter_image_embeds=image_embeds,
|
||||
guidance_scale=6.0,
|
||||
num_inference_steps=25,
|
||||
generator=torch.Generator(device="cpu").manual_seed(1556265306),
|
||||
).images[0]
|
||||
|
||||
image.save("result.png")
|
||||
```
|
||||
|
||||
### Asymmetric Tiling
|
||||
Stable Diffusion is not trained to generate seamless textures. However, you can use this simple script to add tiling to your generation. This script is contributed by [alexisrolland](https://github.com/alexisrolland). See more details in the [this issue](https://github.com/huggingface/diffusers/issues/556)
|
||||
|
||||
|
||||
|Generated|Tiled|
|
||||
|---|---|
|
||||
|||
|
||||
|
||||
|
||||
```py
|
||||
import torch
|
||||
from typing import Optional
|
||||
from diffusers import StableDiffusionPipeline
|
||||
from diffusers.models.lora import LoRACompatibleConv
|
||||
|
||||
def seamless_tiling(pipeline, x_axis, y_axis):
|
||||
def asymmetric_conv2d_convforward(self, input: torch.Tensor, weight: torch.Tensor, bias: Optional[torch.Tensor] = None):
|
||||
self.paddingX = (self._reversed_padding_repeated_twice[0], self._reversed_padding_repeated_twice[1], 0, 0)
|
||||
self.paddingY = (0, 0, self._reversed_padding_repeated_twice[2], self._reversed_padding_repeated_twice[3])
|
||||
working = torch.nn.functional.pad(input, self.paddingX, mode=x_mode)
|
||||
working = torch.nn.functional.pad(working, self.paddingY, mode=y_mode)
|
||||
return torch.nn.functional.conv2d(working, weight, bias, self.stride, torch.nn.modules.utils._pair(0), self.dilation, self.groups)
|
||||
x_mode = 'circular' if x_axis else 'constant'
|
||||
y_mode = 'circular' if y_axis else 'constant'
|
||||
targets = [pipeline.vae, pipeline.text_encoder, pipeline.unet]
|
||||
convolution_layers = []
|
||||
for target in targets:
|
||||
for module in target.modules():
|
||||
if isinstance(module, torch.nn.Conv2d):
|
||||
convolution_layers.append(module)
|
||||
for layer in convolution_layers:
|
||||
if isinstance(layer, LoRACompatibleConv) and layer.lora_layer is None:
|
||||
layer.lora_layer = lambda * x: 0
|
||||
layer._conv_forward = asymmetric_conv2d_convforward.__get__(layer, torch.nn.Conv2d)
|
||||
return pipeline
|
||||
|
||||
pipeline = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True)
|
||||
pipeline.enable_model_cpu_offload()
|
||||
prompt = ["texture of a red brick wall"]
|
||||
seed = 123456
|
||||
generator = torch.Generator(device='cuda').manual_seed(seed)
|
||||
|
||||
pipeline = seamless_tiling(pipeline=pipeline, x_axis=True, y_axis=True)
|
||||
image = pipeline(
|
||||
prompt=prompt,
|
||||
width=512,
|
||||
height=512,
|
||||
num_inference_steps=20,
|
||||
guidance_scale=7,
|
||||
num_images_per_prompt=1,
|
||||
generator=generator
|
||||
).images[0]
|
||||
seamless_tiling(pipeline=pipeline, x_axis=False, y_axis=False)
|
||||
|
||||
torch.cuda.empty_cache()
|
||||
image.save('image.png')
|
||||
```
|
||||
@@ -81,8 +81,6 @@ class CheckpointMergerPipeline(DiffusionPipeline):
|
||||
|
||||
force - Whether to ignore mismatch in model_config.json for the current models. Defaults to False.
|
||||
|
||||
variant - which variant of a pretrained model to load, e.g. "fp16" (None)
|
||||
|
||||
"""
|
||||
# Default kwargs from DiffusionPipeline
|
||||
cache_dir = kwargs.pop("cache_dir", None)
|
||||
@@ -91,7 +89,6 @@ class CheckpointMergerPipeline(DiffusionPipeline):
|
||||
proxies = kwargs.pop("proxies", None)
|
||||
local_files_only = kwargs.pop("local_files_only", False)
|
||||
token = kwargs.pop("token", None)
|
||||
variant = kwargs.pop("variant", None)
|
||||
revision = kwargs.pop("revision", None)
|
||||
torch_dtype = kwargs.pop("torch_dtype", None)
|
||||
device_map = kwargs.pop("device_map", None)
|
||||
@@ -176,10 +173,7 @@ class CheckpointMergerPipeline(DiffusionPipeline):
|
||||
# Step 3:-
|
||||
# Load the first checkpoint as a diffusion pipeline and modify its module state_dict in place
|
||||
final_pipe = DiffusionPipeline.from_pretrained(
|
||||
cached_folders[0],
|
||||
torch_dtype=torch_dtype,
|
||||
device_map=device_map,
|
||||
variant=variant,
|
||||
cached_folders[0], torch_dtype=torch_dtype, device_map=device_map
|
||||
)
|
||||
final_pipe.to(self.device)
|
||||
|
||||
|
||||
@@ -12,12 +12,12 @@ from transformers import CLIPFeatureExtractor, CLIPModel, CLIPTextModel, CLIPTok
|
||||
from diffusers import (
|
||||
AutoencoderKL,
|
||||
DDIMScheduler,
|
||||
DiffusionPipeline,
|
||||
DPMSolverMultistepScheduler,
|
||||
LMSDiscreteScheduler,
|
||||
PNDMScheduler,
|
||||
UNet2DConditionModel,
|
||||
)
|
||||
from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
|
||||
from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import StableDiffusionPipelineOutput
|
||||
from diffusers.utils import PIL_INTERPOLATION
|
||||
from diffusers.utils.torch_utils import randn_tensor
|
||||
@@ -77,7 +77,7 @@ def set_requires_grad(model, value):
|
||||
param.requires_grad = value
|
||||
|
||||
|
||||
class CLIPGuidedImagesMixingStableDiffusion(DiffusionPipeline, StableDiffusionMixin):
|
||||
class CLIPGuidedImagesMixingStableDiffusion(DiffusionPipeline):
|
||||
def __init__(
|
||||
self,
|
||||
vae: AutoencoderKL,
|
||||
@@ -113,6 +113,16 @@ class CLIPGuidedImagesMixingStableDiffusion(DiffusionPipeline, StableDiffusionMi
|
||||
set_requires_grad(self.text_encoder, False)
|
||||
set_requires_grad(self.clip_model, False)
|
||||
|
||||
def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
|
||||
if slice_size == "auto":
|
||||
# half the attention head size is usually a good trade-off between
|
||||
# speed and memory
|
||||
slice_size = self.unet.config.attention_head_dim // 2
|
||||
self.unet.set_attention_slice(slice_size)
|
||||
|
||||
def disable_attention_slicing(self):
|
||||
self.enable_attention_slicing(None)
|
||||
|
||||
def freeze_vae(self):
|
||||
set_requires_grad(self.vae, False)
|
||||
|
||||
|
||||
@@ -10,12 +10,12 @@ from transformers import CLIPImageProcessor, CLIPModel, CLIPTextModel, CLIPToken
|
||||
from diffusers import (
|
||||
AutoencoderKL,
|
||||
DDIMScheduler,
|
||||
DiffusionPipeline,
|
||||
DPMSolverMultistepScheduler,
|
||||
LMSDiscreteScheduler,
|
||||
PNDMScheduler,
|
||||
UNet2DConditionModel,
|
||||
)
|
||||
from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
|
||||
from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import StableDiffusionPipelineOutput
|
||||
|
||||
|
||||
@@ -51,7 +51,7 @@ def set_requires_grad(model, value):
|
||||
param.requires_grad = value
|
||||
|
||||
|
||||
class CLIPGuidedStableDiffusion(DiffusionPipeline, StableDiffusionMixin):
|
||||
class CLIPGuidedStableDiffusion(DiffusionPipeline):
|
||||
"""CLIP guided stable diffusion based on the amazing repo by @crowsonkb and @Jack000
|
||||
- https://github.com/Jack000/glid-3-xl
|
||||
- https://github.dev/crowsonkb/k-diffusion
|
||||
@@ -89,6 +89,16 @@ class CLIPGuidedStableDiffusion(DiffusionPipeline, StableDiffusionMixin):
|
||||
set_requires_grad(self.text_encoder, False)
|
||||
set_requires_grad(self.clip_model, False)
|
||||
|
||||
def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
|
||||
if slice_size == "auto":
|
||||
# half the attention head size is usually a good trade-off between
|
||||
# speed and memory
|
||||
slice_size = self.unet.config.attention_head_dim // 2
|
||||
self.unet.set_attention_slice(slice_size)
|
||||
|
||||
def disable_attention_slicing(self):
|
||||
self.enable_attention_slicing(None)
|
||||
|
||||
def freeze_vae(self):
|
||||
set_requires_grad(self.vae, False)
|
||||
|
||||
|
||||
@@ -12,12 +12,12 @@ from transformers import CLIPFeatureExtractor, CLIPModel, CLIPTextModel, CLIPTok
|
||||
from diffusers import (
|
||||
AutoencoderKL,
|
||||
DDIMScheduler,
|
||||
DiffusionPipeline,
|
||||
DPMSolverMultistepScheduler,
|
||||
LMSDiscreteScheduler,
|
||||
PNDMScheduler,
|
||||
UNet2DConditionModel,
|
||||
)
|
||||
from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
|
||||
from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import StableDiffusionPipelineOutput
|
||||
from diffusers.utils import PIL_INTERPOLATION, deprecate
|
||||
from diffusers.utils.torch_utils import randn_tensor
|
||||
@@ -125,7 +125,7 @@ def set_requires_grad(model, value):
|
||||
param.requires_grad = value
|
||||
|
||||
|
||||
class CLIPGuidedStableDiffusion(DiffusionPipeline, StableDiffusionMixin):
|
||||
class CLIPGuidedStableDiffusion(DiffusionPipeline):
|
||||
"""CLIP guided stable diffusion based on the amazing repo by @crowsonkb and @Jack000
|
||||
- https://github.com/Jack000/glid-3-xl
|
||||
- https://github.dev/crowsonkb/k-diffusion
|
||||
@@ -163,6 +163,16 @@ class CLIPGuidedStableDiffusion(DiffusionPipeline, StableDiffusionMixin):
|
||||
set_requires_grad(self.text_encoder, False)
|
||||
set_requires_grad(self.clip_model, False)
|
||||
|
||||
def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
|
||||
if slice_size == "auto":
|
||||
# half the attention head size is usually a good trade-off between
|
||||
# speed and memory
|
||||
slice_size = self.unet.config.attention_head_dim // 2
|
||||
self.unet.set_attention_slice(slice_size)
|
||||
|
||||
def disable_attention_slicing(self):
|
||||
self.enable_attention_slicing(None)
|
||||
|
||||
def freeze_vae(self):
|
||||
set_requires_grad(self.vae, False)
|
||||
|
||||
|
||||
@@ -22,7 +22,6 @@ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
|
||||
from diffusers import DiffusionPipeline
|
||||
from diffusers.configuration_utils import FrozenDict
|
||||
from diffusers.models import AutoencoderKL, UNet2DConditionModel
|
||||
from diffusers.pipelines.pipeline_utils import StableDiffusionMixin
|
||||
from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import StableDiffusionPipelineOutput
|
||||
from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
|
||||
from diffusers.schedulers import (
|
||||
@@ -33,13 +32,13 @@ from diffusers.schedulers import (
|
||||
LMSDiscreteScheduler,
|
||||
PNDMScheduler,
|
||||
)
|
||||
from diffusers.utils import deprecate, logging
|
||||
from diffusers.utils import deprecate, is_accelerate_available, logging
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
|
||||
|
||||
|
||||
class ComposableStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin):
|
||||
class ComposableStableDiffusionPipeline(DiffusionPipeline):
|
||||
r"""
|
||||
Pipeline for text-to-image generation using Stable Diffusion.
|
||||
|
||||
@@ -165,6 +164,62 @@ class ComposableStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin)
|
||||
self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
|
||||
self.register_to_config(requires_safety_checker=requires_safety_checker)
|
||||
|
||||
def enable_vae_slicing(self):
|
||||
r"""
|
||||
Enable sliced VAE decoding.
|
||||
|
||||
When this option is enabled, the VAE will split the input tensor in slices to compute decoding in several
|
||||
steps. This is useful to save some memory and allow larger batch sizes.
|
||||
"""
|
||||
self.vae.enable_slicing()
|
||||
|
||||
def disable_vae_slicing(self):
|
||||
r"""
|
||||
Disable sliced VAE decoding. If `enable_vae_slicing` was previously invoked, this method will go back to
|
||||
computing decoding in one step.
|
||||
"""
|
||||
self.vae.disable_slicing()
|
||||
|
||||
def enable_sequential_cpu_offload(self, gpu_id=0):
|
||||
r"""
|
||||
Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
|
||||
text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a
|
||||
`torch.device('meta') and loaded to GPU only when their specific submodule has its `forward` method called.
|
||||
"""
|
||||
if is_accelerate_available():
|
||||
from accelerate import cpu_offload
|
||||
else:
|
||||
raise ImportError("Please install accelerate via `pip install accelerate`")
|
||||
|
||||
device = torch.device(f"cuda:{gpu_id}")
|
||||
|
||||
for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]:
|
||||
if cpu_offloaded_model is not None:
|
||||
cpu_offload(cpu_offloaded_model, device)
|
||||
|
||||
if self.safety_checker is not None:
|
||||
# TODO(Patrick) - there is currently a bug with cpu offload of nn.Parameter in accelerate
|
||||
# fix by only offloading self.safety_checker for now
|
||||
cpu_offload(self.safety_checker.vision_model, device)
|
||||
|
||||
@property
|
||||
def _execution_device(self):
|
||||
r"""
|
||||
Returns the device on which the pipeline's models will be executed. After calling
|
||||
`pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module
|
||||
hooks.
|
||||
"""
|
||||
if self.device != torch.device("meta") or not hasattr(self.unet, "_hf_hook"):
|
||||
return self.device
|
||||
for module in self.unet.modules():
|
||||
if (
|
||||
hasattr(module, "_hf_hook")
|
||||
and hasattr(module._hf_hook, "execution_device")
|
||||
and module._hf_hook.execution_device is not None
|
||||
):
|
||||
return torch.device(module._hf_hook.execution_device)
|
||||
return self.device
|
||||
|
||||
def _encode_prompt(self, prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt):
|
||||
r"""
|
||||
Encodes the prompt into text encoder hidden states.
|
||||
|
||||
@@ -10,7 +10,6 @@ from diffusers.image_processor import VaeImageProcessor
|
||||
from diffusers.loaders import LoraLoaderMixin
|
||||
from diffusers.models import AutoencoderKL, UNet2DConditionModel
|
||||
from diffusers.models.lora import adjust_lora_scale_text_encoder
|
||||
from diffusers.pipelines.pipeline_utils import StableDiffusionMixin
|
||||
from diffusers.pipelines.stable_diffusion.pipeline_output import StableDiffusionPipelineOutput
|
||||
from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
|
||||
from diffusers.schedulers import KarrasDiffusionSchedulers
|
||||
@@ -194,7 +193,7 @@ def retrieve_timesteps(
|
||||
return timesteps, num_inference_steps
|
||||
|
||||
|
||||
class GlueGenStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin, LoraLoaderMixin):
|
||||
class GlueGenStableDiffusionPipeline(DiffusionPipeline, LoraLoaderMixin):
|
||||
def __init__(
|
||||
self,
|
||||
vae: AutoencoderKL,
|
||||
@@ -242,6 +241,35 @@ class GlueGenStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin, Lo
|
||||
)
|
||||
self.language_adapter.load_state_dict(torch.load(model_path))
|
||||
|
||||
def enable_vae_slicing(self):
|
||||
r"""
|
||||
Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
|
||||
compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
|
||||
"""
|
||||
self.vae.enable_slicing()
|
||||
|
||||
def disable_vae_slicing(self):
|
||||
r"""
|
||||
Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
|
||||
computing decoding in one step.
|
||||
"""
|
||||
self.vae.disable_slicing()
|
||||
|
||||
def enable_vae_tiling(self):
|
||||
r"""
|
||||
Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
|
||||
compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
|
||||
processing larger images.
|
||||
"""
|
||||
self.vae.enable_tiling()
|
||||
|
||||
def disable_vae_tiling(self):
|
||||
r"""
|
||||
Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
|
||||
computing decoding in one step.
|
||||
"""
|
||||
self.vae.disable_tiling()
|
||||
|
||||
def _adapt_language(self, prompt_embeds: torch.FloatTensor):
|
||||
prompt_embeds = prompt_embeds / 3
|
||||
prompt_embeds = self.language_adapter(prompt_embeds) * (self.tensor_norm / 2)
|
||||
@@ -516,6 +544,32 @@ class GlueGenStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin, Lo
|
||||
latents = latents * self.scheduler.init_noise_sigma
|
||||
return latents
|
||||
|
||||
def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
|
||||
r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497.
|
||||
|
||||
The suffixes after the scaling factors represent the stages where they are being applied.
|
||||
|
||||
Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values
|
||||
that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
|
||||
|
||||
Args:
|
||||
s1 (`float`):
|
||||
Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
|
||||
mitigate "oversmoothing effect" in the enhanced denoising process.
|
||||
s2 (`float`):
|
||||
Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
|
||||
mitigate "oversmoothing effect" in the enhanced denoising process.
|
||||
b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
|
||||
b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
|
||||
"""
|
||||
if not hasattr(self, "unet"):
|
||||
raise ValueError("The pipeline must have `unet` for using FreeU.")
|
||||
self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2)
|
||||
|
||||
def disable_freeu(self):
|
||||
"""Disables the FreeU mechanism if enabled."""
|
||||
self.unet.disable_freeu()
|
||||
|
||||
# Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
|
||||
def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32):
|
||||
"""
|
||||
|
||||
@@ -1,8 +1,7 @@
|
||||
"""
|
||||
modeled after the textual_inversion.py / train_dreambooth.py and the work
|
||||
of justinpinkney here: https://github.com/justinpinkney/stable-diffusion/blob/main/notebooks/imagic.ipynb
|
||||
modeled after the textual_inversion.py / train_dreambooth.py and the work
|
||||
of justinpinkney here: https://github.com/justinpinkney/stable-diffusion/blob/main/notebooks/imagic.ipynb
|
||||
"""
|
||||
|
||||
import inspect
|
||||
import warnings
|
||||
from typing import List, Optional, Union
|
||||
@@ -20,7 +19,6 @@ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
|
||||
|
||||
from diffusers import DiffusionPipeline
|
||||
from diffusers.models import AutoencoderKL, UNet2DConditionModel
|
||||
from diffusers.pipelines.pipeline_utils import StableDiffusionMixin
|
||||
from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
|
||||
from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
|
||||
from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
|
||||
@@ -58,7 +56,7 @@ def preprocess(image):
|
||||
return 2.0 * image - 1.0
|
||||
|
||||
|
||||
class ImagicStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin):
|
||||
class ImagicStableDiffusionPipeline(DiffusionPipeline):
|
||||
r"""
|
||||
Pipeline for imagic image editing.
|
||||
See paper here: https://arxiv.org/pdf/2210.09276.pdf
|
||||
@@ -107,6 +105,31 @@ class ImagicStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin):
|
||||
feature_extractor=feature_extractor,
|
||||
)
|
||||
|
||||
def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
|
||||
r"""
|
||||
Enable sliced attention computation.
|
||||
When this option is enabled, the attention module will split the input tensor in slices, to compute attention
|
||||
in several steps. This is useful to save some memory in exchange for a small speed decrease.
|
||||
Args:
|
||||
slice_size (`str` or `int`, *optional*, defaults to `"auto"`):
|
||||
When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If
|
||||
a number is provided, uses as many slices as `attention_head_dim // slice_size`. In this case,
|
||||
`attention_head_dim` must be a multiple of `slice_size`.
|
||||
"""
|
||||
if slice_size == "auto":
|
||||
# half the attention head size is usually a good trade-off between
|
||||
# speed and memory
|
||||
slice_size = self.unet.config.attention_head_dim // 2
|
||||
self.unet.set_attention_slice(slice_size)
|
||||
|
||||
def disable_attention_slicing(self):
|
||||
r"""
|
||||
Disable sliced attention computation. If `enable_attention_slicing` was previously invoked, this method will go
|
||||
back to computing attention in one step.
|
||||
"""
|
||||
# set slice_size = `None` to disable `attention slicing`
|
||||
self.enable_attention_slicing(None)
|
||||
|
||||
def train(
|
||||
self,
|
||||
prompt: Union[str, List[str]],
|
||||
@@ -323,9 +346,8 @@ class ImagicStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin):
|
||||
r"""
|
||||
Function invoked when calling the pipeline for generation.
|
||||
Args:
|
||||
alpha (`float`, *optional*, defaults to 1.2):
|
||||
The interpolation factor between the original and optimized text embeddings. A value closer to 0
|
||||
will resemble the original input image.
|
||||
prompt (`str` or `List[str]`):
|
||||
The prompt or prompts to guide the image generation.
|
||||
height (`int`, *optional*, defaults to 512):
|
||||
The height in pixels of the generated image.
|
||||
width (`int`, *optional*, defaults to 512):
|
||||
@@ -339,18 +361,22 @@ class ImagicStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin):
|
||||
Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
|
||||
1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
|
||||
usually at the expense of lower image quality.
|
||||
eta (`float`, *optional*, defaults to 0.0):
|
||||
Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
|
||||
[`schedulers.DDIMScheduler`], will be ignored for others.
|
||||
generator (`torch.Generator`, *optional*):
|
||||
A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
|
||||
deterministic.
|
||||
latents (`torch.FloatTensor`, *optional*):
|
||||
Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
|
||||
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
|
||||
tensor will ge generated by sampling using the supplied random `generator`.
|
||||
output_type (`str`, *optional*, defaults to `"pil"`):
|
||||
The output format of the generate image. Choose between
|
||||
[PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `nd.array`.
|
||||
return_dict (`bool`, *optional*, defaults to `True`):
|
||||
Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
|
||||
plain tuple.
|
||||
eta (`float`, *optional*, defaults to 0.0):
|
||||
Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
|
||||
[`schedulers.DDIMScheduler`], will be ignored for others.
|
||||
Returns:
|
||||
[`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
|
||||
[`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
|
||||
|
||||
@@ -129,6 +129,33 @@ class ImageToImageInpaintingPipeline(DiffusionPipeline):
|
||||
feature_extractor=feature_extractor,
|
||||
)
|
||||
|
||||
def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
|
||||
r"""
|
||||
Enable sliced attention computation.
|
||||
|
||||
When this option is enabled, the attention module will split the input tensor in slices, to compute attention
|
||||
in several steps. This is useful to save some memory in exchange for a small speed decrease.
|
||||
|
||||
Args:
|
||||
slice_size (`str` or `int`, *optional*, defaults to `"auto"`):
|
||||
When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If
|
||||
a number is provided, uses as many slices as `attention_head_dim // slice_size`. In this case,
|
||||
`attention_head_dim` must be a multiple of `slice_size`.
|
||||
"""
|
||||
if slice_size == "auto":
|
||||
# half the attention head size is usually a good trade-off between
|
||||
# speed and memory
|
||||
slice_size = self.unet.config.attention_head_dim // 2
|
||||
self.unet.set_attention_slice(slice_size)
|
||||
|
||||
def disable_attention_slicing(self):
|
||||
r"""
|
||||
Disable sliced attention computation. If `enable_attention_slicing` was previously invoked, this method will go
|
||||
back to computing attention in one step.
|
||||
"""
|
||||
# set slice_size = `None` to disable `attention slicing`
|
||||
self.enable_attention_slicing(None)
|
||||
|
||||
@torch.no_grad()
|
||||
def __call__(
|
||||
self,
|
||||
|
||||
@@ -24,7 +24,7 @@ from diffusers.image_processor import VaeImageProcessor
|
||||
from diffusers.loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin
|
||||
from diffusers.models import AutoencoderKL, UNet2DConditionModel
|
||||
from diffusers.models.lora import adjust_lora_scale_text_encoder
|
||||
from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
|
||||
from diffusers.pipelines.pipeline_utils import DiffusionPipeline
|
||||
from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
|
||||
from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
|
||||
from diffusers.schedulers import KarrasDiffusionSchedulers
|
||||
@@ -52,9 +52,7 @@ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
|
||||
return noise_cfg
|
||||
|
||||
|
||||
class InstaFlowPipeline(
|
||||
DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin
|
||||
):
|
||||
class InstaFlowPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin):
|
||||
r"""
|
||||
Pipeline for text-to-image generation using Rectified Flow and Euler discretization.
|
||||
This customized pipeline is based on StableDiffusionPipeline from the official Diffusers library (0.21.4)
|
||||
@@ -182,6 +180,35 @@ class InstaFlowPipeline(
|
||||
self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
|
||||
self.register_to_config(requires_safety_checker=requires_safety_checker)
|
||||
|
||||
def enable_vae_slicing(self):
|
||||
r"""
|
||||
Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
|
||||
compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
|
||||
"""
|
||||
self.vae.enable_slicing()
|
||||
|
||||
def disable_vae_slicing(self):
|
||||
r"""
|
||||
Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
|
||||
computing decoding in one step.
|
||||
"""
|
||||
self.vae.disable_slicing()
|
||||
|
||||
def enable_vae_tiling(self):
|
||||
r"""
|
||||
Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
|
||||
compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
|
||||
processing larger images.
|
||||
"""
|
||||
self.vae.enable_tiling()
|
||||
|
||||
def disable_vae_tiling(self):
|
||||
r"""
|
||||
Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
|
||||
computing decoding in one step.
|
||||
"""
|
||||
self.vae.disable_tiling()
|
||||
|
||||
def _encode_prompt(
|
||||
self,
|
||||
prompt,
|
||||
|
||||
@@ -7,9 +7,9 @@ import numpy as np
|
||||
import torch
|
||||
from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
|
||||
|
||||
from diffusers import DiffusionPipeline
|
||||
from diffusers.configuration_utils import FrozenDict
|
||||
from diffusers.models import AutoencoderKL, UNet2DConditionModel
|
||||
from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
|
||||
from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
|
||||
from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
|
||||
from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
|
||||
@@ -46,7 +46,7 @@ def slerp(t, v0, v1, DOT_THRESHOLD=0.9995):
|
||||
return v2
|
||||
|
||||
|
||||
class StableDiffusionWalkPipeline(DiffusionPipeline, StableDiffusionMixin):
|
||||
class StableDiffusionWalkPipeline(DiffusionPipeline):
|
||||
r"""
|
||||
Pipeline for text-to-image generation using Stable Diffusion.
|
||||
|
||||
@@ -120,6 +120,33 @@ class StableDiffusionWalkPipeline(DiffusionPipeline, StableDiffusionMixin):
|
||||
feature_extractor=feature_extractor,
|
||||
)
|
||||
|
||||
def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
|
||||
r"""
|
||||
Enable sliced attention computation.
|
||||
|
||||
When this option is enabled, the attention module will split the input tensor in slices, to compute attention
|
||||
in several steps. This is useful to save some memory in exchange for a small speed decrease.
|
||||
|
||||
Args:
|
||||
slice_size (`str` or `int`, *optional*, defaults to `"auto"`):
|
||||
When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If
|
||||
a number is provided, uses as many slices as `attention_head_dim // slice_size`. In this case,
|
||||
`attention_head_dim` must be a multiple of `slice_size`.
|
||||
"""
|
||||
if slice_size == "auto":
|
||||
# half the attention head size is usually a good trade-off between
|
||||
# speed and memory
|
||||
slice_size = self.unet.config.attention_head_dim // 2
|
||||
self.unet.set_attention_slice(slice_size)
|
||||
|
||||
def disable_attention_slicing(self):
|
||||
r"""
|
||||
Disable sliced attention computation. If `enable_attention_slicing` was previously invoked, this method will go
|
||||
back to computing attention in one step.
|
||||
"""
|
||||
# set slice_size = `None` to disable `attention slicing`
|
||||
self.enable_attention_slicing(None)
|
||||
|
||||
@torch.no_grad()
|
||||
def __call__(
|
||||
self,
|
||||
|
||||
@@ -26,8 +26,9 @@ from diffusers.configuration_utils import FrozenDict
|
||||
from diffusers.image_processor import VaeImageProcessor
|
||||
from diffusers.loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
|
||||
from diffusers.models import AutoencoderKL, UNet2DConditionModel
|
||||
from diffusers.models.attention_processor import FusedAttnProcessor2_0
|
||||
from diffusers.models.lora import LoRALinearLayer, adjust_lora_scale_text_encoder
|
||||
from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
|
||||
from diffusers.pipelines.pipeline_utils import DiffusionPipeline
|
||||
from diffusers.pipelines.stable_diffusion.pipeline_output import StableDiffusionPipelineOutput
|
||||
from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
|
||||
from diffusers.schedulers import KarrasDiffusionSchedulers
|
||||
@@ -414,12 +415,7 @@ def retrieve_timesteps(
|
||||
|
||||
|
||||
class IPAdapterFaceIDStableDiffusionPipeline(
|
||||
DiffusionPipeline,
|
||||
StableDiffusionMixin,
|
||||
TextualInversionLoaderMixin,
|
||||
LoraLoaderMixin,
|
||||
IPAdapterMixin,
|
||||
FromSingleFileMixin,
|
||||
DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, IPAdapterMixin, FromSingleFileMixin
|
||||
):
|
||||
r"""
|
||||
Pipeline for text-to-image generation using Stable Diffusion.
|
||||
@@ -731,6 +727,35 @@ class IPAdapterFaceIDStableDiffusionPipeline(
|
||||
if isinstance(attn_processor, (LoRAIPAdapterAttnProcessor, LoRAIPAdapterAttnProcessor2_0)):
|
||||
attn_processor.scale = scale
|
||||
|
||||
def enable_vae_slicing(self):
|
||||
r"""
|
||||
Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
|
||||
compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
|
||||
"""
|
||||
self.vae.enable_slicing()
|
||||
|
||||
def disable_vae_slicing(self):
|
||||
r"""
|
||||
Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
|
||||
computing decoding in one step.
|
||||
"""
|
||||
self.vae.disable_slicing()
|
||||
|
||||
def enable_vae_tiling(self):
|
||||
r"""
|
||||
Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
|
||||
compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
|
||||
processing larger images.
|
||||
"""
|
||||
self.vae.enable_tiling()
|
||||
|
||||
def disable_vae_tiling(self):
|
||||
r"""
|
||||
Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
|
||||
computing decoding in one step.
|
||||
"""
|
||||
self.vae.disable_tiling()
|
||||
|
||||
def _encode_prompt(
|
||||
self,
|
||||
prompt,
|
||||
@@ -823,7 +848,7 @@ class IPAdapterFaceIDStableDiffusionPipeline(
|
||||
batch_size = prompt_embeds.shape[0]
|
||||
|
||||
if prompt_embeds is None:
|
||||
# textual inversion: process multi-vector tokens if necessary
|
||||
# textual inversion: procecss multi-vector tokens if necessary
|
||||
if isinstance(self, TextualInversionLoaderMixin):
|
||||
prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
|
||||
|
||||
@@ -905,7 +930,7 @@ class IPAdapterFaceIDStableDiffusionPipeline(
|
||||
else:
|
||||
uncond_tokens = negative_prompt
|
||||
|
||||
# textual inversion: process multi-vector tokens if necessary
|
||||
# textual inversion: procecss multi-vector tokens if necessary
|
||||
if isinstance(self, TextualInversionLoaderMixin):
|
||||
uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
|
||||
|
||||
@@ -1055,6 +1080,93 @@ class IPAdapterFaceIDStableDiffusionPipeline(
|
||||
latents = latents * self.scheduler.init_noise_sigma
|
||||
return latents
|
||||
|
||||
def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
|
||||
r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497.
|
||||
|
||||
The suffixes after the scaling factors represent the stages where they are being applied.
|
||||
|
||||
Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values
|
||||
that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
|
||||
|
||||
Args:
|
||||
s1 (`float`):
|
||||
Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
|
||||
mitigate "oversmoothing effect" in the enhanced denoising process.
|
||||
s2 (`float`):
|
||||
Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
|
||||
mitigate "oversmoothing effect" in the enhanced denoising process.
|
||||
b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
|
||||
b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
|
||||
"""
|
||||
if not hasattr(self, "unet"):
|
||||
raise ValueError("The pipeline must have `unet` for using FreeU.")
|
||||
self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2)
|
||||
|
||||
def disable_freeu(self):
|
||||
"""Disables the FreeU mechanism if enabled."""
|
||||
self.unet.disable_freeu()
|
||||
|
||||
# Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.fuse_qkv_projections
|
||||
def fuse_qkv_projections(self, unet: bool = True, vae: bool = True):
|
||||
"""
|
||||
Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query,
|
||||
key, value) are fused. For cross-attention modules, key and value projection matrices are fused.
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
This API is 🧪 experimental.
|
||||
|
||||
</Tip>
|
||||
|
||||
Args:
|
||||
unet (`bool`, defaults to `True`): To apply fusion on the UNet.
|
||||
vae (`bool`, defaults to `True`): To apply fusion on the VAE.
|
||||
"""
|
||||
self.fusing_unet = False
|
||||
self.fusing_vae = False
|
||||
|
||||
if unet:
|
||||
self.fusing_unet = True
|
||||
self.unet.fuse_qkv_projections()
|
||||
self.unet.set_attn_processor(FusedAttnProcessor2_0())
|
||||
|
||||
if vae:
|
||||
if not isinstance(self.vae, AutoencoderKL):
|
||||
raise ValueError("`fuse_qkv_projections()` is only supported for the VAE of type `AutoencoderKL`.")
|
||||
|
||||
self.fusing_vae = True
|
||||
self.vae.fuse_qkv_projections()
|
||||
self.vae.set_attn_processor(FusedAttnProcessor2_0())
|
||||
|
||||
# Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.unfuse_qkv_projections
|
||||
def unfuse_qkv_projections(self, unet: bool = True, vae: bool = True):
|
||||
"""Disable QKV projection fusion if enabled.
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
This API is 🧪 experimental.
|
||||
|
||||
</Tip>
|
||||
|
||||
Args:
|
||||
unet (`bool`, defaults to `True`): To apply fusion on the UNet.
|
||||
vae (`bool`, defaults to `True`): To apply fusion on the VAE.
|
||||
|
||||
"""
|
||||
if unet:
|
||||
if not self.fusing_unet:
|
||||
logger.warning("The UNet was not initially fused for QKV projections. Doing nothing.")
|
||||
else:
|
||||
self.unet.unfuse_qkv_projections()
|
||||
self.fusing_unet = False
|
||||
|
||||
if vae:
|
||||
if not self.fusing_vae:
|
||||
logger.warning("The VAE was not initially fused for QKV projections. Doing nothing.")
|
||||
else:
|
||||
self.vae.unfuse_qkv_projections()
|
||||
self.fusing_vae = False
|
||||
|
||||
# Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
|
||||
def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32):
|
||||
"""
|
||||
|
||||
@@ -440,7 +440,7 @@ def betas_for_alpha_bar(
|
||||
return math.exp(t * -12.0)
|
||||
|
||||
else:
|
||||
raise ValueError(f"Unsupported alpha_transform_type: {alpha_transform_type}")
|
||||
raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}")
|
||||
|
||||
betas = []
|
||||
for i in range(num_diffusion_timesteps):
|
||||
@@ -513,7 +513,9 @@ class LCMSchedulerWithTimestamp(SchedulerMixin, ConfigMixin):
|
||||
there is no previous alpha. When this option is `True` the previous alpha product is fixed to `1`,
|
||||
otherwise it uses the alpha value at step 0.
|
||||
steps_offset (`int`, defaults to 0):
|
||||
An offset added to the inference steps, as required by some model families.
|
||||
An offset added to the inference steps. You can use a combination of `offset=1` and
|
||||
`set_alpha_to_one=False` to make the last step use step 0 for the previous alpha product like in Stable
|
||||
Diffusion.
|
||||
prediction_type (`str`, defaults to `epsilon`, *optional*):
|
||||
Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
|
||||
`sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen
|
||||
|
||||
@@ -9,7 +9,7 @@ from diffusers.image_processor import VaeImageProcessor
|
||||
from diffusers.loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin
|
||||
from diffusers.models import AutoencoderKL, UNet2DConditionModel
|
||||
from diffusers.models.lora import adjust_lora_scale_text_encoder
|
||||
from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
|
||||
from diffusers.pipelines.pipeline_utils import DiffusionPipeline
|
||||
from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput, StableDiffusionSafetyChecker
|
||||
from diffusers.schedulers import LCMScheduler
|
||||
from diffusers.utils import (
|
||||
@@ -190,7 +190,7 @@ def slerp(
|
||||
|
||||
|
||||
class LatentConsistencyModelWalkPipeline(
|
||||
DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin
|
||||
DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin
|
||||
):
|
||||
r"""
|
||||
Pipeline for text-to-image generation using a latent consistency model.
|
||||
@@ -273,6 +273,67 @@ class LatentConsistencyModelWalkPipeline(
|
||||
self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
|
||||
self.register_to_config(requires_safety_checker=requires_safety_checker)
|
||||
|
||||
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
|
||||
def enable_vae_slicing(self):
|
||||
r"""
|
||||
Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
|
||||
compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
|
||||
"""
|
||||
self.vae.enable_slicing()
|
||||
|
||||
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing
|
||||
def disable_vae_slicing(self):
|
||||
r"""
|
||||
Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
|
||||
computing decoding in one step.
|
||||
"""
|
||||
self.vae.disable_slicing()
|
||||
|
||||
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling
|
||||
def enable_vae_tiling(self):
|
||||
r"""
|
||||
Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
|
||||
compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
|
||||
processing larger images.
|
||||
"""
|
||||
self.vae.enable_tiling()
|
||||
|
||||
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling
|
||||
def disable_vae_tiling(self):
|
||||
r"""
|
||||
Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
|
||||
computing decoding in one step.
|
||||
"""
|
||||
self.vae.disable_tiling()
|
||||
|
||||
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu
|
||||
def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
|
||||
r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497.
|
||||
|
||||
The suffixes after the scaling factors represent the stages where they are being applied.
|
||||
|
||||
Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values
|
||||
that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
|
||||
|
||||
Args:
|
||||
s1 (`float`):
|
||||
Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
|
||||
mitigate "oversmoothing effect" in the enhanced denoising process.
|
||||
s2 (`float`):
|
||||
Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
|
||||
mitigate "oversmoothing effect" in the enhanced denoising process.
|
||||
b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
|
||||
b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
|
||||
"""
|
||||
if not hasattr(self, "unet"):
|
||||
raise ValueError("The pipeline must have `unet` for using FreeU.")
|
||||
self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2)
|
||||
|
||||
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu
|
||||
def disable_freeu(self):
|
||||
"""Disables the FreeU mechanism if enabled."""
|
||||
self.unet.disable_freeu()
|
||||
|
||||
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt
|
||||
def encode_prompt(
|
||||
self,
|
||||
@@ -334,7 +395,7 @@ class LatentConsistencyModelWalkPipeline(
|
||||
batch_size = prompt_embeds.shape[0]
|
||||
|
||||
if prompt_embeds is None:
|
||||
# textual inversion: process multi-vector tokens if necessary
|
||||
# textual inversion: procecss multi-vector tokens if necessary
|
||||
if isinstance(self, TextualInversionLoaderMixin):
|
||||
prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
|
||||
|
||||
@@ -416,7 +477,7 @@ class LatentConsistencyModelWalkPipeline(
|
||||
else:
|
||||
uncond_tokens = negative_prompt
|
||||
|
||||
# textual inversion: process multi-vector tokens if necessary
|
||||
# textual inversion: procecss multi-vector tokens if necessary
|
||||
if isinstance(self, TextualInversionLoaderMixin):
|
||||
uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
|
||||
|
||||
|
||||
@@ -348,7 +348,7 @@ def betas_for_alpha_bar(
|
||||
return math.exp(t * -12.0)
|
||||
|
||||
else:
|
||||
raise ValueError(f"Unsupported alpha_transform_type: {alpha_transform_type}")
|
||||
raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}")
|
||||
|
||||
betas = []
|
||||
for i in range(num_diffusion_timesteps):
|
||||
@@ -418,7 +418,9 @@ class LCMScheduler(SchedulerMixin, ConfigMixin):
|
||||
there is no previous alpha. When this option is `True` the previous alpha product is fixed to `1`,
|
||||
otherwise it uses the alpha value at step 0.
|
||||
steps_offset (`int`, defaults to 0):
|
||||
An offset added to the inference steps, as required by some model families.
|
||||
An offset added to the inference steps. You can use a combination of `offset=1` and
|
||||
`set_alpha_to_one=False` to make the last step use step 0 for the previous alpha product like in Stable
|
||||
Diffusion.
|
||||
prediction_type (`str`, defaults to `epsilon`, *optional*):
|
||||
Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
|
||||
`sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen
|
||||
|
||||
@@ -35,7 +35,6 @@ from diffusers.models.attention import Attention, GatedSelfAttentionDense
|
||||
from diffusers.models.attention_processor import AttnProcessor2_0
|
||||
from diffusers.models.lora import adjust_lora_scale_text_encoder
|
||||
from diffusers.pipelines import DiffusionPipeline
|
||||
from diffusers.pipelines.pipeline_utils import StableDiffusionMixin
|
||||
from diffusers.pipelines.stable_diffusion.pipeline_output import StableDiffusionPipelineOutput
|
||||
from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
|
||||
from diffusers.schedulers import KarrasDiffusionSchedulers
|
||||
@@ -268,12 +267,7 @@ class AttnProcessorWithHook(AttnProcessor2_0):
|
||||
|
||||
|
||||
class LLMGroundedDiffusionPipeline(
|
||||
DiffusionPipeline,
|
||||
StableDiffusionMixin,
|
||||
TextualInversionLoaderMixin,
|
||||
LoraLoaderMixin,
|
||||
IPAdapterMixin,
|
||||
FromSingleFileMixin,
|
||||
DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, IPAdapterMixin, FromSingleFileMixin
|
||||
):
|
||||
r"""
|
||||
Pipeline for layout-grounded text-to-image generation using LLM-grounded Diffusion (LMD+): https://arxiv.org/pdf/2305.13655.pdf.
|
||||
@@ -1186,6 +1180,39 @@ class LLMGroundedDiffusionPipeline(
|
||||
# Below are methods copied from StableDiffusionPipeline
|
||||
# The design choice of not inheriting from StableDiffusionPipeline is discussed here: https://github.com/huggingface/diffusers/pull/5993#issuecomment-1834258517
|
||||
|
||||
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
|
||||
def enable_vae_slicing(self):
|
||||
r"""
|
||||
Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
|
||||
compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
|
||||
"""
|
||||
self.vae.enable_slicing()
|
||||
|
||||
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing
|
||||
def disable_vae_slicing(self):
|
||||
r"""
|
||||
Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
|
||||
computing decoding in one step.
|
||||
"""
|
||||
self.vae.disable_slicing()
|
||||
|
||||
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling
|
||||
def enable_vae_tiling(self):
|
||||
r"""
|
||||
Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
|
||||
compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
|
||||
processing larger images.
|
||||
"""
|
||||
self.vae.enable_tiling()
|
||||
|
||||
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling
|
||||
def disable_vae_tiling(self):
|
||||
r"""
|
||||
Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
|
||||
computing decoding in one step.
|
||||
"""
|
||||
self.vae.disable_tiling()
|
||||
|
||||
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
|
||||
def _encode_prompt(
|
||||
self,
|
||||
@@ -1280,7 +1307,7 @@ class LLMGroundedDiffusionPipeline(
|
||||
batch_size = prompt_embeds.shape[0]
|
||||
|
||||
if prompt_embeds is None:
|
||||
# textual inversion: process multi-vector tokens if necessary
|
||||
# textual inversion: procecss multi-vector tokens if necessary
|
||||
if isinstance(self, TextualInversionLoaderMixin):
|
||||
prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
|
||||
|
||||
@@ -1364,7 +1391,7 @@ class LLMGroundedDiffusionPipeline(
|
||||
else:
|
||||
uncond_tokens = negative_prompt
|
||||
|
||||
# textual inversion: process multi-vector tokens if necessary
|
||||
# textual inversion: procecss multi-vector tokens if necessary
|
||||
if isinstance(self, TextualInversionLoaderMixin):
|
||||
uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
|
||||
|
||||
@@ -1495,6 +1522,34 @@ class LLMGroundedDiffusionPipeline(
|
||||
latents = latents * self.scheduler.init_noise_sigma
|
||||
return latents
|
||||
|
||||
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu
|
||||
def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
|
||||
r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497.
|
||||
|
||||
The suffixes after the scaling factors represent the stages where they are being applied.
|
||||
|
||||
Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values
|
||||
that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
|
||||
|
||||
Args:
|
||||
s1 (`float`):
|
||||
Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
|
||||
mitigate "oversmoothing effect" in the enhanced denoising process.
|
||||
s2 (`float`):
|
||||
Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
|
||||
mitigate "oversmoothing effect" in the enhanced denoising process.
|
||||
b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
|
||||
b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
|
||||
"""
|
||||
if not hasattr(self, "unet"):
|
||||
raise ValueError("The pipeline must have `unet` for using FreeU.")
|
||||
self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2)
|
||||
|
||||
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling
|
||||
def disable_freeu(self):
|
||||
"""Disables the FreeU mechanism if enabled."""
|
||||
self.unet.disable_freeu()
|
||||
|
||||
# Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
|
||||
def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32):
|
||||
"""
|
||||
|
||||
@@ -13,12 +13,13 @@ from diffusers.configuration_utils import FrozenDict
|
||||
from diffusers.image_processor import VaeImageProcessor
|
||||
from diffusers.loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin
|
||||
from diffusers.models import AutoencoderKL, UNet2DConditionModel
|
||||
from diffusers.pipelines.pipeline_utils import StableDiffusionMixin
|
||||
from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput, StableDiffusionSafetyChecker
|
||||
from diffusers.schedulers import KarrasDiffusionSchedulers
|
||||
from diffusers.utils import (
|
||||
PIL_INTERPOLATION,
|
||||
deprecate,
|
||||
is_accelerate_available,
|
||||
is_accelerate_version,
|
||||
logging,
|
||||
)
|
||||
from diffusers.utils.torch_utils import randn_tensor
|
||||
@@ -409,7 +410,7 @@ def preprocess_mask(mask, batch_size, scale_factor=8):
|
||||
|
||||
|
||||
class StableDiffusionLongPromptWeightingPipeline(
|
||||
DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin
|
||||
DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin
|
||||
):
|
||||
r"""
|
||||
Pipeline for text-to-image generation using Stable Diffusion without tokens length limit, and support parsing
|
||||
@@ -533,6 +534,112 @@ class StableDiffusionLongPromptWeightingPipeline(
|
||||
requires_safety_checker=requires_safety_checker,
|
||||
)
|
||||
|
||||
def enable_vae_slicing(self):
|
||||
r"""
|
||||
Enable sliced VAE decoding.
|
||||
|
||||
When this option is enabled, the VAE will split the input tensor in slices to compute decoding in several
|
||||
steps. This is useful to save some memory and allow larger batch sizes.
|
||||
"""
|
||||
self.vae.enable_slicing()
|
||||
|
||||
def disable_vae_slicing(self):
|
||||
r"""
|
||||
Disable sliced VAE decoding. If `enable_vae_slicing` was previously invoked, this method will go back to
|
||||
computing decoding in one step.
|
||||
"""
|
||||
self.vae.disable_slicing()
|
||||
|
||||
def enable_vae_tiling(self):
|
||||
r"""
|
||||
Enable tiled VAE decoding.
|
||||
|
||||
When this option is enabled, the VAE will split the input tensor into tiles to compute decoding and encoding in
|
||||
several steps. This is useful to save a large amount of memory and to allow the processing of larger images.
|
||||
"""
|
||||
self.vae.enable_tiling()
|
||||
|
||||
def disable_vae_tiling(self):
|
||||
r"""
|
||||
Disable tiled VAE decoding. If `enable_vae_tiling` was previously invoked, this method will go back to
|
||||
computing decoding in one step.
|
||||
"""
|
||||
self.vae.disable_tiling()
|
||||
|
||||
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_sequential_cpu_offload
|
||||
def enable_sequential_cpu_offload(self, gpu_id=0):
|
||||
r"""
|
||||
Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
|
||||
text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a
|
||||
`torch.device('meta') and loaded to GPU only when their specific submodule has its `forward` method called.
|
||||
Note that offloading happens on a submodule basis. Memory savings are higher than with
|
||||
`enable_model_cpu_offload`, but performance is lower.
|
||||
"""
|
||||
if is_accelerate_available() and is_accelerate_version(">=", "0.14.0"):
|
||||
from accelerate import cpu_offload
|
||||
else:
|
||||
raise ImportError("`enable_sequential_cpu_offload` requires `accelerate v0.14.0` or higher")
|
||||
|
||||
device = torch.device(f"cuda:{gpu_id}")
|
||||
|
||||
if self.device.type != "cpu":
|
||||
self.to("cpu", silence_dtype_warnings=True)
|
||||
torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist)
|
||||
|
||||
for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]:
|
||||
cpu_offload(cpu_offloaded_model, device)
|
||||
|
||||
if self.safety_checker is not None:
|
||||
cpu_offload(self.safety_checker, execution_device=device, offload_buffers=True)
|
||||
|
||||
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_model_cpu_offload
|
||||
def enable_model_cpu_offload(self, gpu_id=0):
|
||||
r"""
|
||||
Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
|
||||
to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
|
||||
method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
|
||||
`enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
|
||||
"""
|
||||
if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
|
||||
from accelerate import cpu_offload_with_hook
|
||||
else:
|
||||
raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
|
||||
|
||||
device = torch.device(f"cuda:{gpu_id}")
|
||||
|
||||
if self.device.type != "cpu":
|
||||
self.to("cpu", silence_dtype_warnings=True)
|
||||
torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist)
|
||||
|
||||
hook = None
|
||||
for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]:
|
||||
_, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
|
||||
|
||||
if self.safety_checker is not None:
|
||||
_, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook)
|
||||
|
||||
# We'll offload the last model manually.
|
||||
self.final_offload_hook = hook
|
||||
|
||||
@property
|
||||
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device
|
||||
def _execution_device(self):
|
||||
r"""
|
||||
Returns the device on which the pipeline's models will be executed. After calling
|
||||
`pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module
|
||||
hooks.
|
||||
"""
|
||||
if not hasattr(self.unet, "_hf_hook"):
|
||||
return self.device
|
||||
for module in self.unet.modules():
|
||||
if (
|
||||
hasattr(module, "_hf_hook")
|
||||
and hasattr(module._hf_hook, "execution_device")
|
||||
and module._hf_hook.execution_device is not None
|
||||
):
|
||||
return torch.device(module._hf_hook.execution_device)
|
||||
return self.device
|
||||
|
||||
def _encode_prompt(
|
||||
self,
|
||||
prompt,
|
||||
|
||||
@@ -26,11 +26,11 @@ from diffusers.loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMix
|
||||
from diffusers.models import AutoencoderKL, ImageProjection, UNet2DConditionModel
|
||||
from diffusers.models.attention_processor import (
|
||||
AttnProcessor2_0,
|
||||
FusedAttnProcessor2_0,
|
||||
LoRAAttnProcessor2_0,
|
||||
LoRAXFormersAttnProcessor,
|
||||
XFormersAttnProcessor,
|
||||
)
|
||||
from diffusers.pipelines.pipeline_utils import StableDiffusionMixin
|
||||
from diffusers.pipelines.stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput
|
||||
from diffusers.schedulers import KarrasDiffusionSchedulers
|
||||
from diffusers.utils import (
|
||||
@@ -545,12 +545,7 @@ def retrieve_timesteps(
|
||||
|
||||
|
||||
class SDXLLongPromptWeightingPipeline(
|
||||
DiffusionPipeline,
|
||||
StableDiffusionMixin,
|
||||
FromSingleFileMixin,
|
||||
IPAdapterMixin,
|
||||
LoraLoaderMixin,
|
||||
TextualInversionLoaderMixin,
|
||||
DiffusionPipeline, FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
|
||||
):
|
||||
r"""
|
||||
Pipeline for text-to-image generation using Stable Diffusion XL.
|
||||
@@ -654,6 +649,39 @@ class SDXLLongPromptWeightingPipeline(
|
||||
else:
|
||||
self.watermark = None
|
||||
|
||||
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
|
||||
def enable_vae_slicing(self):
|
||||
r"""
|
||||
Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
|
||||
compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
|
||||
"""
|
||||
self.vae.enable_slicing()
|
||||
|
||||
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing
|
||||
def disable_vae_slicing(self):
|
||||
r"""
|
||||
Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
|
||||
computing decoding in one step.
|
||||
"""
|
||||
self.vae.disable_slicing()
|
||||
|
||||
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling
|
||||
def enable_vae_tiling(self):
|
||||
r"""
|
||||
Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
|
||||
compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
|
||||
processing larger images.
|
||||
"""
|
||||
self.vae.enable_tiling()
|
||||
|
||||
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling
|
||||
def disable_vae_tiling(self):
|
||||
r"""
|
||||
Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
|
||||
computing decoding in one step.
|
||||
"""
|
||||
self.vae.disable_tiling()
|
||||
|
||||
def enable_model_cpu_offload(self, gpu_id=0):
|
||||
r"""
|
||||
Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
|
||||
@@ -761,7 +789,7 @@ class SDXLLongPromptWeightingPipeline(
|
||||
|
||||
if prompt_embeds is None:
|
||||
prompt_2 = prompt_2 or prompt
|
||||
# textual inversion: process multi-vector tokens if necessary
|
||||
# textual inversion: procecss multi-vector tokens if necessary
|
||||
prompt_embeds_list = []
|
||||
prompts = [prompt, prompt_2]
|
||||
for prompt, tokenizer, text_encoder in zip(prompts, tokenizers, text_encoders):
|
||||
@@ -1002,6 +1030,95 @@ class SDXLLongPromptWeightingPipeline(
|
||||
"If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`."
|
||||
)
|
||||
|
||||
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu
|
||||
def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
|
||||
r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497.
|
||||
|
||||
The suffixes after the scaling factors represent the stages where they are being applied.
|
||||
|
||||
Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values
|
||||
that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
|
||||
|
||||
Args:
|
||||
s1 (`float`):
|
||||
Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
|
||||
mitigate "oversmoothing effect" in the enhanced denoising process.
|
||||
s2 (`float`):
|
||||
Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
|
||||
mitigate "oversmoothing effect" in the enhanced denoising process.
|
||||
b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
|
||||
b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
|
||||
"""
|
||||
if not hasattr(self, "unet"):
|
||||
raise ValueError("The pipeline must have `unet` for using FreeU.")
|
||||
self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2)
|
||||
|
||||
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu
|
||||
def disable_freeu(self):
|
||||
"""Disables the FreeU mechanism if enabled."""
|
||||
self.unet.disable_freeu()
|
||||
|
||||
# Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.fuse_qkv_projections
|
||||
def fuse_qkv_projections(self, unet: bool = True, vae: bool = True):
|
||||
"""
|
||||
Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query,
|
||||
key, value) are fused. For cross-attention modules, key and value projection matrices are fused.
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
This API is 🧪 experimental.
|
||||
|
||||
</Tip>
|
||||
|
||||
Args:
|
||||
unet (`bool`, defaults to `True`): To apply fusion on the UNet.
|
||||
vae (`bool`, defaults to `True`): To apply fusion on the VAE.
|
||||
"""
|
||||
self.fusing_unet = False
|
||||
self.fusing_vae = False
|
||||
|
||||
if unet:
|
||||
self.fusing_unet = True
|
||||
self.unet.fuse_qkv_projections()
|
||||
self.unet.set_attn_processor(FusedAttnProcessor2_0())
|
||||
|
||||
if vae:
|
||||
if not isinstance(self.vae, AutoencoderKL):
|
||||
raise ValueError("`fuse_qkv_projections()` is only supported for the VAE of type `AutoencoderKL`.")
|
||||
|
||||
self.fusing_vae = True
|
||||
self.vae.fuse_qkv_projections()
|
||||
self.vae.set_attn_processor(FusedAttnProcessor2_0())
|
||||
|
||||
# Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.unfuse_qkv_projections
|
||||
def unfuse_qkv_projections(self, unet: bool = True, vae: bool = True):
|
||||
"""Disable QKV projection fusion if enabled.
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
This API is 🧪 experimental.
|
||||
|
||||
</Tip>
|
||||
|
||||
Args:
|
||||
unet (`bool`, defaults to `True`): To apply fusion on the UNet.
|
||||
vae (`bool`, defaults to `True`): To apply fusion on the VAE.
|
||||
|
||||
"""
|
||||
if unet:
|
||||
if not self.fusing_unet:
|
||||
logger.warning("The UNet was not initially fused for QKV projections. Doing nothing.")
|
||||
else:
|
||||
self.unet.unfuse_qkv_projections()
|
||||
self.fusing_unet = False
|
||||
|
||||
if vae:
|
||||
if not self.fusing_vae:
|
||||
logger.warning("The VAE was not initially fused for QKV projections. Doing nothing.")
|
||||
else:
|
||||
self.vae.unfuse_qkv_projections()
|
||||
self.fusing_vae = False
|
||||
|
||||
def get_timesteps(self, num_inference_steps, strength, device, denoising_start=None):
|
||||
# get the original timestep using init_timestep
|
||||
if denoising_start is None:
|
||||
@@ -1649,7 +1766,7 @@ class SDXLLongPromptWeightingPipeline(
|
||||
|
||||
# 4. Prepare timesteps
|
||||
def denoising_value_valid(dnv):
|
||||
return isinstance(dnv, float) and 0 < dnv < 1
|
||||
return isinstance(self.denoising_end, float) and 0 < dnv < 1
|
||||
|
||||
timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
|
||||
if image is not None:
|
||||
@@ -1657,7 +1774,7 @@ class SDXLLongPromptWeightingPipeline(
|
||||
num_inference_steps,
|
||||
strength,
|
||||
device,
|
||||
denoising_start=self.denoising_start if denoising_value_valid(self.denoising_start) else None,
|
||||
denoising_start=self.denoising_start if denoising_value_valid else None,
|
||||
)
|
||||
|
||||
# check that number of inference steps is not < 1 - as this doesn't make sense
|
||||
|
||||
@@ -40,7 +40,7 @@ from diffusers.utils import BaseOutput, check_min_version
|
||||
|
||||
|
||||
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
|
||||
check_min_version("0.28.0.dev0")
|
||||
check_min_version("0.27.0.dev0")
|
||||
|
||||
|
||||
class MarigoldDepthOutput(BaseOutput):
|
||||
@@ -50,14 +50,14 @@ class MarigoldDepthOutput(BaseOutput):
|
||||
Args:
|
||||
depth_np (`np.ndarray`):
|
||||
Predicted depth map, with depth values in the range of [0, 1].
|
||||
depth_colored (`None` or `PIL.Image.Image`):
|
||||
depth_colored (`PIL.Image.Image`):
|
||||
Colorized depth map, with the shape of [3, H, W] and values in [0, 1].
|
||||
uncertainty (`None` or `np.ndarray`):
|
||||
Uncalibrated uncertainty(MAD, median absolute deviation) coming from ensembling.
|
||||
"""
|
||||
|
||||
depth_np: np.ndarray
|
||||
depth_colored: Union[None, Image.Image]
|
||||
depth_colored: Image.Image
|
||||
uncertainty: Union[None, np.ndarray]
|
||||
|
||||
|
||||
@@ -139,15 +139,14 @@ class MarigoldPipeline(DiffusionPipeline):
|
||||
If set to 0, the script will automatically decide the proper batch size.
|
||||
show_progress_bar (`bool`, *optional*, defaults to `True`):
|
||||
Display a progress bar of diffusion denoising.
|
||||
color_map (`str`, *optional*, defaults to `"Spectral"`, pass `None` to skip colorized depth map generation):
|
||||
color_map (`str`, *optional*, defaults to `"Spectral"`):
|
||||
Colormap used to colorize the depth map.
|
||||
ensemble_kwargs (`dict`, *optional*, defaults to `None`):
|
||||
Arguments for detailed ensembling settings.
|
||||
Returns:
|
||||
`MarigoldDepthOutput`: Output class for Marigold monocular depth prediction pipeline, including:
|
||||
- **depth_np** (`np.ndarray`) Predicted depth map, with depth values in the range of [0, 1]
|
||||
- **depth_colored** (`None` or `PIL.Image.Image`) Colorized depth map, with the shape of [3, H, W] and
|
||||
values in [0, 1]. None if `color_map` is `None`
|
||||
- **depth_colored** (`PIL.Image.Image`) Colorized depth map, with the shape of [3, H, W] and values in [0, 1]
|
||||
- **uncertainty** (`None` or `np.ndarray`) Uncalibrated uncertainty(MAD, median absolute deviation)
|
||||
coming from ensembling. None if `ensemble_size = 1`
|
||||
"""
|
||||
@@ -234,15 +233,12 @@ class MarigoldPipeline(DiffusionPipeline):
|
||||
depth_pred = depth_pred.clip(0, 1)
|
||||
|
||||
# Colorize
|
||||
if color_map is not None:
|
||||
depth_colored = self.colorize_depth_maps(
|
||||
depth_pred, 0, 1, cmap=color_map
|
||||
).squeeze() # [3, H, W], value in (0, 1)
|
||||
depth_colored = (depth_colored * 255).astype(np.uint8)
|
||||
depth_colored_hwc = self.chw2hwc(depth_colored)
|
||||
depth_colored_img = Image.fromarray(depth_colored_hwc)
|
||||
else:
|
||||
depth_colored_img = None
|
||||
depth_colored = self.colorize_depth_maps(
|
||||
depth_pred, 0, 1, cmap=color_map
|
||||
).squeeze() # [3, H, W], value in (0, 1)
|
||||
depth_colored = (depth_colored * 255).astype(np.uint8)
|
||||
depth_colored_hwc = self.chw2hwc(depth_colored)
|
||||
depth_colored_img = Image.fromarray(depth_colored_hwc)
|
||||
return MarigoldDepthOutput(
|
||||
depth_np=depth_pred,
|
||||
depth_colored=depth_colored_img,
|
||||
|
||||
@@ -12,7 +12,7 @@ from tqdm.auto import tqdm
|
||||
from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
|
||||
|
||||
from diffusers.models import AutoencoderKL, UNet2DConditionModel
|
||||
from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
|
||||
from diffusers.pipelines.pipeline_utils import DiffusionPipeline
|
||||
from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker
|
||||
from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
|
||||
|
||||
@@ -264,7 +264,7 @@ class MaskWeightsBuilder:
|
||||
return torch.tile(torch.tensor(weights), (self.nbatch, self.latent_space_dim, 1, 1))
|
||||
|
||||
|
||||
class StableDiffusionCanvasPipeline(DiffusionPipeline, StableDiffusionMixin):
|
||||
class StableDiffusionCanvasPipeline(DiffusionPipeline):
|
||||
"""Stable Diffusion pipeline that mixes several diffusers in the same canvas"""
|
||||
|
||||
def __init__(
|
||||
|
||||
@@ -11,9 +11,9 @@ from transformers import (
|
||||
pipeline,
|
||||
)
|
||||
|
||||
from diffusers import DiffusionPipeline
|
||||
from diffusers.configuration_utils import FrozenDict
|
||||
from diffusers.models import AutoencoderKL, UNet2DConditionModel
|
||||
from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
|
||||
from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
|
||||
from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
|
||||
from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
|
||||
@@ -48,7 +48,7 @@ def translate_prompt(prompt, translation_tokenizer, translation_model, device):
|
||||
return en_trans[0]
|
||||
|
||||
|
||||
class MultilingualStableDiffusion(DiffusionPipeline, StableDiffusionMixin):
|
||||
class MultilingualStableDiffusion(DiffusionPipeline):
|
||||
r"""
|
||||
Pipeline for text-to-image generation using Stable Diffusion in different languages.
|
||||
|
||||
@@ -135,6 +135,33 @@ class MultilingualStableDiffusion(DiffusionPipeline, StableDiffusionMixin):
|
||||
feature_extractor=feature_extractor,
|
||||
)
|
||||
|
||||
def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
|
||||
r"""
|
||||
Enable sliced attention computation.
|
||||
|
||||
When this option is enabled, the attention module will split the input tensor in slices, to compute attention
|
||||
in several steps. This is useful to save some memory in exchange for a small speed decrease.
|
||||
|
||||
Args:
|
||||
slice_size (`str` or `int`, *optional*, defaults to `"auto"`):
|
||||
When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If
|
||||
a number is provided, uses as many slices as `attention_head_dim // slice_size`. In this case,
|
||||
`attention_head_dim` must be a multiple of `slice_size`.
|
||||
"""
|
||||
if slice_size == "auto":
|
||||
# half the attention head size is usually a good trade-off between
|
||||
# speed and memory
|
||||
slice_size = self.unet.config.attention_head_dim // 2
|
||||
self.unet.set_attention_slice(slice_size)
|
||||
|
||||
def disable_attention_slicing(self):
|
||||
r"""
|
||||
Disable sliced attention computation. If `enable_attention_slicing` was previously invoked, this method will go
|
||||
back to computing attention in one step.
|
||||
"""
|
||||
# set slice_size = `None` to disable `attention slicing`
|
||||
self.enable_attention_slicing(None)
|
||||
|
||||
@torch.no_grad()
|
||||
def __call__(
|
||||
self,
|
||||
|
||||
@@ -13,6 +13,7 @@
|
||||
# limitations under the License.
|
||||
|
||||
import inspect
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
@@ -23,12 +24,11 @@ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPV
|
||||
|
||||
from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
|
||||
from diffusers.loaders import IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
|
||||
from diffusers.models import AutoencoderKL, ControlNetModel, ImageProjection, UNet2DConditionModel, UNetMotionModel
|
||||
from diffusers.models import AutoencoderKL, ControlNetModel, UNet2DConditionModel, UNetMotionModel
|
||||
from diffusers.models.lora import adjust_lora_scale_text_encoder
|
||||
from diffusers.models.unets.unet_motion_model import MotionAdapter
|
||||
from diffusers.pipelines.animatediff.pipeline_output import AnimateDiffPipelineOutput
|
||||
from diffusers.pipelines.controlnet.multicontrolnet import MultiControlNetModel
|
||||
from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
|
||||
from diffusers.pipelines.pipeline_utils import DiffusionPipeline
|
||||
from diffusers.schedulers import (
|
||||
DDIMScheduler,
|
||||
DPMSolverMultistepScheduler,
|
||||
@@ -37,7 +37,7 @@ from diffusers.schedulers import (
|
||||
LMSDiscreteScheduler,
|
||||
PNDMScheduler,
|
||||
)
|
||||
from diffusers.utils import USE_PEFT_BACKEND, deprecate, logging, scale_lora_layers, unscale_lora_layers
|
||||
from diffusers.utils import USE_PEFT_BACKEND, BaseOutput, deprecate, logging, scale_lora_layers, unscale_lora_layers
|
||||
from diffusers.utils.torch_utils import is_compiled_module, randn_tensor
|
||||
|
||||
|
||||
@@ -91,8 +91,10 @@ EXAMPLE_DOC_STRING = """
|
||||
"""
|
||||
|
||||
|
||||
# Copied from diffusers.pipelines.animatediff.pipeline_animatediff.tensor2vid
|
||||
def tensor2vid(video: torch.Tensor, processor, output_type="np"):
|
||||
# Based on:
|
||||
# https://github.com/modelscope/modelscope/blob/1509fdb973e5871f37148a4b5e5964cafd43e64d/modelscope/pipelines/multi_modal/text_to_video_synthesis_pipeline.py#L78
|
||||
|
||||
batch_size, channels, num_frames, height, width = video.shape
|
||||
outputs = []
|
||||
for batch_idx in range(batch_size):
|
||||
@@ -101,21 +103,15 @@ def tensor2vid(video: torch.Tensor, processor, output_type="np"):
|
||||
|
||||
outputs.append(batch_output)
|
||||
|
||||
if output_type == "np":
|
||||
outputs = np.stack(outputs)
|
||||
|
||||
elif output_type == "pt":
|
||||
outputs = torch.stack(outputs)
|
||||
|
||||
elif not output_type == "pil":
|
||||
raise ValueError(f"{output_type} does not exist. Please choose one of ['np', 'pt', 'pil']")
|
||||
|
||||
return outputs
|
||||
|
||||
|
||||
class AnimateDiffControlNetPipeline(
|
||||
DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin
|
||||
):
|
||||
@dataclass
|
||||
class AnimateDiffControlNetPipelineOutput(BaseOutput):
|
||||
frames: Union[torch.Tensor, np.ndarray]
|
||||
|
||||
|
||||
class AnimateDiffControlNetPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin):
|
||||
r"""
|
||||
Pipeline for text-to-video generation.
|
||||
|
||||
@@ -251,7 +247,7 @@ class AnimateDiffControlNetPipeline(
|
||||
batch_size = prompt_embeds.shape[0]
|
||||
|
||||
if prompt_embeds is None:
|
||||
# textual inversion: process multi-vector tokens if necessary
|
||||
# textual inversion: procecss multi-vector tokens if necessary
|
||||
if isinstance(self, TextualInversionLoaderMixin):
|
||||
prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
|
||||
|
||||
@@ -333,7 +329,7 @@ class AnimateDiffControlNetPipeline(
|
||||
else:
|
||||
uncond_tokens = negative_prompt
|
||||
|
||||
# textual inversion: process multi-vector tokens if necessary
|
||||
# textual inversion: procecss multi-vector tokens if necessary
|
||||
if isinstance(self, TextualInversionLoaderMixin):
|
||||
uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
|
||||
|
||||
@@ -386,41 +382,6 @@ class AnimateDiffControlNetPipeline(
|
||||
uncond_image_embeds = torch.zeros_like(image_embeds)
|
||||
return image_embeds, uncond_image_embeds
|
||||
|
||||
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds
|
||||
def prepare_ip_adapter_image_embeds(
|
||||
self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt
|
||||
):
|
||||
if ip_adapter_image_embeds is None:
|
||||
if not isinstance(ip_adapter_image, list):
|
||||
ip_adapter_image = [ip_adapter_image]
|
||||
|
||||
if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers):
|
||||
raise ValueError(
|
||||
f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
|
||||
)
|
||||
|
||||
image_embeds = []
|
||||
for single_ip_adapter_image, image_proj_layer in zip(
|
||||
ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
|
||||
):
|
||||
output_hidden_state = not isinstance(image_proj_layer, ImageProjection)
|
||||
single_image_embeds, single_negative_image_embeds = self.encode_image(
|
||||
single_ip_adapter_image, device, 1, output_hidden_state
|
||||
)
|
||||
single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0)
|
||||
single_negative_image_embeds = torch.stack(
|
||||
[single_negative_image_embeds] * num_images_per_prompt, dim=0
|
||||
)
|
||||
|
||||
if self.do_classifier_free_guidance:
|
||||
single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
|
||||
single_image_embeds = single_image_embeds.to(device)
|
||||
|
||||
image_embeds.append(single_image_embeds)
|
||||
else:
|
||||
image_embeds = ip_adapter_image_embeds
|
||||
return image_embeds
|
||||
|
||||
# Copied from diffusers.pipelines.text_to_video_synthesis/pipeline_text_to_video_synth.TextToVideoSDPipeline.decode_latents
|
||||
def decode_latents(self, latents):
|
||||
latents = 1 / self.vae.config.scaling_factor * latents
|
||||
@@ -445,6 +406,67 @@ class AnimateDiffControlNetPipeline(
|
||||
video = video.float()
|
||||
return video
|
||||
|
||||
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
|
||||
def enable_vae_slicing(self):
|
||||
r"""
|
||||
Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
|
||||
compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
|
||||
"""
|
||||
self.vae.enable_slicing()
|
||||
|
||||
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing
|
||||
def disable_vae_slicing(self):
|
||||
r"""
|
||||
Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
|
||||
computing decoding in one step.
|
||||
"""
|
||||
self.vae.disable_slicing()
|
||||
|
||||
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling
|
||||
def enable_vae_tiling(self):
|
||||
r"""
|
||||
Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
|
||||
compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
|
||||
processing larger images.
|
||||
"""
|
||||
self.vae.enable_tiling()
|
||||
|
||||
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling
|
||||
def disable_vae_tiling(self):
|
||||
r"""
|
||||
Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
|
||||
computing decoding in one step.
|
||||
"""
|
||||
self.vae.disable_tiling()
|
||||
|
||||
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu
|
||||
def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
|
||||
r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497.
|
||||
|
||||
The suffixes after the scaling factors represent the stages where they are being applied.
|
||||
|
||||
Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values
|
||||
that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
|
||||
|
||||
Args:
|
||||
s1 (`float`):
|
||||
Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
|
||||
mitigate "oversmoothing effect" in the enhanced denoising process.
|
||||
s2 (`float`):
|
||||
Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
|
||||
mitigate "oversmoothing effect" in the enhanced denoising process.
|
||||
b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
|
||||
b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
|
||||
"""
|
||||
if not hasattr(self, "unet"):
|
||||
raise ValueError("The pipeline must have `unet` for using FreeU.")
|
||||
self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2)
|
||||
|
||||
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu
|
||||
def disable_freeu(self):
|
||||
"""Disables the FreeU mechanism if enabled."""
|
||||
self.unet.disable_freeu()
|
||||
|
||||
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
|
||||
def prepare_extra_step_kwargs(self, generator, eta):
|
||||
# prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
|
||||
@@ -745,7 +767,6 @@ class AnimateDiffControlNetPipeline(
|
||||
prompt_embeds: Optional[torch.FloatTensor] = None,
|
||||
negative_prompt_embeds: Optional[torch.FloatTensor] = None,
|
||||
ip_adapter_image: Optional[PipelineImageInput] = None,
|
||||
ip_adapter_image_embeds: Optional[PipelineImageInput] = None,
|
||||
conditioning_frames: Optional[List[PipelineImageInput]] = None,
|
||||
output_type: Optional[str] = "pil",
|
||||
return_dict: bool = True,
|
||||
@@ -800,11 +821,6 @@ class AnimateDiffControlNetPipeline(
|
||||
not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
|
||||
ip_adapter_image (`PipelineImageInput`, *optional*):
|
||||
Optional image input to work with IP Adapters.
|
||||
ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*):
|
||||
Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters.
|
||||
Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding
|
||||
if `do_classifier_free_guidance` is set to `True`.
|
||||
If not provided, embeddings are computed from the `ip_adapter_image` input argument.
|
||||
conditioning_frames (`List[PipelineImageInput]`, *optional*):
|
||||
The ControlNet input condition to provide guidance to the `unet` for generation. If multiple ControlNets
|
||||
are specified, images must be passed as a list such that each element of the list can be correctly
|
||||
@@ -845,8 +861,8 @@ class AnimateDiffControlNetPipeline(
|
||||
Examples:
|
||||
|
||||
Returns:
|
||||
[`~pipelines.animatediff.pipeline_output.AnimateDiffPipelineOutput`] or `tuple`:
|
||||
If `return_dict` is `True`, [`~pipelines.animatediff.pipeline_output.AnimateDiffPipelineOutput`] is
|
||||
[`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] or `tuple`:
|
||||
If `return_dict` is `True`, [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] is
|
||||
returned, otherwise a `tuple` is returned where the first element is a list with the generated frames.
|
||||
"""
|
||||
|
||||
@@ -949,9 +965,9 @@ class AnimateDiffControlNetPipeline(
|
||||
prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
|
||||
|
||||
if ip_adapter_image is not None:
|
||||
image_embeds = self.prepare_ip_adapter_image_embeds(
|
||||
ip_adapter_image, ip_adapter_image_embeds, device, batch_size * num_videos_per_prompt
|
||||
)
|
||||
image_embeds, negative_image_embeds = self.encode_image(ip_adapter_image, device, num_videos_per_prompt)
|
||||
if self.do_classifier_free_guidance:
|
||||
image_embeds = torch.cat([negative_image_embeds, image_embeds])
|
||||
|
||||
if isinstance(controlnet, ControlNetModel):
|
||||
conditioning_frames = self.prepare_image(
|
||||
@@ -1007,11 +1023,7 @@ class AnimateDiffControlNetPipeline(
|
||||
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
|
||||
|
||||
# 7. Add image embeds for IP-Adapter
|
||||
added_cond_kwargs = (
|
||||
{"image_embeds": image_embeds}
|
||||
if ip_adapter_image is not None or ip_adapter_image_embeds is not None
|
||||
else None
|
||||
)
|
||||
added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None
|
||||
|
||||
# 7.1 Create tensor stating which controlnets to keep
|
||||
controlnet_keep = []
|
||||
@@ -1022,7 +1034,7 @@ class AnimateDiffControlNetPipeline(
|
||||
]
|
||||
controlnet_keep.append(keeps[0] if isinstance(controlnet, ControlNetModel) else keeps)
|
||||
|
||||
# 8. Denoising loop
|
||||
# Denoising loop
|
||||
num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
|
||||
with self.progress_bar(total=num_inference_steps) as progress_bar:
|
||||
for i, t in enumerate(timesteps):
|
||||
@@ -1098,17 +1110,21 @@ class AnimateDiffControlNetPipeline(
|
||||
if callback is not None and i % callback_steps == 0:
|
||||
callback(i, t, latents)
|
||||
|
||||
# 9. Post processing
|
||||
if output_type == "latent":
|
||||
video = latents
|
||||
return AnimateDiffControlNetPipelineOutput(frames=latents)
|
||||
|
||||
# Post-processing
|
||||
video_tensor = self.decode_latents(latents)
|
||||
|
||||
if output_type == "pt":
|
||||
video = video_tensor
|
||||
else:
|
||||
video_tensor = self.decode_latents(latents)
|
||||
video = tensor2vid(video_tensor, self.image_processor, output_type=output_type)
|
||||
|
||||
# 10. Offload all models
|
||||
# Offload all models
|
||||
self.maybe_free_model_hooks()
|
||||
|
||||
if not return_dict:
|
||||
return (video,)
|
||||
|
||||
return AnimateDiffPipelineOutput(frames=video)
|
||||
return AnimateDiffControlNetPipelineOutput(frames=video)
|
||||
|
||||
@@ -11,14 +11,9 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
# Note:
|
||||
# This pipeline relies on a "hack" discovered by the community that allows
|
||||
# the generation of videos given an input image with AnimateDiff. It works
|
||||
# by creating a copy of the image `num_frames` times and progressively adding
|
||||
# more noise to the image based on the strength and latent interpolation method.
|
||||
|
||||
import inspect
|
||||
from dataclasses import dataclass
|
||||
from types import FunctionType
|
||||
from typing import Any, Callable, Dict, List, Optional, Union
|
||||
|
||||
@@ -31,8 +26,7 @@ from diffusers.loaders import IPAdapterMixin, LoraLoaderMixin, TextualInversionL
|
||||
from diffusers.models import AutoencoderKL, ImageProjection, UNet2DConditionModel, UNetMotionModel
|
||||
from diffusers.models.lora import adjust_lora_scale_text_encoder
|
||||
from diffusers.models.unet_motion_model import MotionAdapter
|
||||
from diffusers.pipelines.animatediff.pipeline_output import AnimateDiffPipelineOutput
|
||||
from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
|
||||
from diffusers.pipelines.pipeline_utils import DiffusionPipeline
|
||||
from diffusers.schedulers import (
|
||||
DDIMScheduler,
|
||||
DPMSolverMultistepScheduler,
|
||||
@@ -41,7 +35,7 @@ from diffusers.schedulers import (
|
||||
LMSDiscreteScheduler,
|
||||
PNDMScheduler,
|
||||
)
|
||||
from diffusers.utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_layers
|
||||
from diffusers.utils import USE_PEFT_BACKEND, BaseOutput, logging, scale_lora_layers, unscale_lora_layers
|
||||
from diffusers.utils.torch_utils import randn_tensor
|
||||
|
||||
|
||||
@@ -54,10 +48,9 @@ EXAMPLE_DOC_STRING = """
|
||||
>>> from diffusers import MotionAdapter, DiffusionPipeline, DDIMScheduler
|
||||
>>> from diffusers.utils import export_to_gif, load_image
|
||||
|
||||
>>> model_id = "SG161222/Realistic_Vision_V5.1_noVAE"
|
||||
>>> adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2")
|
||||
>>> pipe = DiffusionPipeline.from_pretrained("SG161222/Realistic_Vision_V5.1_noVAE", motion_adapter=adapter, custom_pipeline="pipeline_animatediff_img2video").to("cuda")
|
||||
>>> pipe.scheduler = pipe.scheduler = DDIMScheduler.from_pretrained(model_id, subfolder="scheduler", clip_sample=False, timestep_spacing="linspace", beta_schedule="linear", steps_offset=1)
|
||||
>>> pipe.scheduler = DDIMScheduler(beta_schedule="linear", steps_offset=1, clip_sample=False, timespace_spacing="linspace")
|
||||
|
||||
>>> image = load_image("snail.png")
|
||||
>>> output = pipe(image=image, prompt="A snail moving on the ground", strength=0.8, latent_interpolation_method="slerp")
|
||||
@@ -158,8 +151,10 @@ def slerp(
|
||||
return v2
|
||||
|
||||
|
||||
# Copied from diffusers.pipelines.animatediff.pipeline_animatediff.tensor2vid
|
||||
def tensor2vid(video: torch.Tensor, processor, output_type="np"):
|
||||
# Based on:
|
||||
# https://github.com/modelscope/modelscope/blob/1509fdb973e5871f37148a4b5e5964cafd43e64d/modelscope/pipelines/multi_modal/text_to_video_synthesis_pipeline.py#L78
|
||||
|
||||
batch_size, channels, num_frames, height, width = video.shape
|
||||
outputs = []
|
||||
for batch_idx in range(batch_size):
|
||||
@@ -168,15 +163,6 @@ def tensor2vid(video: torch.Tensor, processor, output_type="np"):
|
||||
|
||||
outputs.append(batch_output)
|
||||
|
||||
if output_type == "np":
|
||||
outputs = np.stack(outputs)
|
||||
|
||||
elif output_type == "pt":
|
||||
outputs = torch.stack(outputs)
|
||||
|
||||
elif not output_type == "pil":
|
||||
raise ValueError(f"{output_type} does not exist. Please choose one of ['np', 'pt', 'pil']")
|
||||
|
||||
return outputs
|
||||
|
||||
|
||||
@@ -239,11 +225,14 @@ def retrieve_timesteps(
|
||||
return timesteps, num_inference_steps
|
||||
|
||||
|
||||
class AnimateDiffImgToVideoPipeline(
|
||||
DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin
|
||||
):
|
||||
@dataclass
|
||||
class AnimateDiffImgToVideoPipelineOutput(BaseOutput):
|
||||
frames: Union[torch.Tensor, np.ndarray]
|
||||
|
||||
|
||||
class AnimateDiffImgToVideoPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin):
|
||||
r"""
|
||||
Pipeline for image-to-video generation.
|
||||
Pipeline for text-to-video generation.
|
||||
|
||||
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
|
||||
implemented for all pipelines (downloading, saving, running on a particular device, etc.).
|
||||
@@ -514,41 +503,6 @@ class AnimateDiffImgToVideoPipeline(
|
||||
|
||||
return image_embeds, uncond_image_embeds
|
||||
|
||||
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds
|
||||
def prepare_ip_adapter_image_embeds(
|
||||
self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt
|
||||
):
|
||||
if ip_adapter_image_embeds is None:
|
||||
if not isinstance(ip_adapter_image, list):
|
||||
ip_adapter_image = [ip_adapter_image]
|
||||
|
||||
if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers):
|
||||
raise ValueError(
|
||||
f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
|
||||
)
|
||||
|
||||
image_embeds = []
|
||||
for single_ip_adapter_image, image_proj_layer in zip(
|
||||
ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
|
||||
):
|
||||
output_hidden_state = not isinstance(image_proj_layer, ImageProjection)
|
||||
single_image_embeds, single_negative_image_embeds = self.encode_image(
|
||||
single_ip_adapter_image, device, 1, output_hidden_state
|
||||
)
|
||||
single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0)
|
||||
single_negative_image_embeds = torch.stack(
|
||||
[single_negative_image_embeds] * num_images_per_prompt, dim=0
|
||||
)
|
||||
|
||||
if self.do_classifier_free_guidance:
|
||||
single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
|
||||
single_image_embeds = single_image_embeds.to(device)
|
||||
|
||||
image_embeds.append(single_image_embeds)
|
||||
else:
|
||||
image_embeds = ip_adapter_image_embeds
|
||||
return image_embeds
|
||||
|
||||
# Copied from diffusers.pipelines.text_to_video_synthesis/pipeline_text_to_video_synth.TextToVideoSDPipeline.decode_latents
|
||||
def decode_latents(self, latents):
|
||||
latents = 1 / self.vae.config.scaling_factor * latents
|
||||
@@ -573,6 +527,67 @@ class AnimateDiffImgToVideoPipeline(
|
||||
video = video.float()
|
||||
return video
|
||||
|
||||
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
|
||||
def enable_vae_slicing(self):
|
||||
r"""
|
||||
Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
|
||||
compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
|
||||
"""
|
||||
self.vae.enable_slicing()
|
||||
|
||||
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing
|
||||
def disable_vae_slicing(self):
|
||||
r"""
|
||||
Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
|
||||
computing decoding in one step.
|
||||
"""
|
||||
self.vae.disable_slicing()
|
||||
|
||||
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling
|
||||
def enable_vae_tiling(self):
|
||||
r"""
|
||||
Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
|
||||
compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
|
||||
processing larger images.
|
||||
"""
|
||||
self.vae.enable_tiling()
|
||||
|
||||
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling
|
||||
def disable_vae_tiling(self):
|
||||
r"""
|
||||
Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
|
||||
computing decoding in one step.
|
||||
"""
|
||||
self.vae.disable_tiling()
|
||||
|
||||
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu
|
||||
def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
|
||||
r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497.
|
||||
|
||||
The suffixes after the scaling factors represent the stages where they are being applied.
|
||||
|
||||
Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values
|
||||
that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
|
||||
|
||||
Args:
|
||||
s1 (`float`):
|
||||
Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
|
||||
mitigate "oversmoothing effect" in the enhanced denoising process.
|
||||
s2 (`float`):
|
||||
Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
|
||||
mitigate "oversmoothing effect" in the enhanced denoising process.
|
||||
b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
|
||||
b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
|
||||
"""
|
||||
if not hasattr(self, "unet"):
|
||||
raise ValueError("The pipeline must have `unet` for using FreeU.")
|
||||
self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2)
|
||||
|
||||
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu
|
||||
def disable_freeu(self):
|
||||
"""Disables the FreeU mechanism if enabled."""
|
||||
self.unet.disable_freeu()
|
||||
|
||||
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
|
||||
def prepare_extra_step_kwargs(self, generator, eta):
|
||||
# prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
|
||||
@@ -750,7 +765,6 @@ class AnimateDiffImgToVideoPipeline(
|
||||
prompt_embeds: Optional[torch.FloatTensor] = None,
|
||||
negative_prompt_embeds: Optional[torch.FloatTensor] = None,
|
||||
ip_adapter_image: Optional[PipelineImageInput] = None,
|
||||
ip_adapter_image_embeds: Optional[PipelineImageInput] = None,
|
||||
output_type: Optional[str] = "pil",
|
||||
return_dict: bool = True,
|
||||
callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
|
||||
@@ -804,11 +818,6 @@ class AnimateDiffImgToVideoPipeline(
|
||||
not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
|
||||
ip_adapter_image: (`PipelineImageInput`, *optional*):
|
||||
Optional image input to work with IP Adapters.
|
||||
ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*):
|
||||
Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters.
|
||||
Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding
|
||||
if `do_classifier_free_guidance` is set to `True`.
|
||||
If not provided, embeddings are computed from the `ip_adapter_image` input argument.
|
||||
output_type (`str`, *optional*, defaults to `"pil"`):
|
||||
The output format of the generated video. Choose between `torch.FloatTensor`, `PIL.Image` or
|
||||
`np.array`.
|
||||
@@ -833,8 +842,8 @@ class AnimateDiffImgToVideoPipeline(
|
||||
Examples:
|
||||
|
||||
Returns:
|
||||
[`~pipelines.animatediff.pipeline_output.AnimateDiffPipelineOutput`] or `tuple`:
|
||||
If `return_dict` is `True`, [`~pipelines.animatediff.pipeline_output.AnimateDiffPipelineOutput`] is
|
||||
[`AnimateDiffImgToVideoPipelineOutput`] or `tuple`:
|
||||
If `return_dict` is `True`, [`AnimateDiffImgToVideoPipelineOutput`] is
|
||||
returned, otherwise a `tuple` is returned where the first element is a list with the generated frames.
|
||||
"""
|
||||
# 0. Default height and width to unet
|
||||
@@ -893,9 +902,12 @@ class AnimateDiffImgToVideoPipeline(
|
||||
prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
|
||||
|
||||
if ip_adapter_image is not None:
|
||||
image_embeds = self.prepare_ip_adapter_image_embeds(
|
||||
ip_adapter_image, ip_adapter_image_embeds, device, batch_size * num_videos_per_prompt
|
||||
output_hidden_state = False if isinstance(self.unet.encoder_hid_proj, ImageProjection) else True
|
||||
image_embeds, negative_image_embeds = self.encode_image(
|
||||
ip_adapter_image, device, num_videos_per_prompt, output_hidden_state
|
||||
)
|
||||
if do_classifier_free_guidance:
|
||||
image_embeds = torch.cat([negative_image_embeds, image_embeds])
|
||||
|
||||
# 4. Preprocess image
|
||||
image = self.image_processor.preprocess(image, height=height, width=width)
|
||||
@@ -924,11 +936,7 @@ class AnimateDiffImgToVideoPipeline(
|
||||
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
|
||||
|
||||
# 8. Add image embeds for IP-Adapter
|
||||
added_cond_kwargs = (
|
||||
{"image_embeds": image_embeds}
|
||||
if ip_adapter_image is not None or ip_adapter_image_embeds is not None
|
||||
else None
|
||||
)
|
||||
added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None
|
||||
|
||||
# 9. Denoising loop
|
||||
num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
|
||||
@@ -962,13 +970,14 @@ class AnimateDiffImgToVideoPipeline(
|
||||
callback(i, t, latents)
|
||||
|
||||
if output_type == "latent":
|
||||
return AnimateDiffPipelineOutput(frames=latents)
|
||||
return AnimateDiffImgToVideoPipelineOutput(frames=latents)
|
||||
|
||||
# 10. Post-processing
|
||||
if output_type == "latent":
|
||||
video = latents
|
||||
video_tensor = self.decode_latents(latents)
|
||||
|
||||
if output_type == "pt":
|
||||
video = video_tensor
|
||||
else:
|
||||
video_tensor = self.decode_latents(latents)
|
||||
video = tensor2vid(video_tensor, self.image_processor, output_type=output_type)
|
||||
|
||||
# 11. Offload all models
|
||||
@@ -977,4 +986,4 @@ class AnimateDiffImgToVideoPipeline(
|
||||
if not return_dict:
|
||||
return (video,)
|
||||
|
||||
return AnimateDiffPipelineOutput(frames=video)
|
||||
return AnimateDiffImgToVideoPipelineOutput(frames=video)
|
||||
|
||||
@@ -23,7 +23,7 @@ from diffusers.models.attention_processor import (
|
||||
XFormersAttnProcessor,
|
||||
)
|
||||
from diffusers.models.lora import adjust_lora_scale_text_encoder
|
||||
from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
|
||||
from diffusers.pipelines.pipeline_utils import DiffusionPipeline
|
||||
from diffusers.schedulers import KarrasDiffusionSchedulers
|
||||
from diffusers.utils import (
|
||||
is_accelerate_available,
|
||||
@@ -93,9 +93,7 @@ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
|
||||
return noise_cfg
|
||||
|
||||
|
||||
class DemoFusionSDXLPipeline(
|
||||
DiffusionPipeline, StableDiffusionMixin, FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin
|
||||
):
|
||||
class DemoFusionSDXLPipeline(DiffusionPipeline, FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin):
|
||||
r"""
|
||||
Pipeline for text-to-image generation using Stable Diffusion XL.
|
||||
|
||||
@@ -178,6 +176,39 @@ class DemoFusionSDXLPipeline(
|
||||
else:
|
||||
self.watermark = None
|
||||
|
||||
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
|
||||
def enable_vae_slicing(self):
|
||||
r"""
|
||||
Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
|
||||
compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
|
||||
"""
|
||||
self.vae.enable_slicing()
|
||||
|
||||
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing
|
||||
def disable_vae_slicing(self):
|
||||
r"""
|
||||
Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
|
||||
computing decoding in one step.
|
||||
"""
|
||||
self.vae.disable_slicing()
|
||||
|
||||
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling
|
||||
def enable_vae_tiling(self):
|
||||
r"""
|
||||
Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
|
||||
compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
|
||||
processing larger images.
|
||||
"""
|
||||
self.vae.enable_tiling()
|
||||
|
||||
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling
|
||||
def disable_vae_tiling(self):
|
||||
r"""
|
||||
Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
|
||||
computing decoding in one step.
|
||||
"""
|
||||
self.vae.disable_tiling()
|
||||
|
||||
def encode_prompt(
|
||||
self,
|
||||
prompt: str,
|
||||
@@ -258,7 +289,7 @@ class DemoFusionSDXLPipeline(
|
||||
|
||||
if prompt_embeds is None:
|
||||
prompt_2 = prompt_2 or prompt
|
||||
# textual inversion: process multi-vector tokens if necessary
|
||||
# textual inversion: procecss multi-vector tokens if necessary
|
||||
prompt_embeds_list = []
|
||||
prompts = [prompt, prompt_2]
|
||||
for prompt, tokenizer, text_encoder in zip(prompts, tokenizers, text_encoders):
|
||||
|
||||
@@ -233,7 +233,7 @@ class FabricPipeline(DiffusionPipeline):
|
||||
batch_size = prompt_embeds.shape[0]
|
||||
|
||||
if prompt_embeds is None:
|
||||
# textual inversion: process multi-vector tokens if necessary
|
||||
# textual inversion: procecss multi-vector tokens if necessary
|
||||
if isinstance(self, TextualInversionLoaderMixin):
|
||||
prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
|
||||
|
||||
@@ -304,7 +304,7 @@ class FabricPipeline(DiffusionPipeline):
|
||||
else:
|
||||
uncond_tokens = negative_prompt
|
||||
|
||||
# textual inversion: process multi-vector tokens if necessary
|
||||
# textual inversion: procecss multi-vector tokens if necessary
|
||||
if isinstance(self, TextualInversionLoaderMixin):
|
||||
uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
|
||||
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user