mirror of
https://github.com/huggingface/diffusers.git
synced 2025-12-07 13:04:15 +08:00
Compare commits
2 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
3538e03a3f | ||
|
|
d5a0f0ddd2 |
146
.github/actions/setup-miniconda/action.yml
vendored
146
.github/actions/setup-miniconda/action.yml
vendored
@@ -1,146 +0,0 @@
|
||||
name: Set up conda environment for testing
|
||||
|
||||
description: Sets up miniconda in your ${RUNNER_TEMP} environment and gives you the ${CONDA_RUN} environment variable so you don't have to worry about polluting non-empeheral runners anymore
|
||||
|
||||
inputs:
|
||||
python-version:
|
||||
description: If set to any value, dont use sudo to clean the workspace
|
||||
required: false
|
||||
type: string
|
||||
default: "3.9"
|
||||
miniconda-version:
|
||||
description: Miniconda version to install
|
||||
required: false
|
||||
type: string
|
||||
default: "4.12.0"
|
||||
environment-file:
|
||||
description: Environment file to install dependencies from
|
||||
required: false
|
||||
type: string
|
||||
default: ""
|
||||
|
||||
runs:
|
||||
using: composite
|
||||
steps:
|
||||
# Use the same trick from https://github.com/marketplace/actions/setup-miniconda
|
||||
# to refresh the cache daily. This is kind of optional though
|
||||
- name: Get date
|
||||
id: get-date
|
||||
shell: bash
|
||||
run: echo "::set-output name=today::$(/bin/date -u '+%Y%m%d')d"
|
||||
- name: Setup miniconda cache
|
||||
id: miniconda-cache
|
||||
uses: actions/cache@v2
|
||||
with:
|
||||
path: ${{ runner.temp }}/miniconda
|
||||
key: miniconda-${{ runner.os }}-${{ runner.arch }}-${{ inputs.python-version }}-${{ steps.get-date.outputs.today }}
|
||||
- name: Install miniconda (${{ inputs.miniconda-version }})
|
||||
if: steps.miniconda-cache.outputs.cache-hit != 'true'
|
||||
env:
|
||||
MINICONDA_VERSION: ${{ inputs.miniconda-version }}
|
||||
shell: bash -l {0}
|
||||
run: |
|
||||
MINICONDA_INSTALL_PATH="${RUNNER_TEMP}/miniconda"
|
||||
mkdir -p "${MINICONDA_INSTALL_PATH}"
|
||||
case ${RUNNER_OS}-${RUNNER_ARCH} in
|
||||
Linux-X64)
|
||||
MINICONDA_ARCH="Linux-x86_64"
|
||||
;;
|
||||
macOS-ARM64)
|
||||
MINICONDA_ARCH="MacOSX-arm64"
|
||||
;;
|
||||
macOS-X64)
|
||||
MINICONDA_ARCH="MacOSX-x86_64"
|
||||
;;
|
||||
*)
|
||||
echo "::error::Platform ${RUNNER_OS}-${RUNNER_ARCH} currently unsupported using this action"
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
MINICONDA_URL="https://repo.anaconda.com/miniconda/Miniconda3-py39_${MINICONDA_VERSION}-${MINICONDA_ARCH}.sh"
|
||||
curl -fsSL "${MINICONDA_URL}" -o "${MINICONDA_INSTALL_PATH}/miniconda.sh"
|
||||
bash "${MINICONDA_INSTALL_PATH}/miniconda.sh" -b -u -p "${MINICONDA_INSTALL_PATH}"
|
||||
rm -rf "${MINICONDA_INSTALL_PATH}/miniconda.sh"
|
||||
- name: Update GitHub path to include miniconda install
|
||||
shell: bash
|
||||
run: |
|
||||
MINICONDA_INSTALL_PATH="${RUNNER_TEMP}/miniconda"
|
||||
echo "${MINICONDA_INSTALL_PATH}/bin" >> $GITHUB_PATH
|
||||
- name: Setup miniconda env cache (with env file)
|
||||
id: miniconda-env-cache-env-file
|
||||
if: ${{ runner.os }} == 'macOS' && ${{ inputs.environment-file }} != ''
|
||||
uses: actions/cache@v2
|
||||
with:
|
||||
path: ${{ runner.temp }}/conda-python-${{ inputs.python-version }}
|
||||
key: miniconda-env-${{ runner.os }}-${{ runner.arch }}-${{ inputs.python-version }}-${{ steps.get-date.outputs.today }}-${{ hashFiles(inputs.environment-file) }}
|
||||
- name: Setup miniconda env cache (without env file)
|
||||
id: miniconda-env-cache
|
||||
if: ${{ runner.os }} == 'macOS' && ${{ inputs.environment-file }} == ''
|
||||
uses: actions/cache@v2
|
||||
with:
|
||||
path: ${{ runner.temp }}/conda-python-${{ inputs.python-version }}
|
||||
key: miniconda-env-${{ runner.os }}-${{ runner.arch }}-${{ inputs.python-version }}-${{ steps.get-date.outputs.today }}
|
||||
- name: Setup conda environment with python (v${{ inputs.python-version }})
|
||||
if: steps.miniconda-env-cache-env-file.outputs.cache-hit != 'true' && steps.miniconda-env-cache.outputs.cache-hit != 'true'
|
||||
shell: bash
|
||||
env:
|
||||
PYTHON_VERSION: ${{ inputs.python-version }}
|
||||
ENV_FILE: ${{ inputs.environment-file }}
|
||||
run: |
|
||||
CONDA_BASE_ENV="${RUNNER_TEMP}/conda-python-${PYTHON_VERSION}"
|
||||
ENV_FILE_FLAG=""
|
||||
if [[ -f "${ENV_FILE}" ]]; then
|
||||
ENV_FILE_FLAG="--file ${ENV_FILE}"
|
||||
elif [[ -n "${ENV_FILE}" ]]; then
|
||||
echo "::warning::Specified env file (${ENV_FILE}) not found, not going to include it"
|
||||
fi
|
||||
conda create \
|
||||
--yes \
|
||||
--prefix "${CONDA_BASE_ENV}" \
|
||||
"python=${PYTHON_VERSION}" \
|
||||
${ENV_FILE_FLAG} \
|
||||
cmake=3.22 \
|
||||
conda-build=3.21 \
|
||||
ninja=1.10 \
|
||||
pkg-config=0.29 \
|
||||
wheel=0.37
|
||||
- name: Clone the base conda environment and update GitHub env
|
||||
shell: bash
|
||||
env:
|
||||
PYTHON_VERSION: ${{ inputs.python-version }}
|
||||
CONDA_BASE_ENV: ${{ runner.temp }}/conda-python-${{ inputs.python-version }}
|
||||
run: |
|
||||
CONDA_ENV="${RUNNER_TEMP}/conda_environment_${GITHUB_RUN_ID}"
|
||||
conda create \
|
||||
--yes \
|
||||
--prefix "${CONDA_ENV}" \
|
||||
--clone "${CONDA_BASE_ENV}"
|
||||
# TODO: conda-build could not be cloned because it hardcodes the path, so it
|
||||
# could not be cached
|
||||
conda install --yes -p ${CONDA_ENV} conda-build=3.21
|
||||
echo "CONDA_ENV=${CONDA_ENV}" >> "${GITHUB_ENV}"
|
||||
echo "CONDA_RUN=conda run -p ${CONDA_ENV} --no-capture-output" >> "${GITHUB_ENV}"
|
||||
echo "CONDA_BUILD=conda run -p ${CONDA_ENV} conda-build" >> "${GITHUB_ENV}"
|
||||
echo "CONDA_INSTALL=conda install -p ${CONDA_ENV}" >> "${GITHUB_ENV}"
|
||||
- name: Get disk space usage and throw an error for low disk space
|
||||
shell: bash
|
||||
run: |
|
||||
echo "Print the available disk space for manual inspection"
|
||||
df -h
|
||||
# Set the minimum requirement space to 4GB
|
||||
MINIMUM_AVAILABLE_SPACE_IN_GB=4
|
||||
MINIMUM_AVAILABLE_SPACE_IN_KB=$(($MINIMUM_AVAILABLE_SPACE_IN_GB * 1024 * 1024))
|
||||
# Use KB to avoid floating point warning like 3.1GB
|
||||
df -k | tr -s ' ' | cut -d' ' -f 4,9 | while read -r LINE;
|
||||
do
|
||||
AVAIL=$(echo $LINE | cut -f1 -d' ')
|
||||
MOUNT=$(echo $LINE | cut -f2 -d' ')
|
||||
if [ "$MOUNT" = "/" ]; then
|
||||
if [ "$AVAIL" -lt "$MINIMUM_AVAILABLE_SPACE_IN_KB" ]; then
|
||||
echo "There is only ${AVAIL}KB free space left in $MOUNT, which is less than the minimum requirement of ${MINIMUM_AVAILABLE_SPACE_IN_KB}KB. Please help create an issue to PyTorch Release Engineering via https://github.com/pytorch/test-infra/issues and provide the link to the workflow run."
|
||||
exit 1;
|
||||
else
|
||||
echo "There is ${AVAIL}KB free space left in $MOUNT, continue"
|
||||
fi
|
||||
fi
|
||||
done
|
||||
50
.github/workflows/build_docker_images.yml
vendored
50
.github/workflows/build_docker_images.yml
vendored
@@ -1,50 +0,0 @@
|
||||
name: Build Docker images (nightly)
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
schedule:
|
||||
- cron: "0 0 * * *" # every day at midnight
|
||||
|
||||
concurrency:
|
||||
group: docker-image-builds
|
||||
cancel-in-progress: false
|
||||
|
||||
env:
|
||||
REGISTRY: diffusers
|
||||
|
||||
jobs:
|
||||
build-docker-images:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
packages: write
|
||||
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
image-name:
|
||||
- diffusers-pytorch-cpu
|
||||
- diffusers-pytorch-cuda
|
||||
- diffusers-flax-cpu
|
||||
- diffusers-flax-tpu
|
||||
- diffusers-onnxruntime-cpu
|
||||
- diffusers-onnxruntime-cuda
|
||||
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Login to Docker Hub
|
||||
uses: docker/login-action@v2
|
||||
with:
|
||||
username: ${{ env.REGISTRY }}
|
||||
password: ${{ secrets.DOCKERHUB_TOKEN }}
|
||||
|
||||
- name: Build and push
|
||||
uses: docker/build-push-action@v3
|
||||
with:
|
||||
no-cache: true
|
||||
context: ./docker/${{ matrix.image-name }}
|
||||
push: true
|
||||
tags: ${{ env.REGISTRY }}/${{ matrix.image-name }}:latest
|
||||
17
.github/workflows/pr_quality.yml
vendored
17
.github/workflows/pr_quality.yml
vendored
@@ -31,20 +31,3 @@ jobs:
|
||||
isort --check-only examples tests src utils scripts
|
||||
flake8 examples tests src utils scripts
|
||||
doc-builder style src/diffusers docs/source --max_len 119 --check_only --path_to_docs docs/source
|
||||
|
||||
check_repository_consistency:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: "3.7"
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install .[quality]
|
||||
- name: Check quality
|
||||
run: |
|
||||
python utils/check_copies.py
|
||||
python utils/check_dummies.py
|
||||
|
||||
123
.github/workflows/pr_tests.yml
vendored
123
.github/workflows/pr_tests.yml
vendored
@@ -1,4 +1,4 @@
|
||||
name: Run fast tests
|
||||
name: Run non-slow tests
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
@@ -10,46 +10,19 @@ concurrency:
|
||||
cancel-in-progress: true
|
||||
|
||||
env:
|
||||
DIFFUSERS_IS_CI: yes
|
||||
OMP_NUM_THREADS: 4
|
||||
MKL_NUM_THREADS: 4
|
||||
HF_HOME: /mnt/cache
|
||||
OMP_NUM_THREADS: 8
|
||||
MKL_NUM_THREADS: 8
|
||||
PYTEST_TIMEOUT: 60
|
||||
MPS_TORCH_VERSION: 1.13.0
|
||||
|
||||
jobs:
|
||||
run_fast_tests:
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
config:
|
||||
- name: Fast PyTorch CPU tests on Ubuntu
|
||||
framework: pytorch
|
||||
runner: docker-cpu
|
||||
image: diffusers/diffusers-pytorch-cpu
|
||||
report: torch_cpu
|
||||
- name: Fast Flax CPU tests on Ubuntu
|
||||
framework: flax
|
||||
runner: docker-cpu
|
||||
image: diffusers/diffusers-flax-cpu
|
||||
report: flax_cpu
|
||||
- name: Fast ONNXRuntime CPU tests on Ubuntu
|
||||
framework: onnxruntime
|
||||
runner: docker-cpu
|
||||
image: diffusers/diffusers-onnxruntime-cpu
|
||||
report: onnx_cpu
|
||||
|
||||
name: ${{ matrix.config.name }}
|
||||
|
||||
runs-on: ${{ matrix.config.runner }}
|
||||
|
||||
run_tests_cpu:
|
||||
name: Diffusers tests
|
||||
runs-on: [ self-hosted, docker-gpu ]
|
||||
container:
|
||||
image: ${{ matrix.config.image }}
|
||||
image: python:3.7
|
||||
options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/
|
||||
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
|
||||
steps:
|
||||
- name: Checkout diffusers
|
||||
uses: actions/checkout@v3
|
||||
@@ -58,93 +31,25 @@ jobs:
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
python -m pip install torch --extra-index-url https://download.pytorch.org/whl/cpu
|
||||
python -m pip install -e .[quality,test]
|
||||
python -m pip install git+https://github.com/huggingface/accelerate
|
||||
|
||||
- name: Environment
|
||||
run: |
|
||||
python utils/print_env.py
|
||||
|
||||
- name: Run fast PyTorch CPU tests
|
||||
if: ${{ matrix.config.framework == 'pytorch' }}
|
||||
- name: Run all non-slow selected tests on CPU
|
||||
run: |
|
||||
python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \
|
||||
-s -v -k "not Flax and not Onnx" \
|
||||
--make-reports=tests_${{ matrix.config.report }} \
|
||||
tests/
|
||||
|
||||
- name: Run fast Flax TPU tests
|
||||
if: ${{ matrix.config.framework == 'flax' }}
|
||||
run: |
|
||||
python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \
|
||||
-s -v -k "Flax" \
|
||||
--make-reports=tests_${{ matrix.config.report }} \
|
||||
tests/
|
||||
|
||||
- name: Run fast ONNXRuntime CPU tests
|
||||
if: ${{ matrix.config.framework == 'onnxruntime' }}
|
||||
run: |
|
||||
python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \
|
||||
-s -v -k "Onnx" \
|
||||
--make-reports=tests_${{ matrix.config.report }} \
|
||||
tests/
|
||||
python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile -s -v --make-reports=tests_torch_cpu tests/
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
run: cat reports/tests_${{ matrix.config.report }}_failures_short.txt
|
||||
run: cat reports/tests_torch_cpu_failures_short.txt
|
||||
|
||||
- name: Test suite reports artifacts
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@v2
|
||||
with:
|
||||
name: pr_${{ matrix.config.report }}_test_reports
|
||||
path: reports
|
||||
|
||||
run_fast_tests_apple_m1:
|
||||
name: Fast PyTorch MPS tests on MacOS
|
||||
runs-on: [ self-hosted, apple-m1 ]
|
||||
|
||||
steps:
|
||||
- name: Checkout diffusers
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
fetch-depth: 2
|
||||
|
||||
- name: Clean checkout
|
||||
shell: arch -arch arm64 bash {0}
|
||||
run: |
|
||||
git clean -fxd
|
||||
|
||||
- name: Setup miniconda
|
||||
uses: ./.github/actions/setup-miniconda
|
||||
with:
|
||||
python-version: 3.9
|
||||
|
||||
- name: Install dependencies
|
||||
shell: arch -arch arm64 bash {0}
|
||||
run: |
|
||||
${CONDA_RUN} python -m pip install --upgrade pip
|
||||
${CONDA_RUN} python -m pip install -e .[quality,test]
|
||||
${CONDA_RUN} python -m pip install --pre torch==${MPS_TORCH_VERSION} --extra-index-url https://download.pytorch.org/whl/test/cpu
|
||||
${CONDA_RUN} python -m pip install git+https://github.com/huggingface/accelerate
|
||||
|
||||
- name: Environment
|
||||
shell: arch -arch arm64 bash {0}
|
||||
run: |
|
||||
${CONDA_RUN} python utils/print_env.py
|
||||
|
||||
- name: Run fast PyTorch tests on M1 (MPS)
|
||||
shell: arch -arch arm64 bash {0}
|
||||
run: |
|
||||
${CONDA_RUN} python -m pytest -n 1 -s -v --make-reports=tests_torch_mps tests/
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
run: cat reports/tests_torch_mps_failures_short.txt
|
||||
|
||||
- name: Test suite reports artifacts
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@v2
|
||||
with:
|
||||
name: pr_torch_mps_test_reports
|
||||
name: pr_torch_test_reports
|
||||
path: reports
|
||||
|
||||
92
.github/workflows/push_tests.yml
vendored
92
.github/workflows/push_tests.yml
vendored
@@ -6,7 +6,6 @@ on:
|
||||
- main
|
||||
|
||||
env:
|
||||
DIFFUSERS_IS_CI: yes
|
||||
HF_HOME: /mnt/cache
|
||||
OMP_NUM_THREADS: 8
|
||||
MKL_NUM_THREADS: 8
|
||||
@@ -14,38 +13,12 @@ env:
|
||||
RUN_SLOW: yes
|
||||
|
||||
jobs:
|
||||
run_slow_tests:
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
config:
|
||||
- name: Slow PyTorch CUDA tests on Ubuntu
|
||||
framework: pytorch
|
||||
runner: docker-gpu
|
||||
image: diffusers/diffusers-pytorch-cuda
|
||||
report: torch_cuda
|
||||
- name: Slow Flax TPU tests on Ubuntu
|
||||
framework: flax
|
||||
runner: docker-tpu
|
||||
image: diffusers/diffusers-flax-tpu
|
||||
report: flax_tpu
|
||||
- name: Slow ONNXRuntime CUDA tests on Ubuntu
|
||||
framework: onnxruntime
|
||||
runner: docker-gpu
|
||||
image: diffusers/diffusers-onnxruntime-cuda
|
||||
report: onnx_cuda
|
||||
|
||||
name: ${{ matrix.config.name }}
|
||||
|
||||
runs-on: ${{ matrix.config.runner }}
|
||||
|
||||
run_tests_single_gpu:
|
||||
name: Diffusers tests
|
||||
runs-on: [ self-hosted, docker-gpu, single-gpu ]
|
||||
container:
|
||||
image: ${{ matrix.config.image }}
|
||||
options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ ${{ matrix.config.runner == 'docker-tpu' && '--privileged' || '--gpus 0'}}
|
||||
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
image: nvcr.io/nvidia/pytorch:22.07-py3
|
||||
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache
|
||||
|
||||
steps:
|
||||
- name: Checkout diffusers
|
||||
@@ -54,68 +27,45 @@ jobs:
|
||||
fetch-depth: 2
|
||||
|
||||
- name: NVIDIA-SMI
|
||||
if : ${{ matrix.config.runner == 'docker-gpu' }}
|
||||
run: |
|
||||
nvidia-smi
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
python -m pip uninstall -y torch torchvision torchtext
|
||||
python -m pip install torch --extra-index-url https://download.pytorch.org/whl/cu116
|
||||
python -m pip install -e .[quality,test]
|
||||
python -m pip install git+https://github.com/huggingface/accelerate
|
||||
|
||||
- name: Environment
|
||||
run: |
|
||||
python utils/print_env.py
|
||||
|
||||
- name: Run slow PyTorch CUDA tests
|
||||
if: ${{ matrix.config.framework == 'pytorch' }}
|
||||
- name: Run all (incl. slow) tests on GPU
|
||||
env:
|
||||
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
|
||||
run: |
|
||||
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
|
||||
-s -v -k "not Flax and not Onnx" \
|
||||
--make-reports=tests_${{ matrix.config.report }} \
|
||||
tests/
|
||||
|
||||
- name: Run slow Flax TPU tests
|
||||
if: ${{ matrix.config.framework == 'flax' }}
|
||||
env:
|
||||
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
|
||||
run: |
|
||||
python -m pytest -n 0 \
|
||||
-s -v -k "Flax" \
|
||||
--make-reports=tests_${{ matrix.config.report }} \
|
||||
tests/
|
||||
|
||||
- name: Run slow ONNXRuntime CUDA tests
|
||||
if: ${{ matrix.config.framework == 'onnxruntime' }}
|
||||
env:
|
||||
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
|
||||
run: |
|
||||
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
|
||||
-s -v -k "Onnx" \
|
||||
--make-reports=tests_${{ matrix.config.report }} \
|
||||
tests/
|
||||
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v --make-reports=tests_torch_gpu tests/
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
run: cat reports/tests_${{ matrix.config.report }}_failures_short.txt
|
||||
run: cat reports/tests_torch_gpu_failures_short.txt
|
||||
|
||||
- name: Test suite reports artifacts
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@v2
|
||||
with:
|
||||
name: ${{ matrix.config.report }}_test_reports
|
||||
name: torch_test_reports
|
||||
path: reports
|
||||
|
||||
run_examples_tests:
|
||||
name: Examples PyTorch CUDA tests on Ubuntu
|
||||
|
||||
runs-on: docker-gpu
|
||||
|
||||
run_examples_single_gpu:
|
||||
name: Examples tests
|
||||
runs-on: [ self-hosted, docker-gpu, single-gpu ]
|
||||
container:
|
||||
image: diffusers/diffusers-pytorch-cuda
|
||||
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/
|
||||
image: nvcr.io/nvidia/pytorch:22.07-py3
|
||||
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache
|
||||
|
||||
steps:
|
||||
- name: Checkout diffusers
|
||||
@@ -129,8 +79,10 @@ jobs:
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
python -m pip uninstall -y torch torchvision torchtext
|
||||
python -m pip install torch --extra-index-url https://download.pytorch.org/whl/cu116
|
||||
python -m pip install -e .[quality,test,training]
|
||||
python -m pip install git+https://github.com/huggingface/accelerate
|
||||
|
||||
- name: Environment
|
||||
run: |
|
||||
@@ -140,11 +92,11 @@ jobs:
|
||||
env:
|
||||
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
|
||||
run: |
|
||||
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v --make-reports=examples_torch_cuda examples/
|
||||
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v --make-reports=examples_torch_gpu examples/
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
run: cat reports/examples_torch_cuda_failures_short.txt
|
||||
run: cat reports/examples_torch_gpu_failures_short.txt
|
||||
|
||||
- name: Test suite reports artifacts
|
||||
if: ${{ always() }}
|
||||
|
||||
1
Makefile
1
Makefile
@@ -67,7 +67,6 @@ fixup: modified_only_fixup extra_style_checks autogenerate_code repo-consistency
|
||||
# Make marked copies of snippets of codes conform to the original
|
||||
|
||||
fix-copies:
|
||||
python utils/check_copies.py --fix_and_overwrite
|
||||
python utils/check_dummies.py --fix_and_overwrite
|
||||
|
||||
# Run tests for the library
|
||||
|
||||
181
README.md
181
README.md
@@ -27,12 +27,10 @@ More precisely, 🤗 Diffusers offers:
|
||||
|
||||
## Installation
|
||||
|
||||
### For PyTorch
|
||||
|
||||
**With `pip`**
|
||||
|
||||
```bash
|
||||
pip install --upgrade diffusers[torch]
|
||||
pip install --upgrade diffusers
|
||||
```
|
||||
|
||||
**With `conda`**
|
||||
@@ -41,14 +39,6 @@ pip install --upgrade diffusers[torch]
|
||||
conda install -c conda-forge diffusers
|
||||
```
|
||||
|
||||
### For Flax
|
||||
|
||||
**With `pip`**
|
||||
|
||||
```bash
|
||||
pip install --upgrade diffusers[flax]
|
||||
```
|
||||
|
||||
**Apple Silicon (M1/M2) support**
|
||||
|
||||
Please, refer to [the documentation](https://huggingface.co/docs/diffusers/optimization/mps).
|
||||
@@ -74,55 +64,44 @@ In order to get started, we recommend taking a look at two notebooks:
|
||||
- The [Training a diffusers model](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/training_example.ipynb) [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/training_example.ipynb) notebook summarizes diffusion models training methods. This notebook takes a step-by-step approach to training your
|
||||
diffusion models on an image dataset, with explanatory graphics.
|
||||
|
||||
## Stable Diffusion is fully compatible with `diffusers`!
|
||||
## **New** Stable Diffusion is now fully compatible with `diffusers`!
|
||||
|
||||
Stable Diffusion is a text-to-image latent diffusion model created by the researchers and engineers from [CompVis](https://github.com/CompVis), [Stability AI](https://stability.ai/), [LAION](https://laion.ai/) and [RunwayML](https://runwayml.com/). It's trained on 512x512 images from a subset of the [LAION-5B](https://laion.ai/blog/laion-5b/) database. This model uses a frozen CLIP ViT-L/14 text encoder to condition the model on text prompts. With its 860M UNet and 123M text encoder, the model is relatively lightweight and runs on a GPU with at least 4GB VRAM.
|
||||
Stable Diffusion is a text-to-image latent diffusion model created by the researchers and engineers from [CompVis](https://github.com/CompVis), [Stability AI](https://stability.ai/) and [LAION](https://laion.ai/). It's trained on 512x512 images from a subset of the [LAION-5B](https://laion.ai/blog/laion-5b/) database. This model uses a frozen CLIP ViT-L/14 text encoder to condition the model on text prompts. With its 860M UNet and 123M text encoder, the model is relatively lightweight and runs on a GPU with at least 10GB VRAM.
|
||||
See the [model card](https://huggingface.co/CompVis/stable-diffusion) for more information.
|
||||
|
||||
You need to accept the model license before downloading or using the Stable Diffusion weights. Please, visit the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5), read the license carefully and tick the checkbox if you agree. You have to be a registered user in 🤗 Hugging Face Hub, and you'll also need to use an access token for the code to work. For more information on access tokens, please refer to [this section](https://huggingface.co/docs/hub/security-tokens) of the documentation.
|
||||
You need to accept the model license before downloading or using the Stable Diffusion weights. Please, visit the [model card](https://huggingface.co/CompVis/stable-diffusion-v1-4), read the license and tick the checkbox if you agree. You have to be a registered user in 🤗 Hugging Face Hub, and you'll also need to use an access token for the code to work. For more information on access tokens, please refer to [this section](https://huggingface.co/docs/hub/security-tokens) of the documentation.
|
||||
|
||||
|
||||
### Text-to-Image generation with Stable Diffusion
|
||||
|
||||
First let's install
|
||||
```bash
|
||||
pip install --upgrade diffusers transformers scipy
|
||||
```
|
||||
|
||||
Run this command to log in with your HF Hub token if you haven't before (you can skip this step if you prefer to run the model locally, follow [this](#running-the-model-locally) instead)
|
||||
```bash
|
||||
huggingface-cli login
|
||||
```
|
||||
|
||||
We recommend using the model in [half-precision (`fp16`)](https://pytorch.org/blog/accelerating-training-on-nvidia-gpus-with-pytorch-automatic-mixed-precision/) as it gives almost always the same results as full
|
||||
precision while being roughly twice as fast and requiring half the amount of GPU RAM.
|
||||
|
||||
```python
|
||||
import torch
|
||||
# make sure you're logged in with `huggingface-cli login`
|
||||
from diffusers import StableDiffusionPipeline
|
||||
|
||||
pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, revision="fp16")
|
||||
pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_type=torch.float16, revision="fp16")
|
||||
pipe = pipe.to("cuda")
|
||||
|
||||
prompt = "a photo of an astronaut riding a horse on mars"
|
||||
image = pipe(prompt).images[0]
|
||||
```
|
||||
|
||||
#### Running the model locally
|
||||
If you don't want to login to Hugging Face, you can also simply download the model folder
|
||||
(after having [accepted the license](https://huggingface.co/runwayml/stable-diffusion-v1-5)) and pass
|
||||
**Note**: If you don't want to use the token, you can also simply download the model weights
|
||||
(after having [accepted the license](https://huggingface.co/CompVis/stable-diffusion-v1-4)) and pass
|
||||
the path to the local folder to the `StableDiffusionPipeline`.
|
||||
|
||||
```
|
||||
git lfs install
|
||||
git clone https://huggingface.co/runwayml/stable-diffusion-v1-5
|
||||
git clone https://huggingface.co/CompVis/stable-diffusion-v1-4
|
||||
```
|
||||
|
||||
Assuming the folder is stored locally under `./stable-diffusion-v1-5`, you can also run stable diffusion
|
||||
Assuming the folder is stored locally under `./stable-diffusion-v1-4`, you can also run stable diffusion
|
||||
without requiring an authentication token:
|
||||
|
||||
```python
|
||||
pipe = StableDiffusionPipeline.from_pretrained("./stable-diffusion-v1-5")
|
||||
pipe = StableDiffusionPipeline.from_pretrained("./stable-diffusion-v1-4")
|
||||
pipe = pipe.to("cuda")
|
||||
|
||||
prompt = "a photo of an astronaut riding a horse on mars"
|
||||
@@ -135,7 +114,7 @@ The following snippet should result in less than 4GB VRAM.
|
||||
|
||||
```python
|
||||
pipe = StableDiffusionPipeline.from_pretrained(
|
||||
"runwayml/stable-diffusion-v1-5",
|
||||
"CompVis/stable-diffusion-v1-4",
|
||||
revision="fp16",
|
||||
torch_dtype=torch.float16,
|
||||
)
|
||||
@@ -146,16 +125,20 @@ pipe.enable_attention_slicing()
|
||||
image = pipe(prompt).images[0]
|
||||
```
|
||||
|
||||
If you wish to use a different scheduler (e.g.: DDIM, LMS, PNDM/PLMS), you can instantiate
|
||||
If you wish to use a different scheduler, you can simply instantiate
|
||||
it before the pipeline and pass it to `from_pretrained`.
|
||||
|
||||
```python
|
||||
from diffusers import LMSDiscreteScheduler
|
||||
|
||||
lms = LMSDiscreteScheduler.from_config("CompVis/stable-diffusion-v1-4", subfolder="scheduler")
|
||||
lms = LMSDiscreteScheduler(
|
||||
beta_start=0.00085,
|
||||
beta_end=0.012,
|
||||
beta_schedule="scaled_linear"
|
||||
)
|
||||
|
||||
pipe = StableDiffusionPipeline.from_pretrained(
|
||||
"runwayml/stable-diffusion-v1-5",
|
||||
"CompVis/stable-diffusion-v1-4",
|
||||
revision="fp16",
|
||||
torch_dtype=torch.float16,
|
||||
scheduler=lms,
|
||||
@@ -175,7 +158,7 @@ please run the model in the default *full-precision* setting:
|
||||
# make sure you're logged in with `huggingface-cli login`
|
||||
from diffusers import StableDiffusionPipeline
|
||||
|
||||
pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
|
||||
pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
|
||||
|
||||
# disable the following line if you run on CPU
|
||||
pipe = pipe.to("cuda")
|
||||
@@ -186,75 +169,6 @@ image = pipe(prompt).images[0]
|
||||
image.save("astronaut_rides_horse.png")
|
||||
```
|
||||
|
||||
### JAX/Flax
|
||||
|
||||
Diffusers offers a JAX / Flax implementation of Stable Diffusion for very fast inference. JAX shines specially on TPU hardware because each TPU server has 8 accelerators working in parallel, but it runs great on GPUs too.
|
||||
|
||||
Running the pipeline with the default PNDMScheduler:
|
||||
|
||||
```python
|
||||
import jax
|
||||
import numpy as np
|
||||
from flax.jax_utils import replicate
|
||||
from flax.training.common_utils import shard
|
||||
|
||||
from diffusers import FlaxStableDiffusionPipeline
|
||||
|
||||
pipeline, params = FlaxStableDiffusionPipeline.from_pretrained(
|
||||
"runwayml/stable-diffusion-v1-5", revision="flax", dtype=jax.numpy.bfloat16
|
||||
)
|
||||
|
||||
prompt = "a photo of an astronaut riding a horse on mars"
|
||||
|
||||
prng_seed = jax.random.PRNGKey(0)
|
||||
num_inference_steps = 50
|
||||
|
||||
num_samples = jax.device_count()
|
||||
prompt = num_samples * [prompt]
|
||||
prompt_ids = pipeline.prepare_inputs(prompt)
|
||||
|
||||
# shard inputs and rng
|
||||
params = replicate(params)
|
||||
prng_seed = jax.random.split(prng_seed, jax.device_count())
|
||||
prompt_ids = shard(prompt_ids)
|
||||
|
||||
images = pipeline(prompt_ids, params, prng_seed, num_inference_steps, jit=True).images
|
||||
images = pipeline.numpy_to_pil(np.asarray(images.reshape((num_samples,) + images.shape[-3:])))
|
||||
```
|
||||
|
||||
**Note**:
|
||||
If you are limited by TPU memory, please make sure to load the `FlaxStableDiffusionPipeline` in `bfloat16` precision instead of the default `float32` precision as done above. You can do so by telling diffusers to load the weights from "bf16" branch.
|
||||
|
||||
```python
|
||||
import jax
|
||||
import numpy as np
|
||||
from flax.jax_utils import replicate
|
||||
from flax.training.common_utils import shard
|
||||
|
||||
from diffusers import FlaxStableDiffusionPipeline
|
||||
|
||||
pipeline, params = FlaxStableDiffusionPipeline.from_pretrained(
|
||||
"runwayml/stable-diffusion-v1-5", revision="bf16", dtype=jax.numpy.bfloat16
|
||||
)
|
||||
|
||||
prompt = "a photo of an astronaut riding a horse on mars"
|
||||
|
||||
prng_seed = jax.random.PRNGKey(0)
|
||||
num_inference_steps = 50
|
||||
|
||||
num_samples = jax.device_count()
|
||||
prompt = num_samples * [prompt]
|
||||
prompt_ids = pipeline.prepare_inputs(prompt)
|
||||
|
||||
# shard inputs and rng
|
||||
params = replicate(params)
|
||||
prng_seed = jax.random.split(prng_seed, jax.device_count())
|
||||
prompt_ids = shard(prompt_ids)
|
||||
|
||||
images = pipeline(prompt_ids, params, prng_seed, num_inference_steps, jit=True).images
|
||||
images = pipeline.numpy_to_pil(np.asarray(images.reshape((num_samples,) + images.shape[-3:])))
|
||||
```
|
||||
|
||||
### Image-to-Image text-guided generation with Stable Diffusion
|
||||
|
||||
The `StableDiffusionImg2ImgPipeline` lets you pass a text prompt and an initial image to condition the generation of new images.
|
||||
@@ -269,14 +183,14 @@ from diffusers import StableDiffusionImg2ImgPipeline
|
||||
|
||||
# load the pipeline
|
||||
device = "cuda"
|
||||
model_id_or_path = "runwayml/stable-diffusion-v1-5"
|
||||
model_id_or_path = "CompVis/stable-diffusion-v1-4"
|
||||
pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
|
||||
model_id_or_path,
|
||||
revision="fp16",
|
||||
torch_dtype=torch.float16,
|
||||
)
|
||||
# or download via git clone https://huggingface.co/runwayml/stable-diffusion-v1-5
|
||||
# and pass `model_id_or_path="./stable-diffusion-v1-5"`.
|
||||
# or download via git clone https://huggingface.co/CompVis/stable-diffusion-v1-4
|
||||
# and pass `model_id_or_path="./stable-diffusion-v1-4"`.
|
||||
pipe = pipe.to(device)
|
||||
|
||||
# let's download an initial image
|
||||
@@ -296,17 +210,15 @@ You can also run this example on colab [, read the license carefully and tick the checkbox if you agree. Note that this is an additional license, you need to accept it even if you accepted the text-to-image Stable Diffusion license in the past. You have to be a registered user in 🤗 Hugging Face Hub, and you'll also need to use an access token for the code to work. For more information on access tokens, please refer to [this section](https://huggingface.co/docs/hub/security-tokens) of the documentation.
|
||||
|
||||
The `StableDiffusionInpaintPipeline` lets you edit specific parts of an image by providing a mask and text prompt.
|
||||
|
||||
```python
|
||||
import PIL
|
||||
import requests
|
||||
import torch
|
||||
from io import BytesIO
|
||||
|
||||
import torch
|
||||
import requests
|
||||
import PIL
|
||||
|
||||
from diffusers import StableDiffusionInpaintPipeline
|
||||
|
||||
def download_image(url):
|
||||
@@ -319,15 +231,21 @@ mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data
|
||||
init_image = download_image(img_url).resize((512, 512))
|
||||
mask_image = download_image(mask_url).resize((512, 512))
|
||||
|
||||
device = "cuda"
|
||||
model_id_or_path = "CompVis/stable-diffusion-v1-4"
|
||||
pipe = StableDiffusionInpaintPipeline.from_pretrained(
|
||||
"runwayml/stable-diffusion-inpainting",
|
||||
model_id_or_path,
|
||||
revision="fp16",
|
||||
torch_dtype=torch.float16,
|
||||
)
|
||||
pipe = pipe.to("cuda")
|
||||
# or download via git clone https://huggingface.co/CompVis/stable-diffusion-v1-4
|
||||
# and pass `model_id_or_path="./stable-diffusion-v1-4"`.
|
||||
pipe = pipe.to(device)
|
||||
|
||||
prompt = "Face of a yellow cat, high resolution, sitting on a park bench"
|
||||
image = pipe(prompt=prompt, image=init_image, mask_image=mask_image).images[0]
|
||||
prompt = "a cat sitting on a bench"
|
||||
images = pipe(prompt=prompt, init_image=init_image, mask_image=mask_image, strength=0.75).images
|
||||
|
||||
images[0].save("cat_on_bench.png")
|
||||
```
|
||||
|
||||
### Tweak prompts reusing seeds and latents
|
||||
@@ -338,24 +256,7 @@ You can generate your own latents to reproduce results, or tweak your prompt on
|
||||
For more details, check out [the Stable Diffusion notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/stable_diffusion.ipynb) [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/stable_diffusion.ipynb)
|
||||
and have a look into the [release notes](https://github.com/huggingface/diffusers/releases/tag/v0.2.0).
|
||||
|
||||
## Fine-Tuning Stable Diffusion
|
||||
|
||||
Fine-tuning techniques make it possible to adapt Stable Diffusion to your own dataset, or add new subjects to it. These are some of the techniques supported in `diffusers`:
|
||||
|
||||
Textual Inversion is a technique for capturing novel concepts from a small number of example images in a way that can later be used to control text-to-image pipelines. It does so by learning new 'words' in the embedding space of the pipeline's text encoder. These special words can then be used within text prompts to achieve very fine-grained control of the resulting images.
|
||||
|
||||
- Textual Inversion. Capture novel concepts from a small set of sample images, and associate them with new "words" in the embedding space of the text encoder. Please, refer to [our training examples](https://github.com/huggingface/diffusers/tree/main/examples/textual_inversion) or [documentation](https://huggingface.co/docs/diffusers/training/text_inversion) to try for yourself.
|
||||
|
||||
- Dreambooth. Another technique to capture new concepts in Stable Diffusion. This method fine-tunes the UNet (and, optionally, also the text encoder) of the pipeline to achieve impressive results. Please, refer to [our training examples](https://github.com/huggingface/diffusers/tree/main/examples/dreambooth) and [training report](https://wandb.ai/psuraj/dreambooth/reports/Dreambooth-Training-Analysis--VmlldzoyNzk0NDc3) for additional details and training recommendations.
|
||||
|
||||
- Full Stable Diffusion fine-tuning. If you have a more sizable dataset with a specific look or style, you can fine-tune Stable Diffusion so that it outputs images following those examples. This was the approach taken to create [a Pokémon Stable Diffusion model](https://huggingface.co/justinpinkney/pokemon-stable-diffusion) (by Justing Pinkney / Lambda Labs), [a Japanese specific version of Stable Diffusion](https://huggingface.co/spaces/rinna/japanese-stable-diffusion) (by [Rinna Co.](https://github.com/rinnakk/japanese-stable-diffusion/) and others. You can start at [our text-to-image fine-tuning example](https://github.com/huggingface/diffusers/tree/main/examples/text_to_image) and go from there.
|
||||
|
||||
|
||||
## Stable Diffusion Community Pipelines
|
||||
|
||||
The release of Stable Diffusion as an open source model has fostered a lot of interesting ideas and experimentation. Our [Community Examples folder](https://github.com/huggingface/diffusers/tree/main/examples/community) contains many ideas worth exploring, like interpolating to create animated videos, using CLIP Guidance for additional prompt fidelity, term weighting, and much more! Take a look and [contribute your own](https://huggingface.co/docs/diffusers/using-diffusers/custom_pipelines).
|
||||
|
||||
## Other Examples
|
||||
## Examples
|
||||
|
||||
There are many ways to try running Diffusers! Here we outline code-focused tools (primarily using `DiffusionPipeline`s and Google Colab) and interactive web-tools.
|
||||
|
||||
@@ -364,7 +265,7 @@ There are many ways to try running Diffusers! Here we outline code-focused tools
|
||||
If you want to run the code yourself 💻, you can try out:
|
||||
- [Text-to-Image Latent Diffusion](https://huggingface.co/CompVis/ldm-text2im-large-256)
|
||||
```python
|
||||
# !pip install diffusers["torch"] transformers
|
||||
# !pip install diffusers transformers
|
||||
from diffusers import DiffusionPipeline
|
||||
|
||||
device = "cuda"
|
||||
@@ -383,7 +284,7 @@ image.save("squirrel.png")
|
||||
```
|
||||
- [Unconditional Diffusion with discrete scheduler](https://huggingface.co/google/ddpm-celebahq-256)
|
||||
```python
|
||||
# !pip install diffusers["torch"]
|
||||
# !pip install diffusers
|
||||
from diffusers import DDPMPipeline, DDIMPipeline, PNDMPipeline
|
||||
|
||||
model_id = "google/ddpm-celebahq-256"
|
||||
|
||||
@@ -1,42 +0,0 @@
|
||||
FROM ubuntu:20.04
|
||||
LABEL maintainer="Hugging Face"
|
||||
LABEL repository="diffusers"
|
||||
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
|
||||
RUN apt update && \
|
||||
apt install -y bash \
|
||||
build-essential \
|
||||
git \
|
||||
git-lfs \
|
||||
curl \
|
||||
ca-certificates \
|
||||
python3.8 \
|
||||
python3-pip \
|
||||
python3.8-venv && \
|
||||
rm -rf /var/lib/apt/lists
|
||||
|
||||
# make sure to use venv
|
||||
RUN python3 -m venv /opt/venv
|
||||
ENV PATH="/opt/venv/bin:$PATH"
|
||||
|
||||
# pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
|
||||
# follow the instructions here: https://cloud.google.com/tpu/docs/run-in-container#train_a_jax_model_in_a_docker_container
|
||||
RUN python3 -m pip install --no-cache-dir --upgrade pip && \
|
||||
python3 -m pip install --upgrade --no-cache-dir \
|
||||
clu \
|
||||
"jax[cpu]>=0.2.16,!=0.3.2" \
|
||||
"flax>=0.4.1" \
|
||||
"jaxlib>=0.1.65" && \
|
||||
python3 -m pip install --no-cache-dir \
|
||||
accelerate \
|
||||
datasets \
|
||||
hf-doc-builder \
|
||||
huggingface-hub \
|
||||
modelcards \
|
||||
numpy \
|
||||
scipy \
|
||||
tensorboard \
|
||||
transformers
|
||||
|
||||
CMD ["/bin/bash"]
|
||||
@@ -1,44 +0,0 @@
|
||||
FROM ubuntu:20.04
|
||||
LABEL maintainer="Hugging Face"
|
||||
LABEL repository="diffusers"
|
||||
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
|
||||
RUN apt update && \
|
||||
apt install -y bash \
|
||||
build-essential \
|
||||
git \
|
||||
git-lfs \
|
||||
curl \
|
||||
ca-certificates \
|
||||
python3.8 \
|
||||
python3-pip \
|
||||
python3.8-venv && \
|
||||
rm -rf /var/lib/apt/lists
|
||||
|
||||
# make sure to use venv
|
||||
RUN python3 -m venv /opt/venv
|
||||
ENV PATH="/opt/venv/bin:$PATH"
|
||||
|
||||
# pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
|
||||
# follow the instructions here: https://cloud.google.com/tpu/docs/run-in-container#train_a_jax_model_in_a_docker_container
|
||||
RUN python3 -m pip install --no-cache-dir --upgrade pip && \
|
||||
python3 -m pip install --no-cache-dir \
|
||||
"jax[tpu]>=0.2.16,!=0.3.2" \
|
||||
-f https://storage.googleapis.com/jax-releases/libtpu_releases.html && \
|
||||
python3 -m pip install --upgrade --no-cache-dir \
|
||||
clu \
|
||||
"flax>=0.4.1" \
|
||||
"jaxlib>=0.1.65" && \
|
||||
python3 -m pip install --no-cache-dir \
|
||||
accelerate \
|
||||
datasets \
|
||||
hf-doc-builder \
|
||||
huggingface-hub \
|
||||
modelcards \
|
||||
numpy \
|
||||
scipy \
|
||||
tensorboard \
|
||||
transformers
|
||||
|
||||
CMD ["/bin/bash"]
|
||||
@@ -1,42 +0,0 @@
|
||||
FROM ubuntu:20.04
|
||||
LABEL maintainer="Hugging Face"
|
||||
LABEL repository="diffusers"
|
||||
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
|
||||
RUN apt update && \
|
||||
apt install -y bash \
|
||||
build-essential \
|
||||
git \
|
||||
git-lfs \
|
||||
curl \
|
||||
ca-certificates \
|
||||
python3.8 \
|
||||
python3-pip \
|
||||
python3.8-venv && \
|
||||
rm -rf /var/lib/apt/lists
|
||||
|
||||
# make sure to use venv
|
||||
RUN python3 -m venv /opt/venv
|
||||
ENV PATH="/opt/venv/bin:$PATH"
|
||||
|
||||
# pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
|
||||
RUN python3 -m pip install --no-cache-dir --upgrade pip && \
|
||||
python3 -m pip install --no-cache-dir \
|
||||
torch \
|
||||
torchvision \
|
||||
torchaudio \
|
||||
onnxruntime \
|
||||
--extra-index-url https://download.pytorch.org/whl/cpu && \
|
||||
python3 -m pip install --no-cache-dir \
|
||||
accelerate \
|
||||
datasets \
|
||||
hf-doc-builder \
|
||||
huggingface-hub \
|
||||
modelcards \
|
||||
numpy \
|
||||
scipy \
|
||||
tensorboard \
|
||||
transformers
|
||||
|
||||
CMD ["/bin/bash"]
|
||||
@@ -1,42 +0,0 @@
|
||||
FROM nvidia/cuda:11.6.2-cudnn8-devel-ubuntu20.04
|
||||
LABEL maintainer="Hugging Face"
|
||||
LABEL repository="diffusers"
|
||||
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
|
||||
RUN apt update && \
|
||||
apt install -y bash \
|
||||
build-essential \
|
||||
git \
|
||||
git-lfs \
|
||||
curl \
|
||||
ca-certificates \
|
||||
python3.8 \
|
||||
python3-pip \
|
||||
python3.8-venv && \
|
||||
rm -rf /var/lib/apt/lists
|
||||
|
||||
# make sure to use venv
|
||||
RUN python3 -m venv /opt/venv
|
||||
ENV PATH="/opt/venv/bin:$PATH"
|
||||
|
||||
# pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
|
||||
RUN python3 -m pip install --no-cache-dir --upgrade pip && \
|
||||
python3 -m pip install --no-cache-dir \
|
||||
torch \
|
||||
torchvision \
|
||||
torchaudio \
|
||||
"onnxruntime-gpu>=1.13.1" \
|
||||
--extra-index-url https://download.pytorch.org/whl/cu117 && \
|
||||
python3 -m pip install --no-cache-dir \
|
||||
accelerate \
|
||||
datasets \
|
||||
hf-doc-builder \
|
||||
huggingface-hub \
|
||||
modelcards \
|
||||
numpy \
|
||||
scipy \
|
||||
tensorboard \
|
||||
transformers
|
||||
|
||||
CMD ["/bin/bash"]
|
||||
@@ -1,41 +0,0 @@
|
||||
FROM ubuntu:20.04
|
||||
LABEL maintainer="Hugging Face"
|
||||
LABEL repository="diffusers"
|
||||
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
|
||||
RUN apt update && \
|
||||
apt install -y bash \
|
||||
build-essential \
|
||||
git \
|
||||
git-lfs \
|
||||
curl \
|
||||
ca-certificates \
|
||||
python3.8 \
|
||||
python3-pip \
|
||||
python3.8-venv && \
|
||||
rm -rf /var/lib/apt/lists
|
||||
|
||||
# make sure to use venv
|
||||
RUN python3 -m venv /opt/venv
|
||||
ENV PATH="/opt/venv/bin:$PATH"
|
||||
|
||||
# pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
|
||||
RUN python3 -m pip install --no-cache-dir --upgrade pip && \
|
||||
python3 -m pip install --no-cache-dir \
|
||||
torch \
|
||||
torchvision \
|
||||
torchaudio \
|
||||
--extra-index-url https://download.pytorch.org/whl/cpu && \
|
||||
python3 -m pip install --no-cache-dir \
|
||||
accelerate \
|
||||
datasets \
|
||||
hf-doc-builder \
|
||||
huggingface-hub \
|
||||
modelcards \
|
||||
numpy \
|
||||
scipy \
|
||||
tensorboard \
|
||||
transformers
|
||||
|
||||
CMD ["/bin/bash"]
|
||||
@@ -1,41 +0,0 @@
|
||||
FROM nvidia/cuda:11.7.1-cudnn8-runtime-ubuntu20.04
|
||||
LABEL maintainer="Hugging Face"
|
||||
LABEL repository="diffusers"
|
||||
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
|
||||
RUN apt update && \
|
||||
apt install -y bash \
|
||||
build-essential \
|
||||
git \
|
||||
git-lfs \
|
||||
curl \
|
||||
ca-certificates \
|
||||
python3.8 \
|
||||
python3-pip \
|
||||
python3.8-venv && \
|
||||
rm -rf /var/lib/apt/lists
|
||||
|
||||
# make sure to use venv
|
||||
RUN python3 -m venv /opt/venv
|
||||
ENV PATH="/opt/venv/bin:$PATH"
|
||||
|
||||
# pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
|
||||
RUN python3 -m pip install --no-cache-dir --upgrade pip && \
|
||||
python3 -m pip install --no-cache-dir \
|
||||
torch \
|
||||
torchvision \
|
||||
torchaudio \
|
||||
--extra-index-url https://download.pytorch.org/whl/cu117 && \
|
||||
python3 -m pip install --no-cache-dir \
|
||||
accelerate \
|
||||
datasets \
|
||||
hf-doc-builder \
|
||||
huggingface-hub \
|
||||
modelcards \
|
||||
numpy \
|
||||
scipy \
|
||||
tensorboard \
|
||||
transformers
|
||||
|
||||
CMD ["/bin/bash"]
|
||||
@@ -12,9 +12,9 @@
|
||||
title: "Loading Pipelines, Models, and Schedulers"
|
||||
- local: using-diffusers/configuration
|
||||
title: "Configuring Pipelines, Models, and Schedulers"
|
||||
- local: using-diffusers/custom_pipeline_overview
|
||||
title: "Loading and Adding Custom Pipelines"
|
||||
title: "Loading & Hub"
|
||||
- local: using-diffusers/custom_pipelines
|
||||
title: "Loading and Creating Custom Pipelines"
|
||||
title: "Loading"
|
||||
- sections:
|
||||
- local: using-diffusers/unconditional_image_generation
|
||||
title: "Unconditional Image Generation"
|
||||
@@ -24,10 +24,8 @@
|
||||
title: "Text-Guided Image-to-Image"
|
||||
- local: using-diffusers/inpaint
|
||||
title: "Text-Guided Image-Inpainting"
|
||||
- local: using-diffusers/custom_pipeline_examples
|
||||
title: "Community Pipelines"
|
||||
- local: using-diffusers/contribute_pipeline
|
||||
title: "How to contribute a Pipeline"
|
||||
- local: using-diffusers/custom
|
||||
title: "Create a custom pipeline"
|
||||
title: "Pipelines for Inference"
|
||||
title: "Using Diffusers"
|
||||
- sections:
|
||||
@@ -36,7 +34,7 @@
|
||||
- local: optimization/onnx
|
||||
title: "ONNX"
|
||||
- local: optimization/open_vino
|
||||
title: "OpenVINO"
|
||||
title: "Open Vino"
|
||||
- local: optimization/mps
|
||||
title: "MPS"
|
||||
title: "Optimization/Special Hardware"
|
||||
@@ -46,11 +44,9 @@
|
||||
- local: training/unconditional_training
|
||||
title: "Unconditional Image Generation"
|
||||
- local: training/text_inversion
|
||||
title: "Textual Inversion"
|
||||
- local: training/dreambooth
|
||||
title: "Dreambooth"
|
||||
title: "Text Inversion"
|
||||
- local: training/text2image
|
||||
title: "Text-to-image fine-tuning"
|
||||
title: "Text-to-image"
|
||||
title: "Training"
|
||||
- sections:
|
||||
- local: conceptual/stable_diffusion
|
||||
@@ -94,11 +90,5 @@
|
||||
title: "Stable Diffusion"
|
||||
- local: api/pipelines/stochastic_karras_ve
|
||||
title: "Stochastic Karras VE"
|
||||
- local: api/pipelines/dance_diffusion
|
||||
title: "Dance Diffusion"
|
||||
- local: api/pipelines/vq_diffusion
|
||||
title: "VQ Diffusion"
|
||||
- local: api/pipelines/repaint
|
||||
title: "RePaint"
|
||||
title: "Pipelines"
|
||||
title: "API"
|
||||
|
||||
@@ -32,9 +32,6 @@ Any pipeline object can be saved locally with [`~DiffusionPipeline.save_pretrain
|
||||
[[autodoc]] DiffusionPipeline
|
||||
- from_pretrained
|
||||
- save_pretrained
|
||||
- to
|
||||
- device
|
||||
- components
|
||||
|
||||
## ImagePipelineOutput
|
||||
By default diffusion pipelines return an object of class
|
||||
|
||||
@@ -22,9 +22,6 @@ The models are built on the base class ['ModelMixin'] that is a `torch.nn.module
|
||||
## UNet2DOutput
|
||||
[[autodoc]] models.unet_2d.UNet2DOutput
|
||||
|
||||
## UNet1DModel
|
||||
[[autodoc]] UNet1DModel
|
||||
|
||||
## UNet2DModel
|
||||
[[autodoc]] UNet2DModel
|
||||
|
||||
@@ -49,12 +46,6 @@ The models are built on the base class ['ModelMixin'] that is a `torch.nn.module
|
||||
## AutoencoderKL
|
||||
[[autodoc]] AutoencoderKL
|
||||
|
||||
## Transformer2DModel
|
||||
[[autodoc]] Transformer2DModel
|
||||
|
||||
## Transformer2DModelOutput
|
||||
[[autodoc]] models.attention.Transformer2DModelOutput
|
||||
|
||||
## FlaxModelMixin
|
||||
[[autodoc]] FlaxModelMixin
|
||||
|
||||
|
||||
@@ -1,33 +0,0 @@
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Dance Diffusion
|
||||
|
||||
## Overview
|
||||
|
||||
[Dance Diffusion](https://github.com/Harmonai-org/sample-generator) by Zach Evans.
|
||||
|
||||
Dance Diffusion is the first in a suite of generative audio tools for producers and musicians to be released by Harmonai.
|
||||
For more info or to get involved in the development of these tools, please visit https://harmonai.org and fill out the form on the front page.
|
||||
|
||||
The original codebase of this implementation can be found [here](https://github.com/Harmonai-org/sample-generator).
|
||||
|
||||
## Available Pipelines:
|
||||
|
||||
| Pipeline | Tasks | Colab
|
||||
|---|---|:---:|
|
||||
| [pipeline_dance_diffusion.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py) | *Unconditional Audio Generation* | - |
|
||||
|
||||
|
||||
## DanceDiffusionPipeline
|
||||
[[autodoc]] DanceDiffusionPipeline
|
||||
- __call__
|
||||
@@ -1,15 +1,3 @@
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# DDIM
|
||||
|
||||
## Overview
|
||||
|
||||
@@ -1,15 +1,3 @@
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# DDPM
|
||||
|
||||
## Overview
|
||||
|
||||
@@ -1,15 +1,3 @@
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Latent Diffusion
|
||||
|
||||
## Overview
|
||||
|
||||
@@ -1,15 +1,3 @@
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Unconditional Latent Diffusion
|
||||
|
||||
## Overview
|
||||
|
||||
@@ -28,7 +28,7 @@ or created independently from each other.
|
||||
|
||||
To that end, we strive to offer all open-sourced, state-of-the-art diffusion system under a unified API.
|
||||
More specifically, we strive to provide pipelines that
|
||||
- 1. can load the officially published weights and yield 1-to-1 the same outputs as the original implementation according to the corresponding paper (*e.g.* [LDMTextToImagePipeline](https://github.com/huggingface/diffusers/tree/main/src/diffusers/pipelines/latent_diffusion), uses the officially released weights of [High-Resolution Image Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752)),
|
||||
- 1. can load the officially published weights and yield 1-to-1 the same outputs as the original implementation according to the corresponding paper (*e.g.* [LatentDiffusionPipeline](https://github.com/huggingface/diffusers/tree/main/src/diffusers/pipelines/latent_diffusion), uses the officially released weights of [High-Resolution Image Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752)),
|
||||
- 2. have a simple user interface to run the model in inference (see the [Pipelines API](#pipelines-api) section),
|
||||
- 3. are easy to understand with code that is self-explanatory and can be read along-side the official paper (see [Pipelines summary](#pipelines-summary)),
|
||||
- 4. can easily be contributed by the community (see the [Contribution](#contribution) section).
|
||||
@@ -54,8 +54,6 @@ available a colab notebook to directly try them out.
|
||||
| [stable_diffusion](./stable_diffusion) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Image-to-Image Text-Guided Generation | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/image_2_image_using_diffusers.ipynb)
|
||||
| [stable_diffusion](./stable_diffusion) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Text-Guided Image Inpainting | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/in_painting_with_stable_diffusion_using_diffusers.ipynb)
|
||||
| [stochastic_karras_ve](./stochastic_karras_ve) | [**Elucidating the Design Space of Diffusion-Based Generative Models**](https://arxiv.org/abs/2206.00364) | Unconditional Image Generation |
|
||||
| [vq_diffusion](./vq_diffusion) | [**Vector Quantized Diffusion Model for Text-to-Image Synthesis**](https://arxiv.org/abs/2111.14822) | Text-to-Image Generation |
|
||||
| [repaint](./repaint) | [**RePaint: Inpainting using Denoising Diffusion Probabilistic Models**](https://arxiv.org/abs/2201.09865) | Image Inpainting |
|
||||
|
||||
**Note**: Pipelines are simple examples of how to play around with the diffusion systems as described in the corresponding papers.
|
||||
|
||||
@@ -69,8 +67,8 @@ Diffusion models often consist of multiple independently-trained models or other
|
||||
Each model has been trained independently on a different task and the scheduler can easily be swapped out and replaced with a different one.
|
||||
During inference, we however want to be able to easily load all components and use them in inference - even if one component, *e.g.* CLIP's text encoder, originates from a different library, such as [Transformers](https://github.com/huggingface/transformers). To that end, all pipelines provide the following functionality:
|
||||
|
||||
- [`from_pretrained` method](../diffusion_pipeline) that accepts a Hugging Face Hub repository id, *e.g.* [runwayml/stable-diffusion-v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5) or a path to a local directory, *e.g.*
|
||||
"./stable-diffusion". To correctly retrieve which models and components should be loaded, one has to provide a `model_index.json` file, *e.g.* [runwayml/stable-diffusion-v1-5/model_index.json](https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/model_index.json), which defines all components that should be
|
||||
- [`from_pretrained` method](../diffusion_pipeline) that accepts a Hugging Face Hub repository id, *e.g.* [CompVis/stable-diffusion-v1-4](https://huggingface.co/CompVis/stable-diffusion-v1-4) or a path to a local directory, *e.g.*
|
||||
"./stable-diffusion". To correctly retrieve which models and components should be loaded, one has to provide a `model_index.json` file, *e.g.* [CompVis/stable-diffusion-v1-4/model_index.json](https://huggingface.co/CompVis/stable-diffusion-v1-4/blob/main/model_index.json), which defines all components that should be
|
||||
loaded into the pipelines. More specifically, for each model/component one needs to define the format `<name>: ["<library>", "<class name>"]`. `<name>` is the attribute name given to the loaded instance of `<class name>` which can be found in the library or pipeline folder called `"<library>"`.
|
||||
- [`save_pretrained`](../diffusion_pipeline) that accepts a local path, *e.g.* `./stable-diffusion` under which all models/components of the pipeline will be saved. For each component/model a folder is created inside the local path that is named after the given attribute name, *e.g.* `./stable_diffusion/unet`.
|
||||
In addition, a `model_index.json` file is created at the root of the local path, *e.g.* `./stable_diffusion/model_index.json` so that the complete pipeline can again be instantiated
|
||||
@@ -102,7 +100,7 @@ logic including pre-processing, an unrolled diffusion loop, and post-processing
|
||||
# make sure you're logged in with `huggingface-cli login`
|
||||
from diffusers import StableDiffusionPipeline, LMSDiscreteScheduler
|
||||
|
||||
pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
|
||||
pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
|
||||
pipe = pipe.to("cuda")
|
||||
|
||||
prompt = "a photo of an astronaut riding a horse on mars"
|
||||
@@ -125,7 +123,7 @@ from diffusers import StableDiffusionImg2ImgPipeline
|
||||
# load the pipeline
|
||||
device = "cuda"
|
||||
pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
|
||||
"runwayml/stable-diffusion-v1-5", revision="fp16", torch_dtype=torch.float16
|
||||
"CompVis/stable-diffusion-v1-4", revision="fp16", torch_dtype=torch.float16
|
||||
).to(device)
|
||||
|
||||
# let's download an initial image
|
||||
@@ -153,11 +151,11 @@ You can generate your own latents to reproduce results, or tweak your prompt on
|
||||
The `StableDiffusionInpaintPipeline` lets you edit specific parts of an image by providing a mask and text prompt.
|
||||
|
||||
```python
|
||||
import PIL
|
||||
import requests
|
||||
import torch
|
||||
from io import BytesIO
|
||||
|
||||
import requests
|
||||
import PIL
|
||||
|
||||
from diffusers import StableDiffusionInpaintPipeline
|
||||
|
||||
|
||||
@@ -172,15 +170,15 @@ mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data
|
||||
init_image = download_image(img_url).resize((512, 512))
|
||||
mask_image = download_image(mask_url).resize((512, 512))
|
||||
|
||||
device = "cuda"
|
||||
pipe = StableDiffusionInpaintPipeline.from_pretrained(
|
||||
"runwayml/stable-diffusion-inpainting",
|
||||
revision="fp16",
|
||||
torch_dtype=torch.float16,
|
||||
)
|
||||
pipe = pipe.to("cuda")
|
||||
"CompVis/stable-diffusion-v1-4", revision="fp16", torch_dtype=torch.float16
|
||||
).to(device)
|
||||
|
||||
prompt = "Face of a yellow cat, high resolution, sitting on a park bench"
|
||||
image = pipe(prompt=prompt, image=init_image, mask_image=mask_image).images[0]
|
||||
prompt = "a cat sitting on a bench"
|
||||
images = pipe(prompt=prompt, init_image=init_image, mask_image=mask_image, strength=0.75).images
|
||||
|
||||
images[0].save("cat_on_bench.png")
|
||||
```
|
||||
|
||||
You can also run this example on colab [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/in_painting_with_stable_diffusion_using_diffusers.ipynb)
|
||||
|
||||
@@ -1,15 +1,3 @@
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# PNDM
|
||||
|
||||
## Overview
|
||||
|
||||
@@ -1,77 +0,0 @@
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# RePaint
|
||||
|
||||
## Overview
|
||||
|
||||
[RePaint: Inpainting using Denoising Diffusion Probabilistic Models](https://arxiv.org/abs/2201.09865) (PNDM) by Andreas Lugmayr, Martin Danelljan, Andres Romero, Fisher Yu, Radu Timofte, Luc Van Gool.
|
||||
|
||||
The abstract of the paper is the following:
|
||||
|
||||
Free-form inpainting is the task of adding new content to an image in the regions specified by an arbitrary binary mask. Most existing approaches train for a certain distribution of masks, which limits their generalization capabilities to unseen mask types. Furthermore, training with pixel-wise and perceptual losses often leads to simple textural extensions towards the missing areas instead of semantically meaningful generation. In this work, we propose RePaint: A Denoising Diffusion Probabilistic Model (DDPM) based inpainting approach that is applicable to even extreme masks. We employ a pretrained unconditional DDPM as the generative prior. To condition the generation process, we only alter the reverse diffusion iterations by sampling the unmasked regions using the given image information. Since this technique does not modify or condition the original DDPM network itself, the model produces high-quality and diverse output images for any inpainting form. We validate our method for both faces and general-purpose image inpainting using standard and extreme masks.
|
||||
RePaint outperforms state-of-the-art Autoregressive, and GAN approaches for at least five out of six mask distributions.
|
||||
|
||||
The original codebase can be found [here](https://github.com/andreas128/RePaint).
|
||||
|
||||
## Available Pipelines:
|
||||
|
||||
| Pipeline | Tasks | Colab
|
||||
|-------------------------------------------------------------------------------------------------------------------------------|--------------------|:---:|
|
||||
| [pipeline_repaint.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/repaint/pipeline_repaint.py) | *Image Inpainting* | - |
|
||||
|
||||
## Usage example
|
||||
|
||||
```python
|
||||
from io import BytesIO
|
||||
|
||||
import torch
|
||||
|
||||
import PIL
|
||||
import requests
|
||||
from diffusers import RePaintPipeline, RePaintScheduler
|
||||
|
||||
|
||||
def download_image(url):
|
||||
response = requests.get(url)
|
||||
return PIL.Image.open(BytesIO(response.content)).convert("RGB")
|
||||
|
||||
|
||||
img_url = "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/repaint/celeba_hq_256.png"
|
||||
mask_url = "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/repaint/mask_256.png"
|
||||
|
||||
# Load the original image and the mask as PIL images
|
||||
original_image = download_image(img_url).resize((256, 256))
|
||||
mask_image = download_image(mask_url).resize((256, 256))
|
||||
|
||||
# Load the RePaint scheduler and pipeline based on a pretrained DDPM model
|
||||
scheduler = RePaintScheduler.from_config("google/ddpm-ema-celebahq-256")
|
||||
pipe = RePaintPipeline.from_pretrained("google/ddpm-ema-celebahq-256", scheduler=scheduler)
|
||||
pipe = pipe.to("cuda")
|
||||
|
||||
generator = torch.Generator(device="cuda").manual_seed(0)
|
||||
output = pipe(
|
||||
original_image=original_image,
|
||||
mask_image=mask_image,
|
||||
num_inference_steps=250,
|
||||
eta=0.0,
|
||||
jump_length=10,
|
||||
jump_n_sample=10,
|
||||
generator=generator,
|
||||
)
|
||||
inpainted_image = output.images[0]
|
||||
```
|
||||
|
||||
## RePaintPipeline
|
||||
[[autodoc]] pipelines.repaint.pipeline_repaint.RePaintPipeline
|
||||
- __call__
|
||||
|
||||
@@ -1,15 +1,3 @@
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Score SDE VE
|
||||
|
||||
## Overview
|
||||
|
||||
@@ -1,15 +1,3 @@
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Stable diffusion pipelines
|
||||
|
||||
Stable Diffusion is a text-to-image _latent diffusion_ model created by the researchers and engineers from [CompVis](https://github.com/CompVis), [Stability AI](https://stability.ai/) and [LAION](https://laion.ai/). It's trained on 512x512 images from a subset of the [LAION-5B](https://laion.ai/blog/laion-5b/) dataset. This model uses a frozen CLIP ViT-L/14 text encoder to condition the model on text prompts. With its 860M UNet and 123M text encoder, the model is relatively lightweight and can run on consumer GPUs.
|
||||
@@ -29,41 +17,6 @@ For more details about how Stable Diffusion works and how it differs from the ba
|
||||
| [pipeline_stable_diffusion_img2img.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py) | *Image-to-Image Text-Guided Generation* | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/image_2_image_using_diffusers.ipynb) | [🤗 Diffuse the Rest](https://huggingface.co/spaces/huggingface/diffuse-the-rest)
|
||||
| [pipeline_stable_diffusion_inpaint.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py) | **Experimental** – *Text-Guided Image Inpainting* | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/in_painting_with_stable_diffusion_using_diffusers.ipynb) | Coming soon
|
||||
|
||||
## Tips
|
||||
|
||||
### How to load and use different schedulers.
|
||||
|
||||
The stable diffusion pipeline uses [`PNDMScheduler`] scheduler by default. But `diffusers` provides many other schedulers that can be used with the stable diffusion pipeline such as [`DDIMScheduler`], [`LMSDiscreteScheduler`], [`EulerDiscreteScheduler`], [`EulerAncestralDiscreteScheduler`] etc.
|
||||
To use a different scheduler, you can pass the `scheduler` argument to `from_pretrained` method of the pipeline. For example, to use the [`EulerDiscreteScheduler`], you can do the following:
|
||||
|
||||
```python
|
||||
from diffusers import StableDiffusionPipeline, EulerDiscreteScheduler
|
||||
|
||||
euler_scheduler = EulerDiscreteScheduler.from_config("CompVis/stable-diffusion-v1-4", subfolder="scheduler")
|
||||
pipeline = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", scheduler=euler_scheduler)
|
||||
```
|
||||
|
||||
|
||||
### How to conver all use cases with multiple or single pipeline
|
||||
|
||||
If you want to use all possible use cases in a single `DiffusionPipeline` you can either:
|
||||
- Make use of the [Stable Diffusion Mega Pipeline](https://github.com/huggingface/diffusers/tree/main/examples/community#stable-diffusion-mega) or
|
||||
- Make use of the `components` functionality to instantiate all components in the most memory-efficient way:
|
||||
|
||||
```python
|
||||
>>> from diffusers import (
|
||||
... StableDiffusionPipeline,
|
||||
... StableDiffusionImg2ImgPipeline,
|
||||
... StableDiffusionInpaintPipeline,
|
||||
... )
|
||||
|
||||
>>> img2text = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
|
||||
>>> img2img = StableDiffusionImg2ImgPipeline(**img2text.components)
|
||||
>>> inpaint = StableDiffusionInpaintPipeline(**img2text.components)
|
||||
|
||||
>>> # now you can use img2text(...), img2img(...), inpaint(...) just like the call methods of each respective pipeline
|
||||
```
|
||||
|
||||
## StableDiffusionPipelineOutput
|
||||
[[autodoc]] pipelines.stable_diffusion.StableDiffusionPipelineOutput
|
||||
|
||||
|
||||
@@ -1,15 +1,3 @@
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Stochastic Karras VE
|
||||
|
||||
## Overview
|
||||
|
||||
@@ -1,34 +0,0 @@
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# VQDiffusion
|
||||
|
||||
## Overview
|
||||
|
||||
[Vector Quantized Diffusion Model for Text-to-Image Synthesis](https://arxiv.org/abs/2111.14822) by Shuyang Gu, Dong Chen, Jianmin Bao, Fang Wen, Bo Zhang, Dongdong Chen, Lu Yuan, Baining Guo
|
||||
|
||||
The abstract of the paper is the following:
|
||||
|
||||
We present the vector quantized diffusion (VQ-Diffusion) model for text-to-image generation. This method is based on a vector quantized variational autoencoder (VQ-VAE) whose latent space is modeled by a conditional variant of the recently developed Denoising Diffusion Probabilistic Model (DDPM). We find that this latent-space method is well-suited for text-to-image generation tasks because it not only eliminates the unidirectional bias with existing methods but also allows us to incorporate a mask-and-replace diffusion strategy to avoid the accumulation of errors, which is a serious problem with existing methods. Our experiments show that the VQ-Diffusion produces significantly better text-to-image generation results when compared with conventional autoregressive (AR) models with similar numbers of parameters. Compared with previous GAN-based text-to-image methods, our VQ-Diffusion can handle more complex scenes and improve the synthesized image quality by a large margin. Finally, we show that the image generation computation in our method can be made highly efficient by reparameterization. With traditional AR methods, the text-to-image generation time increases linearly with the output image resolution and hence is quite time consuming even for normal size images. The VQ-Diffusion allows us to achieve a better trade-off between quality and speed. Our experiments indicate that the VQ-Diffusion model with the reparameterization is fifteen times faster than traditional AR methods while achieving a better image quality.
|
||||
|
||||
The original codebase can be found [here](https://github.com/microsoft/VQ-Diffusion).
|
||||
|
||||
## Available Pipelines:
|
||||
|
||||
| Pipeline | Tasks | Colab
|
||||
|---|---|:---:|
|
||||
| [pipeline_vq_diffusion.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/vq_diffusion/pipeline_vq_diffusion.py) | *Text-to-Image Generation* | - |
|
||||
|
||||
|
||||
## VQDiffusionPipeline
|
||||
[[autodoc]] pipelines.vq_diffusion.pipeline_vq_diffusion.VQDiffusionPipeline
|
||||
- __call__
|
||||
@@ -89,19 +89,13 @@ Original implementation can be found [here](https://github.com/crowsonkb/k-diffu
|
||||
|
||||
[[autodoc]] PNDMScheduler
|
||||
|
||||
#### variance exploding stochastic differential equation (VE-SDE) scheduler
|
||||
#### variance exploding stochastic differential equation (SDE) scheduler
|
||||
|
||||
Original paper can be found [here](https://arxiv.org/abs/2011.13456).
|
||||
|
||||
[[autodoc]] ScoreSdeVeScheduler
|
||||
|
||||
#### improved pseudo numerical methods for diffusion models (iPNDM)
|
||||
|
||||
Original implementation can be found [here](https://github.com/crowsonkb/v-diffusion-pytorch/blob/987f8985e38208345c1959b0ea767a625831cc9b/diffusion/sampling.py#L296).
|
||||
|
||||
[[autodoc]] IPNDMScheduler
|
||||
|
||||
#### variance preserving stochastic differential equation (VP-SDE) scheduler
|
||||
#### variance preserving stochastic differential equation (SDE) scheduler
|
||||
|
||||
Original paper can be found [here](https://arxiv.org/abs/2011.13456).
|
||||
|
||||
@@ -112,34 +106,3 @@ Score SDE-VP is under construction.
|
||||
</Tip>
|
||||
|
||||
[[autodoc]] schedulers.scheduling_sde_vp.ScoreSdeVpScheduler
|
||||
|
||||
#### Euler scheduler
|
||||
|
||||
Euler scheduler (Algorithm 2) from the paper [Elucidating the Design Space of Diffusion-Based Generative Models](https://arxiv.org/abs/2206.00364) by Karras et al. (2022). Based on the original [k-diffusion](https://github.com/crowsonkb/k-diffusion/blob/481677d114f6ea445aa009cf5bd7a9cdee909e47/k_diffusion/sampling.py#L51) implementation by Katherine Crowson.
|
||||
Fast scheduler which often times generates good outputs with 20-30 steps.
|
||||
|
||||
[[autodoc]] EulerDiscreteScheduler
|
||||
|
||||
|
||||
#### Euler Ancestral scheduler
|
||||
|
||||
Ancestral sampling with Euler method steps. Based on the original (k-diffusion)[https://github.com/crowsonkb/k-diffusion/blob/481677d114f6ea445aa009cf5bd7a9cdee909e47/k_diffusion/sampling.py#L72] implementation by Katherine Crowson.
|
||||
Fast scheduler which often times generates good outputs with 20-30 steps.
|
||||
|
||||
[[autodoc]] EulerAncestralDiscreteScheduler
|
||||
|
||||
|
||||
#### VQDiffusionScheduler
|
||||
|
||||
Original paper can be found [here](https://arxiv.org/abs/2111.14822)
|
||||
|
||||
[[autodoc]] VQDiffusionScheduler
|
||||
|
||||
#### RePaint scheduler
|
||||
|
||||
DDPM-based inpainting scheduler for unsupervised inpainting with extreme masks.
|
||||
Intended for use with [`RePaintPipeline`].
|
||||
Based on the paper [RePaint: Inpainting using Denoising Diffusion Probabilistic Models](https://arxiv.org/abs/2201.09865)
|
||||
and the original implementation by Andreas Lugmayr et al.: https://github.com/andreas128/RePaint
|
||||
|
||||
[[autodoc]] RePaintScheduler
|
||||
|
||||
@@ -12,4 +12,6 @@ specific language governing permissions and limitations under the License.
|
||||
|
||||
# Stable Diffusion
|
||||
|
||||
Please visit this [very in-detail blog post](https://huggingface.co/blog/stable_diffusion) on Stable Diffusion!
|
||||
Under construction 🚧
|
||||
|
||||
For now please visit this [very in-detail blog post](https://huggingface.co/blog/stable_diffusion)
|
||||
|
||||
@@ -34,7 +34,6 @@ available a colab notebook to directly try them out.
|
||||
|
||||
| Pipeline | Paper | Tasks | Colab
|
||||
|---|---|:---:|:---:|
|
||||
| [dance_diffusion](./api/pipelines/dance_diffusion) | [**Dance Diffusion**](https://github.com/williamberman/diffusers.git) | Unconditional Audio Generation |
|
||||
| [ddpm](./api/pipelines/ddpm) | [**Denoising Diffusion Probabilistic Models**](https://arxiv.org/abs/2006.11239) | Unconditional Image Generation |
|
||||
| [ddim](./api/pipelines/ddim) | [**Denoising Diffusion Implicit Models**](https://arxiv.org/abs/2010.02502) | Unconditional Image Generation |
|
||||
| [latent_diffusion](./api/pipelines/latent_diffusion) | [**High-Resolution Image Synthesis with Latent Diffusion Models**](https://arxiv.org/abs/2112.10752)| Text-to-Image Generation |
|
||||
@@ -46,6 +45,5 @@ available a colab notebook to directly try them out.
|
||||
| [stable_diffusion](./api/pipelines/stable_diffusion) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Image-to-Image Text-Guided Generation | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/image_2_image_using_diffusers.ipynb)
|
||||
| [stable_diffusion](./api/pipelines/stable_diffusion) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Text-Guided Image Inpainting | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/in_painting_with_stable_diffusion_using_diffusers.ipynb)
|
||||
| [stochastic_karras_ve](./api/pipelines/stochastic_karras_ve) | [**Elucidating the Design Space of Diffusion-Based Generative Models**](https://arxiv.org/abs/2206.00364) | Unconditional Image Generation |
|
||||
| [vq_diffusion](./api/pipelines/vq_diffusion) | [Vector Quantized Diffusion Model for Text-to-Image Synthesis](https://arxiv.org/abs/2111.14822) | Text-to-Image Generation |
|
||||
|
||||
**Note**: Pipelines are simple examples of how to play around with the diffusion systems as described in the corresponding papers.
|
||||
|
||||
@@ -12,12 +12,9 @@ specific language governing permissions and limitations under the License.
|
||||
|
||||
# Installation
|
||||
|
||||
Install 🤗 Diffusers for whichever deep learning library you’re working with.
|
||||
Install Diffusers for with PyTorch. Support for other libraries will come in the future
|
||||
|
||||
🤗 Diffusers is tested on Python 3.7+, PyTorch 1.7.0+ and flax. Follow the installation instructions below for the deep learning library you are using:
|
||||
|
||||
- [PyTorch](https://pytorch.org/get-started/locally/) installation instructions.
|
||||
- [Flax](https://flax.readthedocs.io/en/latest/) installation instructions.
|
||||
🤗 Diffusers is tested on Python 3.7+, and PyTorch 1.7.0+.
|
||||
|
||||
## Install with pip
|
||||
|
||||
@@ -39,30 +36,12 @@ source .env/bin/activate
|
||||
|
||||
Now you're ready to install 🤗 Diffusers with the following command:
|
||||
|
||||
**For PyTorch**
|
||||
|
||||
```bash
|
||||
pip install diffusers["torch"]
|
||||
```
|
||||
|
||||
**For Flax**
|
||||
|
||||
```bash
|
||||
pip install diffusers["flax"]
|
||||
pip install diffusers
|
||||
```
|
||||
|
||||
## Install from source
|
||||
|
||||
Before intsalling `diffusers` from source, make sure you have `torch` and `accelerate` installed.
|
||||
|
||||
For `torch` installation refer to the `torch` [docs](https://pytorch.org/get-started/locally/#start-locally).
|
||||
|
||||
To install `accelerate`
|
||||
|
||||
```bash
|
||||
pip install accelerate
|
||||
```
|
||||
|
||||
Install 🤗 Diffusers from source with the following command:
|
||||
|
||||
```bash
|
||||
@@ -74,7 +53,7 @@ The `main` version is useful for staying up-to-date with the latest developments
|
||||
For instance, if a bug has been fixed since the last official release but a new release hasn't been rolled out yet.
|
||||
However, this means the `main` version may not always be stable.
|
||||
We strive to keep the `main` version operational, and most issues are usually resolved within a few hours or a day.
|
||||
If you run into a problem, please open an [Issue](https://github.com/huggingface/transformers/issues), so we can fix it even sooner!
|
||||
If you run into a problem, please open an [Issue](https://github.com/huggingface/transformers/issues) so we can fix it even sooner!
|
||||
|
||||
## Editable install
|
||||
|
||||
@@ -88,18 +67,7 @@ Clone the repository and install 🤗 Diffusers with the following commands:
|
||||
```bash
|
||||
git clone https://github.com/huggingface/diffusers.git
|
||||
cd diffusers
|
||||
```
|
||||
|
||||
**For PyTorch**
|
||||
|
||||
```
|
||||
pip install -e ".[torch]"
|
||||
```
|
||||
|
||||
**For Flax**
|
||||
|
||||
```
|
||||
pip install -e ".[flax]"
|
||||
pip install -e .
|
||||
```
|
||||
|
||||
These commands will link the folder you cloned the repository to and your Python library paths.
|
||||
|
||||
@@ -14,21 +14,17 @@ specific language governing permissions and limitations under the License.
|
||||
|
||||
We present some techniques and ideas to optimize 🤗 Diffusers _inference_ for memory or speed.
|
||||
|
||||
|
||||
| | Latency | Speedup |
|
||||
| ---------------- | ------- | ------- |
|
||||
|------------------|---------|---------|
|
||||
| original | 9.50s | x1 |
|
||||
| cuDNN auto-tuner | 9.37s | x1.01 |
|
||||
| autocast (fp16) | 5.47s | x1.74 |
|
||||
| fp16 | 3.61s | x2.63 |
|
||||
| channels last | 3.30s | x2.88 |
|
||||
| autocast (fp16) | 5.47s | x1.91 |
|
||||
| fp16 | 3.61s | x2.91 |
|
||||
| channels last | 3.30s | x2.87 |
|
||||
| traced UNet | 3.21s | x2.96 |
|
||||
| memory efficient attention | 2.63s | x3.61 |
|
||||
|
||||
<em>
|
||||
obtained on NVIDIA TITAN RTX by generating a single image of size 512x512 from
|
||||
the prompt "a photo of an astronaut riding a horse on mars" with 50 DDIM
|
||||
steps.
|
||||
</em>
|
||||
<em>obtained on NVIDIA TITAN RTX by generating a single image of size 512x512 from the prompt "a photo of an astronaut riding a horse on mars" with 50 DDIM steps.</em>
|
||||
|
||||
## Enable cuDNN auto-tuner
|
||||
|
||||
@@ -60,7 +56,7 @@ If you use a CUDA GPU, you can take advantage of `torch.autocast` to perform inf
|
||||
from torch import autocast
|
||||
from diffusers import StableDiffusionPipeline
|
||||
|
||||
pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
|
||||
pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
|
||||
pipe = pipe.to("cuda")
|
||||
|
||||
prompt = "a photo of an astronaut riding a horse on mars"
|
||||
@@ -76,7 +72,7 @@ To save more GPU memory and get even more speed, you can load and run the model
|
||||
|
||||
```Python
|
||||
pipe = StableDiffusionPipeline.from_pretrained(
|
||||
"runwayml/stable-diffusion-v1-5",
|
||||
"CompVis/stable-diffusion-v1-4",
|
||||
revision="fp16",
|
||||
torch_dtype=torch.float16,
|
||||
)
|
||||
@@ -91,10 +87,7 @@ image = pipe(prompt).images[0]
|
||||
For even additional memory savings, you can use a sliced version of attention that performs the computation in steps instead of all at once.
|
||||
|
||||
<Tip>
|
||||
Attention slicing is useful even if a batch size of just 1 is used - as long
|
||||
as the model uses more than one attention head. If there is more than one
|
||||
attention head the *QK^T* attention matrix can be computed sequentially for
|
||||
each head which can save a significant amount of memory.
|
||||
Attention slicing is useful even if a batch size of just 1 is used - as long as the model uses more than one attention head. If there is more than one attention head the *QK^T* attention matrix can be computed sequentially for each head which can save a significant amount of memory.
|
||||
</Tip>
|
||||
|
||||
To perform the attention computation sequentially over each head, you only need to invoke [`~StableDiffusionPipeline.enable_attention_slicing`] in your pipeline before inference, like here:
|
||||
@@ -104,7 +97,7 @@ import torch
|
||||
from diffusers import StableDiffusionPipeline
|
||||
|
||||
pipe = StableDiffusionPipeline.from_pretrained(
|
||||
"runwayml/stable-diffusion-v1-5",
|
||||
"CompVis/stable-diffusion-v1-4",
|
||||
revision="fp16",
|
||||
torch_dtype=torch.float16,
|
||||
)
|
||||
@@ -117,50 +110,6 @@ image = pipe(prompt).images[0]
|
||||
|
||||
There's a small performance penalty of about 10% slower inference times, but this method allows you to use Stable Diffusion in as little as 3.2 GB of VRAM!
|
||||
|
||||
## Offloading to CPU with accelerate for memory savings
|
||||
|
||||
For additional memory savings, you can offload the weights to CPU and load them to GPU when performing the forward pass.
|
||||
|
||||
To perform CPU offloading, all you have to do is invoke [`~StableDiffusionPipeline.enable_sequential_cpu_offload`]:
|
||||
|
||||
```Python
|
||||
import torch
|
||||
from diffusers import StableDiffusionPipeline
|
||||
|
||||
pipe = StableDiffusionPipeline.from_pretrained(
|
||||
"runwayml/stable-diffusion-v1-5",
|
||||
revision="fp16",
|
||||
torch_dtype=torch.float16,
|
||||
)
|
||||
pipe = pipe.to("cuda")
|
||||
|
||||
prompt = "a photo of an astronaut riding a horse on mars"
|
||||
pipe.enable_sequential_cpu_offload()
|
||||
image = pipe(prompt).images[0]
|
||||
```
|
||||
|
||||
And you can get the memory consumption to < 2GB.
|
||||
|
||||
If is also possible to chain it with attention slicing for minimal memory consumption, running it in as little as < 800mb of GPU vRAM:
|
||||
|
||||
```Python
|
||||
import torch
|
||||
from diffusers import StableDiffusionPipeline
|
||||
|
||||
pipe = StableDiffusionPipeline.from_pretrained(
|
||||
"runwayml/stable-diffusion-v1-5",
|
||||
revision="fp16",
|
||||
torch_dtype=torch.float16,
|
||||
)
|
||||
pipe = pipe.to("cuda")
|
||||
|
||||
prompt = "a photo of an astronaut riding a horse on mars"
|
||||
pipe.enable_sequential_cpu_offload()
|
||||
pipe.enable_attention_slicing(1)
|
||||
|
||||
image = pipe(prompt).images[0]
|
||||
```
|
||||
|
||||
## Using Channels Last memory format
|
||||
|
||||
Channels last memory format is an alternative way of ordering NCHW tensors in memory preserving dimensions ordering. Channels last tensors ordered in such a way that channels become the densest dimension (aka storing images pixel-per-pixel). Since not all operators currently support channels last format it may result in a worst performance, so it's better to try it and see if it works for your model.
|
||||
@@ -203,7 +152,7 @@ def generate_inputs():
|
||||
|
||||
|
||||
pipe = StableDiffusionPipeline.from_pretrained(
|
||||
"runwayml/stable-diffusion-v1-5",
|
||||
"CompVis/stable-diffusion-v1-4",
|
||||
revision="fp16",
|
||||
torch_dtype=torch.float16,
|
||||
).to("cuda")
|
||||
@@ -267,7 +216,7 @@ class UNet2DConditionOutput:
|
||||
|
||||
|
||||
pipe = StableDiffusionPipeline.from_pretrained(
|
||||
"runwayml/stable-diffusion-v1-5",
|
||||
"CompVis/stable-diffusion-v1-4",
|
||||
revision="fp16",
|
||||
torch_dtype=torch.float16,
|
||||
).to("cuda")
|
||||
@@ -291,41 +240,3 @@ pipe.unet = TracedUNet()
|
||||
with torch.inference_mode():
|
||||
image = pipe([prompt] * 1, num_inference_steps=50).images[0]
|
||||
```
|
||||
|
||||
|
||||
## Memory Efficient Attention
|
||||
Recent work on optimizing the bandwitdh in the attention block have generated huge speed ups and gains in GPU memory usage. The most recent being Flash Attention (from @tridao, [code](https://github.com/HazyResearch/flash-attention), [paper](https://arxiv.org/pdf/2205.14135.pdf)) .
|
||||
Here are the speedups we obtain on a few Nvidia GPUs when running the inference at 512x512 with a batch size of 1 (one prompt):
|
||||
|
||||
| GPU | Base Attention FP16 | Memory Efficient Attention FP16 |
|
||||
|------------------ |--------------------- |--------------------------------- |
|
||||
| NVIDIA Tesla T4 | 3.5it/s | 5.5it/s |
|
||||
| NVIDIA 3060 RTX | 4.6it/s | 7.8it/s |
|
||||
| NVIDIA A10G | 8.88it/s | 15.6it/s |
|
||||
| NVIDIA RTX A6000 | 11.7it/s | 21.09it/s |
|
||||
| NVIDIA TITAN RTX | 12.51it/s | 18.22it/s |
|
||||
| A100-SXM4-40GB | 18.6it/s | 29.it/s |
|
||||
| A100-SXM-80GB | 18.7it/s | 29.5it/s |
|
||||
|
||||
To leverage it just make sure you have:
|
||||
- PyTorch > 1.12
|
||||
- Cuda available
|
||||
- Installed the [xformers](https://github.com/facebookresearch/xformers) library
|
||||
```python
|
||||
from diffusers import StableDiffusionPipeline
|
||||
import torch
|
||||
|
||||
pipe = StableDiffusionPipeline.from_pretrained(
|
||||
"runwayml/stable-diffusion-v1-5",
|
||||
revision="fp16",
|
||||
torch_dtype=torch.float16,
|
||||
).to("cuda")
|
||||
|
||||
pipe.enable_xformers_memory_efficient_attention()
|
||||
|
||||
with torch.inference_mode():
|
||||
sample = pipe("a small cat")
|
||||
|
||||
# optional: You can disable it via
|
||||
# pipe.disable_xformers_memory_efficient_attention()
|
||||
```
|
||||
@@ -17,10 +17,9 @@ specific language governing permissions and limitations under the License.
|
||||
## Requirements
|
||||
|
||||
- Mac computer with Apple silicon (M1/M2) hardware.
|
||||
- macOS 12.6 or later (13.0 or later recommended).
|
||||
- macOS 12.3 or later.
|
||||
- arm64 version of Python.
|
||||
- PyTorch 1.13. You can install it with `pip` or `conda` using the instructions in https://pytorch.org/get-started/locally/.
|
||||
|
||||
- PyTorch [Preview (Nightly)](https://pytorch.org/get-started/locally/), version `1.14.0.dev20221007` or later.
|
||||
|
||||
## Inference Pipeline
|
||||
|
||||
@@ -32,12 +31,9 @@ We recommend to "prime" the pipeline using an additional one-time pass through i
|
||||
# make sure you're logged in with `huggingface-cli login`
|
||||
from diffusers import StableDiffusionPipeline
|
||||
|
||||
pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
|
||||
pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
|
||||
pipe = pipe.to("mps")
|
||||
|
||||
# Recommended if your computer has < 64 GB of RAM
|
||||
pipe.enable_attention_slicing()
|
||||
|
||||
prompt = "a photo of an astronaut riding a horse on mars"
|
||||
|
||||
# First-time "warmup" pass (see explanation above)
|
||||
@@ -47,17 +43,16 @@ _ = pipe(prompt, num_inference_steps=1)
|
||||
image = pipe(prompt).images[0]
|
||||
```
|
||||
|
||||
## Performance Recommendations
|
||||
|
||||
M1/M2 performance is very sensitive to memory pressure. The system will automatically swap if it needs to, but performance will degrade significantly when it does.
|
||||
|
||||
We recommend you use _attention slicing_ to reduce memory pressure during inference and prevent swapping, particularly if your computer has lass than 64 GB of system RAM, or if you generate images at non-standard resolutions larger than 512 × 512 pixels. Attention slicing performs the costly attention operation in multiple steps instead of all at once. It usually has a performance impact of ~20% in computers without universal memory, but we have observed _better performance_ in most Apple Silicon computers, unless you have 64 GB or more.
|
||||
|
||||
```python
|
||||
pipeline.enable_attention_slicing()
|
||||
```
|
||||
|
||||
## Known Issues
|
||||
|
||||
- As mentioned above, we are investigating a strange [first-time inference issue](https://github.com/huggingface/diffusers/issues/372).
|
||||
- Generating multiple prompts in a batch [crashes or doesn't work reliably](https://github.com/huggingface/diffusers/issues/363). We believe this is related to the [`mps` backend in PyTorch](https://github.com/pytorch/pytorch/issues/84039). This is being resolved, but for now we recommend to iterate instead of batching.
|
||||
- Generating multiple prompts in a batch [crashes or doesn't work reliably](https://github.com/huggingface/diffusers/issues/363). We believe this might be related to the [`mps` backend in PyTorch](https://github.com/pytorch/pytorch/issues/84039#issuecomment-1237735249), but we need to investigate in more depth. For now, we recommend to iterate instead of batching.
|
||||
|
||||
## Performance
|
||||
|
||||
These are the results we got on a M1 Max MacBook Pro with 64 GB of RAM, running macOS Ventura Version 13.0 Beta (22A5331f). We performed Stable Diffusion text-to-image generation of the same prompt for 50 inference steps, using a guidance scale of 7.5.
|
||||
|
||||
| Device | Steps | Time |
|
||||
|--------|-------|---------|
|
||||
| CPU | 50 | 213.46s |
|
||||
| MPS | 50 | 30.81s |
|
||||
@@ -28,7 +28,7 @@ The snippet below demonstrates how to use the ONNX runtime. You need to use `Sta
|
||||
from diffusers import StableDiffusionOnnxPipeline
|
||||
|
||||
pipe = StableDiffusionOnnxPipeline.from_pretrained(
|
||||
"runwayml/stable-diffusion-v1-5",
|
||||
"CompVis/stable-diffusion-v1-4",
|
||||
revision="onnx",
|
||||
provider="CUDAExecutionProvider",
|
||||
)
|
||||
|
||||
@@ -68,7 +68,8 @@ You can save the image by simply calling:
|
||||
|
||||
More advanced models, like [Stable Diffusion](https://huggingface.co/CompVis/stable-diffusion) require you to accept a [license](https://huggingface.co/spaces/CompVis/stable-diffusion-license) before running the model.
|
||||
This is due to the improved image generation capabilities of the model and the potentially harmful content that could be produced with it.
|
||||
Please, head over to your stable diffusion model of choice, *e.g.* [`runwayml/stable-diffusion-v1-5`](https://huggingface.co/runwayml/stable-diffusion-v1-5), read the license carefully and tick the checkbox if you agree.
|
||||
Long story short: Head over to your stable diffusion model of choice, *e.g.* [`CompVis/stable-diffusion-v1-4`](https://huggingface.co/CompVis/stable-diffusion-v1-4), read through the license and click-accept to get
|
||||
access to the model.
|
||||
You have to be a registered user in 🤗 Hugging Face Hub, and you'll also need to use an access token for the code to work. For more information on access tokens, please refer to [this section of the documentation](https://huggingface.co/docs/hub/security-tokens).
|
||||
Having "click-accepted" the license, you can save your token:
|
||||
|
||||
@@ -76,13 +77,13 @@ Having "click-accepted" the license, you can save your token:
|
||||
AUTH_TOKEN = "<please-fill-with-your-token>"
|
||||
```
|
||||
|
||||
You can then load [`runwayml/stable-diffusion-v1-5`](https://huggingface.co/runwayml/stable-diffusion-v1-5)
|
||||
You can then load [`CompVis/stable-diffusion-v1-4`](https://huggingface.co/CompVis/stable-diffusion-v1-4)
|
||||
just like we did before only that now you need to pass your `AUTH_TOKEN`:
|
||||
|
||||
```python
|
||||
>>> from diffusers import DiffusionPipeline
|
||||
|
||||
>>> generator = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", use_auth_token=AUTH_TOKEN)
|
||||
>>> generator = DiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", use_auth_token=AUTH_TOKEN)
|
||||
```
|
||||
|
||||
If you do not pass your authentication token you will see that the diffusion system will not be correctly
|
||||
@@ -94,15 +95,15 @@ the weights locally via:
|
||||
|
||||
```
|
||||
git lfs install
|
||||
git clone https://huggingface.co/runwayml/stable-diffusion-v1-5
|
||||
git clone https://huggingface.co/CompVis/stable-diffusion-v1-4
|
||||
```
|
||||
|
||||
and then load locally saved weights into the pipeline. This way, you do not need to pass an authentication
|
||||
token. Assuming that `"./stable-diffusion-v1-5"` is the local path to the cloned stable-diffusion-v1-5 repo,
|
||||
token. Assuming that `"./stable-diffusion-v1-4"` is the local path to the cloned stable-diffusion-v1-4 repo,
|
||||
you can also load the pipeline as follows:
|
||||
|
||||
```python
|
||||
>>> generator = DiffusionPipeline.from_pretrained("./stable-diffusion-v1-5")
|
||||
>>> generator = DiffusionPipeline.from_pretrained("./stable-diffusion-v1-4")
|
||||
```
|
||||
|
||||
Running the pipeline is then identical to the code above as it's the same model architecture.
|
||||
@@ -121,10 +122,10 @@ you could use it as follows:
|
||||
```python
|
||||
>>> from diffusers import LMSDiscreteScheduler
|
||||
|
||||
>>> scheduler = LMSDiscreteScheduler.from_config("runwayml/stable-diffusion-v1-5", subfolder="scheduler")
|
||||
>>> scheduler = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear")
|
||||
|
||||
>>> generator = StableDiffusionPipeline.from_pretrained(
|
||||
... "runwayml/stable-diffusion-v1-5", scheduler=scheduler, use_auth_token=AUTH_TOKEN
|
||||
... "CompVis/stable-diffusion-v1-4", scheduler=scheduler, use_auth_token=AUTH_TOKEN
|
||||
... )
|
||||
```
|
||||
|
||||
|
||||
@@ -1,240 +0,0 @@
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# DreamBooth fine-tuning example
|
||||
|
||||
[DreamBooth](https://arxiv.org/abs/2208.12242) is a method to personalize text-to-image models like stable diffusion given just a few (3~5) images of a subject.
|
||||
|
||||

|
||||
_Dreambooth examples from the [project's blog](https://dreambooth.github.io)._
|
||||
|
||||
The [Dreambooth training script](https://github.com/huggingface/diffusers/tree/main/examples/dreambooth) shows how to implement this training procedure on a pre-trained Stable Diffusion model.
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
<!-- TODO: replace with our blog when it's done -->
|
||||
|
||||
Dreambooth fine-tuning is very sensitive to hyperparameters and easy to overfit. We recommend you take a look at our [in-depth analysis](https://wandb.ai/psuraj/dreambooth/reports/Dreambooth-Training-Analysis--VmlldzoyNzk0NDc3) with recommended settings for different subjects, and go from there.
|
||||
|
||||
</Tip>
|
||||
|
||||
## Training locally
|
||||
|
||||
### Installing the dependencies
|
||||
|
||||
Before running the scripts, make sure to install the library's training dependencies. We also recommend to install `diffusers` from the `main` github branch.
|
||||
|
||||
```bash
|
||||
pip install git+https://github.com/huggingface/diffusers
|
||||
pip install -U -r diffusers/examples/dreambooth/requirements.txt
|
||||
```
|
||||
|
||||
Then initialize and configure a [🤗 Accelerate](https://github.com/huggingface/accelerate/) environment with:
|
||||
|
||||
```bash
|
||||
accelerate config
|
||||
```
|
||||
|
||||
You need to accept the model license before downloading or using the weights. In this example we'll use model version `v1-4`, so you'll need to visit [its card](https://huggingface.co/CompVis/stable-diffusion-v1-4), read the license and tick the checkbox if you agree.
|
||||
|
||||
You have to be a registered user in 🤗 Hugging Face Hub, and you'll also need to use an access token for the code to work. For more information on access tokens, please refer to [this section of the documentation](https://huggingface.co/docs/hub/security-tokens).
|
||||
|
||||
Run the following command to authenticate your token
|
||||
|
||||
```bash
|
||||
huggingface-cli login
|
||||
```
|
||||
|
||||
If you have already cloned the repo, then you won't need to go through these steps. Instead, you can pass the path to your local checkout to the training script and it will be loaded from there.
|
||||
|
||||
### Dog toy example
|
||||
|
||||
In this example we'll use [these images](https://drive.google.com/drive/folders/1BO_dyz-p65qhBRRMRA4TbZ8qW4rB99JZ) to add a new concept to Stable Diffusion using the Dreambooth process. They will be our training data. Please, download them and place them somewhere in your system.
|
||||
|
||||
Then you can launch the training script using:
|
||||
|
||||
```bash
|
||||
export MODEL_NAME="CompVis/stable-diffusion-v1-4"
|
||||
export INSTANCE_DIR="path_to_training_images"
|
||||
export OUTPUT_DIR="path_to_saved_model"
|
||||
|
||||
accelerate launch train_dreambooth.py \
|
||||
--pretrained_model_name_or_path=$MODEL_NAME \
|
||||
--instance_data_dir=$INSTANCE_DIR \
|
||||
--output_dir=$OUTPUT_DIR \
|
||||
--instance_prompt="a photo of sks dog" \
|
||||
--resolution=512 \
|
||||
--train_batch_size=1 \
|
||||
--gradient_accumulation_steps=1 \
|
||||
--learning_rate=5e-6 \
|
||||
--lr_scheduler="constant" \
|
||||
--lr_warmup_steps=0 \
|
||||
--max_train_steps=400
|
||||
```
|
||||
|
||||
### Training with a prior-preserving loss
|
||||
|
||||
Prior preservation is used to avoid overfitting and language-drift. Please, refer to the paper to learn more about it if you are interested. For prior preservation, we use other images of the same class as part of the training process. The nice thing is that we can generate those images using the Stable Diffusion model itself! The training script will save the generated images to a local path we specify.
|
||||
|
||||
According to the paper, it's recommended to generate `num_epochs * num_samples` images for prior preservation. 200-300 works well for most cases.
|
||||
|
||||
```bash
|
||||
export MODEL_NAME="CompVis/stable-diffusion-v1-4"
|
||||
export INSTANCE_DIR="path_to_training_images"
|
||||
export CLASS_DIR="path_to_class_images"
|
||||
export OUTPUT_DIR="path_to_saved_model"
|
||||
|
||||
accelerate launch train_dreambooth.py \
|
||||
--pretrained_model_name_or_path=$MODEL_NAME \
|
||||
--instance_data_dir=$INSTANCE_DIR \
|
||||
--class_data_dir=$CLASS_DIR \
|
||||
--output_dir=$OUTPUT_DIR \
|
||||
--with_prior_preservation --prior_loss_weight=1.0 \
|
||||
--instance_prompt="a photo of sks dog" \
|
||||
--class_prompt="a photo of dog" \
|
||||
--resolution=512 \
|
||||
--train_batch_size=1 \
|
||||
--gradient_accumulation_steps=1 \
|
||||
--learning_rate=5e-6 \
|
||||
--lr_scheduler="constant" \
|
||||
--lr_warmup_steps=0 \
|
||||
--num_class_images=200 \
|
||||
--max_train_steps=800
|
||||
```
|
||||
|
||||
### Training on a 16GB GPU
|
||||
|
||||
With the help of gradient checkpointing and the 8-bit optimizer from [bitsandbytes](https://github.com/TimDettmers/bitsandbytes), it's possible to train dreambooth on a 16GB GPU.
|
||||
|
||||
```bash
|
||||
pip install bitsandbytes
|
||||
```
|
||||
|
||||
Then pass the `--use_8bit_adam` option to the training script.
|
||||
|
||||
```bash
|
||||
export MODEL_NAME="CompVis/stable-diffusion-v1-4"
|
||||
export INSTANCE_DIR="path_to_training_images"
|
||||
export CLASS_DIR="path_to_class_images"
|
||||
export OUTPUT_DIR="path_to_saved_model"
|
||||
|
||||
accelerate launch train_dreambooth.py \
|
||||
--pretrained_model_name_or_path=$MODEL_NAME \
|
||||
--instance_data_dir=$INSTANCE_DIR \
|
||||
--class_data_dir=$CLASS_DIR \
|
||||
--output_dir=$OUTPUT_DIR \
|
||||
--with_prior_preservation --prior_loss_weight=1.0 \
|
||||
--instance_prompt="a photo of sks dog" \
|
||||
--class_prompt="a photo of dog" \
|
||||
--resolution=512 \
|
||||
--train_batch_size=1 \
|
||||
--gradient_accumulation_steps=2 --gradient_checkpointing \
|
||||
--use_8bit_adam \
|
||||
--learning_rate=5e-6 \
|
||||
--lr_scheduler="constant" \
|
||||
--lr_warmup_steps=0 \
|
||||
--num_class_images=200 \
|
||||
--max_train_steps=800
|
||||
```
|
||||
|
||||
### Fine-tune the text encoder in addition to the UNet
|
||||
|
||||
The script also allows to fine-tune the `text_encoder` along with the `unet`. It has been observed experimentally that this gives much better results, especially on faces. Please, refer to [our report](https://wandb.ai/psuraj/dreambooth/reports/Dreambooth-Training-Analysis--VmlldzoyNzk0NDc3) for more details.
|
||||
|
||||
To enable this option, pass the `--train_text_encoder` argument to the training script.
|
||||
|
||||
<Tip>
|
||||
Training the text encoder requires additional memory, so training won't fit on a 16GB GPU. You'll need at least 24GB VRAM to use this option.
|
||||
</Tip>
|
||||
|
||||
```bash
|
||||
export MODEL_NAME="CompVis/stable-diffusion-v1-4"
|
||||
export INSTANCE_DIR="path_to_training_images"
|
||||
export CLASS_DIR="path_to_class_images"
|
||||
export OUTPUT_DIR="path_to_saved_model"
|
||||
|
||||
accelerate launch train_dreambooth.py \
|
||||
--pretrained_model_name_or_path=$MODEL_NAME \
|
||||
--train_text_encoder \
|
||||
--instance_data_dir=$INSTANCE_DIR \
|
||||
--class_data_dir=$CLASS_DIR \
|
||||
--output_dir=$OUTPUT_DIR \
|
||||
--with_prior_preservation --prior_loss_weight=1.0 \
|
||||
--instance_prompt="a photo of sks dog" \
|
||||
--class_prompt="a photo of dog" \
|
||||
--resolution=512 \
|
||||
--train_batch_size=1 \
|
||||
--use_8bit_adam
|
||||
--gradient_checkpointing \
|
||||
--learning_rate=2e-6 \
|
||||
--lr_scheduler="constant" \
|
||||
--lr_warmup_steps=0 \
|
||||
--num_class_images=200 \
|
||||
--max_train_steps=800
|
||||
```
|
||||
|
||||
### Training on a 8 GB GPU:
|
||||
|
||||
Using [DeepSpeed](https://www.deepspeed.ai/) it's even possible to offload some
|
||||
tensors from VRAM to either CPU or NVME, allowing training to proceed with less GPU memory.
|
||||
|
||||
DeepSpeed needs to be enabled with `accelerate config`. During configuration,
|
||||
answer yes to "Do you want to use DeepSpeed?". Combining DeepSpeed stage 2, fp16
|
||||
mixed precision, and offloading both the model parameters and the optimizer state to CPU, it's
|
||||
possible to train on under 8 GB VRAM. The drawback is that this requires more system RAM (about 25 GB). See [the DeepSpeed documentation](https://huggingface.co/docs/accelerate/usage_guides/deepspeed) for more configuration options.
|
||||
|
||||
Changing the default Adam optimizer to DeepSpeed's special version of Adam
|
||||
`deepspeed.ops.adam.DeepSpeedCPUAdam` gives a substantial speedup, but enabling
|
||||
it requires the system's CUDA toolchain version to be the same as the one installed with PyTorch. 8-bit optimizers don't seem to be compatible with DeepSpeed at the moment.
|
||||
|
||||
```bash
|
||||
export MODEL_NAME="CompVis/stable-diffusion-v1-4"
|
||||
export INSTANCE_DIR="path_to_training_images"
|
||||
export CLASS_DIR="path_to_class_images"
|
||||
export OUTPUT_DIR="path_to_saved_model"
|
||||
|
||||
accelerate launch train_dreambooth.py \
|
||||
--pretrained_model_name_or_path=$MODEL_NAME \
|
||||
--instance_data_dir=$INSTANCE_DIR \
|
||||
--class_data_dir=$CLASS_DIR \
|
||||
--output_dir=$OUTPUT_DIR \
|
||||
--with_prior_preservation --prior_loss_weight=1.0 \
|
||||
--instance_prompt="a photo of sks dog" \
|
||||
--class_prompt="a photo of dog" \
|
||||
--resolution=512 \
|
||||
--train_batch_size=1 \
|
||||
--sample_batch_size=1 \
|
||||
--gradient_accumulation_steps=1 --gradient_checkpointing \
|
||||
--learning_rate=5e-6 \
|
||||
--lr_scheduler="constant" \
|
||||
--lr_warmup_steps=0 \
|
||||
--num_class_images=200 \
|
||||
--max_train_steps=800 \
|
||||
--mixed_precision=fp16
|
||||
```
|
||||
|
||||
## Inference
|
||||
|
||||
Once you have trained a model, inference can be done using the `StableDiffusionPipeline`, by simply indicating the path where the model was saved. Make sure that your prompts include the special `identifier` used during training (`sks` in the previous examples).
|
||||
|
||||
```python
|
||||
from diffusers import StableDiffusionPipeline
|
||||
import torch
|
||||
|
||||
model_id = "path_to_saved_model"
|
||||
pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16).to("cuda")
|
||||
|
||||
prompt = "A photo of sks dog in a bucket"
|
||||
image = pipe(prompt, num_inference_steps=50, guidance_scale=7.5).images[0]
|
||||
|
||||
image.save("dog-bucket.png")
|
||||
```
|
||||
@@ -12,7 +12,7 @@ specific language governing permissions and limitations under the License.
|
||||
|
||||
# 🧨 Diffusers Training Examples
|
||||
|
||||
Diffusers training examples are a collection of scripts to demonstrate how to effectively use the `diffusers` library
|
||||
Diffusers examples are a collection of scripts to demonstrate how to effectively use the `diffusers` library
|
||||
for a variety of use cases.
|
||||
|
||||
**Note**: If you are looking for **official** examples on how to use `diffusers` for inference,
|
||||
@@ -36,15 +36,13 @@ Training examples show how to pretrain or fine-tune diffusion models for a varie
|
||||
- [Unconditional Training](./unconditional_training)
|
||||
- [Text-to-Image Training](./text2image)
|
||||
- [Text Inversion](./text_inversion)
|
||||
- [Dreambooth](./dreambooth)
|
||||
|
||||
|
||||
| Task | 🤗 Accelerate | 🤗 Datasets | Colab
|
||||
|---|---|:---:|:---:|
|
||||
| [**Unconditional Image Generation**](./unconditional_training) | ✅ | ✅ | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/training_example.ipynb)
|
||||
| [**Text-to-Image fine-tuning**](./text2image) | ✅ | ✅ |
|
||||
| [**Textual Inversion**](./text_inversion) | ✅ | - | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/sd_textual_inversion_training.ipynb)
|
||||
| [**Dreambooth**](./dreambooth) | ✅ | - | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/sd_dreambooth_training.ipynb)
|
||||
| [**Text-to-Image**](./text2image) | - | - |
|
||||
| [**Text-Inversion**](./text_inversion) | ✅ | ✅ | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/sd_textual_inversion_training.ipynb)
|
||||
|
||||
## Community
|
||||
|
||||
|
||||
@@ -11,128 +11,6 @@ specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
|
||||
# Stable Diffusion text-to-image fine-tuning
|
||||
# Text-to-Image Training
|
||||
|
||||
The [`train_text_to_image.py`](https://github.com/huggingface/diffusers/tree/main/examples/textual_inversion) script shows how to fine-tune the stable diffusion model on your own dataset.
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
The text-to-image fine-tuning script is experimental. It's easy to overfit and run into issues like catastrophic forgetting. We recommend to explore different hyperparameters to get the best results on your dataset.
|
||||
|
||||
</Tip>
|
||||
|
||||
|
||||
## Running locally
|
||||
|
||||
### Installing the dependencies
|
||||
|
||||
Before running the scripts, make sure to install the library's training dependencies:
|
||||
|
||||
```bash
|
||||
pip install git+https://github.com/huggingface/diffusers.git
|
||||
pip install -U -r requirements.txt
|
||||
```
|
||||
|
||||
And initialize an [🤗Accelerate](https://github.com/huggingface/accelerate/) environment with:
|
||||
|
||||
```bash
|
||||
accelerate config
|
||||
```
|
||||
|
||||
You need to accept the model license before downloading or using the weights. In this example we'll use model version `v1-4`, so you'll need to visit [its card](https://huggingface.co/CompVis/stable-diffusion-v1-4), read the license and tick the checkbox if you agree.
|
||||
|
||||
You have to be a registered user in 🤗 Hugging Face Hub, and you'll also need to use an access token for the code to work. For more information on access tokens, please refer to [this section of the documentation](https://huggingface.co/docs/hub/security-tokens).
|
||||
|
||||
Run the following command to authenticate your token
|
||||
|
||||
```bash
|
||||
huggingface-cli login
|
||||
```
|
||||
|
||||
If you have already cloned the repo, then you won't need to go through these steps. Instead, you can pass the path to your local checkout to the training script and it will be loaded from there.
|
||||
|
||||
### Hardware Requirements for Fine-tuning
|
||||
|
||||
Using `gradient_checkpointing` and `mixed_precision` it should be possible to fine tune the model on a single 24GB GPU. For higher `batch_size` and faster training it's better to use GPUs with more than 30GB of GPU memory. You can also use JAX / Flax for fine-tuning on TPUs or GPUs, see [below](#flax-jax-finetuning) for details.
|
||||
|
||||
### Fine-tuning Example
|
||||
|
||||
The following script will launch a fine-tuning run using [Justin Pinkneys' captioned Pokemon dataset](https://huggingface.co/datasets/lambdalabs/pokemon-blip-captions), available in Hugging Face Hub.
|
||||
|
||||
|
||||
```bash
|
||||
export MODEL_NAME="CompVis/stable-diffusion-v1-4"
|
||||
export dataset_name="lambdalabs/pokemon-blip-captions"
|
||||
|
||||
accelerate launch train_text_to_image.py \
|
||||
--pretrained_model_name_or_path=$MODEL_NAME \
|
||||
--dataset_name=$dataset_name \
|
||||
--use_ema \
|
||||
--resolution=512 --center_crop --random_flip \
|
||||
--train_batch_size=1 \
|
||||
--gradient_accumulation_steps=4 \
|
||||
--gradient_checkpointing \
|
||||
--mixed_precision="fp16" \
|
||||
--max_train_steps=15000 \
|
||||
--learning_rate=1e-05 \
|
||||
--max_grad_norm=1 \
|
||||
--lr_scheduler="constant" --lr_warmup_steps=0 \
|
||||
--output_dir="sd-pokemon-model"
|
||||
```
|
||||
|
||||
To run on your own training files you need to prepare the dataset according to the format required by `datasets`. You can upload your dataset to the Hub, or you can prepare a local folder with your files. [This documentation](https://huggingface.co/docs/datasets/v2.4.0/en/image_load#imagefolder-with-metadata) explains how to do it.
|
||||
|
||||
You should modify the script if you wish to use custom loading logic. We have left pointers in the code in the appropriate places :)
|
||||
|
||||
```bash
|
||||
export MODEL_NAME="CompVis/stable-diffusion-v1-4"
|
||||
export TRAIN_DIR="path_to_your_dataset"
|
||||
export OUTPUT_DIR="path_to_save_model"
|
||||
|
||||
accelerate launch train_text_to_image.py \
|
||||
--pretrained_model_name_or_path=$MODEL_NAME \
|
||||
--train_data_dir=$TRAIN_DIR \
|
||||
--use_ema \
|
||||
--resolution=512 --center_crop --random_flip \
|
||||
--train_batch_size=1 \
|
||||
--gradient_accumulation_steps=4 \
|
||||
--gradient_checkpointing \
|
||||
--mixed_precision="fp16" \
|
||||
--max_train_steps=15000 \
|
||||
--learning_rate=1e-05 \
|
||||
--max_grad_norm=1 \
|
||||
--lr_scheduler="constant" --lr_warmup_steps=0 \
|
||||
--output_dir=${OUTPUT_DIR}
|
||||
```
|
||||
|
||||
Once training is finished the model will be saved to the `OUTPUT_DIR` specified in the command. To load the fine-tuned model for inference, just pass that path to `StableDiffusionPipeline`:
|
||||
|
||||
```python
|
||||
from diffusers import StableDiffusionPipeline
|
||||
|
||||
model_path = "path_to_saved_model"
|
||||
pipe = StableDiffusionPipeline.from_pretrained(model_path, torch_dtype=torch.float16)
|
||||
pipe.to("cuda")
|
||||
|
||||
image = pipe(prompt="yoda").images[0]
|
||||
image.save("yoda-pokemon.png")
|
||||
```
|
||||
|
||||
### Flax / JAX fine-tuning
|
||||
|
||||
Thanks to [@duongna211](https://github.com/duongna21) it's possible to fine-tune Stable Diffusion using Flax! This is very efficient on TPU hardware but works great on GPUs too. You can use the [Flax training script](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image_flax.py) like this:
|
||||
|
||||
```Python
|
||||
export MODEL_NAME="runwayml/stable-diffusion-v1-5"
|
||||
export dataset_name="lambdalabs/pokemon-blip-captions"
|
||||
|
||||
python train_text_to_image_flax.py \
|
||||
--pretrained_model_name_or_path=$MODEL_NAME \
|
||||
--dataset_name=$dataset_name \
|
||||
--resolution=512 --center_crop --random_flip \
|
||||
--train_batch_size=1 \
|
||||
--max_train_steps=15000 \
|
||||
--learning_rate=1e-05 \
|
||||
--max_grad_norm=1 \
|
||||
--output_dir="sd-pokemon-model"
|
||||
```
|
||||
Under construction 🚧
|
||||
|
||||
@@ -49,7 +49,7 @@ The `textual_inversion.py` script [here](https://github.com/huggingface/diffuser
|
||||
|
||||
### Installing the dependencies
|
||||
|
||||
Before running the scripts, make sure to install the library's training dependencies.
|
||||
Before running the scripts, make sure to install the library's training dependencies:
|
||||
|
||||
```bash
|
||||
pip install diffusers[training] accelerate transformers
|
||||
@@ -83,7 +83,7 @@ Now let's get our dataset.Download 3-4 images from [here](https://drive.google.c
|
||||
And launch the training using
|
||||
|
||||
```bash
|
||||
export MODEL_NAME="runwayml/stable-diffusion-v1-5"
|
||||
export MODEL_NAME="CompVis/stable-diffusion-v1-4"
|
||||
export DATA_DIR="path-to-dir-containing-images"
|
||||
|
||||
accelerate launch textual_inversion.py \
|
||||
|
||||
@@ -12,10 +12,21 @@ specific language governing permissions and limitations under the License.
|
||||
|
||||
|
||||
|
||||
# Configuration
|
||||
# Quicktour
|
||||
|
||||
The handling of configurations in Diffusers is with the `ConfigMixin` class.
|
||||
Start using Diffusers🧨 quickly!
|
||||
To start, use the [`DiffusionPipeline`] for quick inference and sample generations!
|
||||
|
||||
```
|
||||
pip install diffusers
|
||||
```
|
||||
|
||||
## Main classes
|
||||
|
||||
### Models
|
||||
|
||||
### Schedulers
|
||||
|
||||
### Pipelines
|
||||
|
||||
[[autodoc]] ConfigMixin
|
||||
|
||||
Under further construction 🚧, open a [PR](https://github.com/huggingface/diffusers/compare) if you want to contribute!
|
||||
|
||||
@@ -1,169 +0,0 @@
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# How to build a community pipeline
|
||||
|
||||
*Note*: this page was built from the GitHub Issue on Community Pipelines [#841](https://github.com/huggingface/diffusers/issues/841).
|
||||
|
||||
Let's make an example!
|
||||
Say you want to define a pipeline that just does a single forward pass to a U-Net and then calls a scheduler only once (Note, this doesn't make any sense from a scientific point of view, but only represents an example of how things work under the hood).
|
||||
|
||||
Cool! So you open your favorite IDE and start creating your pipeline 💻.
|
||||
First, what model weights and configurations do we need?
|
||||
We have a U-Net and a scheduler, so our pipeline should take a U-Net and a scheduler as an argument.
|
||||
Also, as stated above, you'd like to be able to load weights and the scheduler config for Hub and share your code with others, so we'll inherit from `DiffusionPipeline`:
|
||||
|
||||
```python
|
||||
from diffusers import DiffusionPipeline
|
||||
import torch
|
||||
|
||||
|
||||
class UnetSchedulerOneForwardPipeline(DiffusionPipeline):
|
||||
def __init__(self, unet, scheduler):
|
||||
super().__init__()
|
||||
```
|
||||
|
||||
Now, we must save the `unet` and `scheduler` in a config file so that you can save your pipeline with `save_pretrained`.
|
||||
Therefore, make sure you add every component that is save-able to the `register_modules` function:
|
||||
|
||||
```python
|
||||
from diffusers import DiffusionPipeline
|
||||
import torch
|
||||
|
||||
|
||||
class UnetSchedulerOneForwardPipeline(DiffusionPipeline):
|
||||
def __init__(self, unet, scheduler):
|
||||
super().__init__()
|
||||
|
||||
self.register_modules(unet=unet, scheduler=scheduler)
|
||||
```
|
||||
|
||||
Cool, the init is done! 🔥 Now, let's go into the forward pass, which we recommend defining as `__call__` . Here you're given all the creative freedom there is. For our amazing "one-step" pipeline, we simply create a random image and call the unet once and the scheduler once:
|
||||
|
||||
```python
|
||||
from diffusers import DiffusionPipeline
|
||||
import torch
|
||||
|
||||
|
||||
class UnetSchedulerOneForwardPipeline(DiffusionPipeline):
|
||||
def __init__(self, unet, scheduler):
|
||||
super().__init__()
|
||||
|
||||
self.register_modules(unet=unet, scheduler=scheduler)
|
||||
|
||||
def __call__(self):
|
||||
image = torch.randn(
|
||||
(1, self.unet.in_channels, self.unet.sample_size, self.unet.sample_size),
|
||||
)
|
||||
timestep = 1
|
||||
|
||||
model_output = self.unet(image, timestep).sample
|
||||
scheduler_output = self.scheduler.step(model_output, timestep, image).prev_sample
|
||||
|
||||
return scheduler_output
|
||||
```
|
||||
|
||||
Cool, that's it! 🚀 You can now run this pipeline by passing a `unet` and a `scheduler` to the init:
|
||||
|
||||
```python
|
||||
from diffusers import DDPMScheduler, Unet2DModel
|
||||
|
||||
scheduler = DDPMScheduler()
|
||||
unet = UNet2DModel()
|
||||
|
||||
pipeline = UnetSchedulerOneForwardPipeline(unet=unet, scheduler=scheduler)
|
||||
|
||||
output = pipeline()
|
||||
```
|
||||
|
||||
But what's even better is that you can load pre-existing weights into the pipeline if they match exactly your pipeline structure. This is e.g. the case for [https://huggingface.co/google/ddpm-cifar10-32](https://huggingface.co/google/ddpm-cifar10-32) so that we can do the following:
|
||||
|
||||
```python
|
||||
pipeline = UnetSchedulerOneForwardPipeline.from_pretrained("google/ddpm-cifar10-32")
|
||||
|
||||
output = pipeline()
|
||||
```
|
||||
|
||||
We want to share this amazing pipeline with the community, so we would open a PR request to add the following code under `one_step_unet.py` to [https://github.com/huggingface/diffusers/tree/main/examples/community](https://github.com/huggingface/diffusers/tree/main/examples/community) .
|
||||
|
||||
```python
|
||||
from diffusers import DiffusionPipeline
|
||||
import torch
|
||||
|
||||
|
||||
class UnetSchedulerOneForwardPipeline(DiffusionPipeline):
|
||||
def __init__(self, unet, scheduler):
|
||||
super().__init__()
|
||||
|
||||
self.register_modules(unet=unet, scheduler=scheduler)
|
||||
|
||||
def __call__(self):
|
||||
image = torch.randn(
|
||||
(1, self.unet.in_channels, self.unet.sample_size, self.unet.sample_size),
|
||||
)
|
||||
timestep = 1
|
||||
|
||||
model_output = self.unet(image, timestep).sample
|
||||
scheduler_output = self.scheduler.step(model_output, timestep, image).prev_sample
|
||||
|
||||
return scheduler_output
|
||||
```
|
||||
|
||||
Our amazing pipeline got merged here: [#840](https://github.com/huggingface/diffusers/pull/840).
|
||||
Now everybody that has `diffusers >= 0.4.0` installed can use our pipeline magically 🪄 as follows:
|
||||
|
||||
```python
|
||||
from diffusers import DiffusionPipeline
|
||||
|
||||
pipe = DiffusionPipeline.from_pretrained("google/ddpm-cifar10-32", custom_pipeline="one_step_unet")
|
||||
pipe()
|
||||
```
|
||||
|
||||
Another way to upload your custom_pipeline, besides sending a PR, is uploading the code that contains it to the Hugging Face Hub, [as exemplified here](https://huggingface.co/docs/diffusers/using-diffusers/custom_pipelines#loading-custom-pipelines-from-the-hub).
|
||||
|
||||
**Try it out now - it works!**
|
||||
|
||||
In general, you will want to create much more sophisticated pipelines, so we recommend looking at existing pipelines here: [https://github.com/huggingface/diffusers/tree/main/examples/community](https://github.com/huggingface/diffusers/tree/main/examples/community).
|
||||
|
||||
IMPORTANT:
|
||||
You can use whatever package you want in your community pipeline file - as long as the user has it installed, everything will work fine. Make sure you have one and only one pipeline class that inherits from `DiffusionPipeline` as this will be automatically detected.
|
||||
|
||||
## How do community pipelines work?
|
||||
A community pipeline is a class that has to inherit from ['DiffusionPipeline']:
|
||||
and that has been added to `examples/community` [files](https://github.com/huggingface/diffusers/tree/main/examples/community).
|
||||
The community can load the pipeline code via the custom_pipeline argument from DiffusionPipeline. See docs [here](https://huggingface.co/docs/diffusers/api/diffusion_pipeline#diffusers.DiffusionPipeline.from_pretrained.custom_pipeline):
|
||||
|
||||
This means:
|
||||
The model weights and configs of the pipeline should be loaded from the `pretrained_model_name_or_path` [argument](https://huggingface.co/docs/diffusers/api/diffusion_pipeline#diffusers.DiffusionPipeline.from_pretrained.pretrained_model_name_or_path):
|
||||
whereas the code that powers the community pipeline is defined in a file added in [`examples/community`](https://github.com/huggingface/diffusers/tree/main/examples/community).
|
||||
|
||||
Now, it might very well be that only some of your pipeline components weights can be downloaded from an official repo.
|
||||
The other components should then be passed directly to init as is the case for the ClIP guidance notebook [here](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/CLIP_Guided_Stable_diffusion_with_diffusers.ipynb#scrollTo=z9Kglma6hjki).
|
||||
|
||||
The magic behind all of this is that we load the code directly from GitHub. You can check it out in more detail if you follow the functionality defined here:
|
||||
|
||||
```python
|
||||
# 2. Load the pipeline class, if using custom module then load it from the hub
|
||||
# if we load from explicit class, let's use it
|
||||
if custom_pipeline is not None:
|
||||
pipeline_class = get_class_from_dynamic_module(
|
||||
custom_pipeline, module_file=CUSTOM_PIPELINE_FILE_NAME, cache_dir=custom_pipeline
|
||||
)
|
||||
elif cls != DiffusionPipeline:
|
||||
pipeline_class = cls
|
||||
else:
|
||||
diffusers_module = importlib.import_module(cls.__module__.split(".")[0])
|
||||
pipeline_class = getattr(diffusers_module, config_dict["_class_name"])
|
||||
```
|
||||
|
||||
This is why a community pipeline merged to GitHub will be directly available to all `diffusers` packages.
|
||||
|
||||
15
docs/source/using-diffusers/custom.mdx
Normal file
15
docs/source/using-diffusers/custom.mdx
Normal file
@@ -0,0 +1,15 @@
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Custom Pipeline
|
||||
|
||||
Under construction 🚧
|
||||
@@ -1,283 +0,0 @@
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Custom Pipelines
|
||||
|
||||
> **For more information about community pipelines, please have a look at [this issue](https://github.com/huggingface/diffusers/issues/841).**
|
||||
|
||||
**Community** examples consist of both inference and training examples that have been added by the community.
|
||||
Please have a look at the following table to get an overview of all community examples. Click on the **Code Example** to get a copy-and-paste ready code example that you can try out.
|
||||
If a community doesn't work as expected, please open an issue and ping the author on it.
|
||||
|
||||
| Example | Description | Code Example | Colab | Author |
|
||||
|:---------------------------------------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:------------------------------------------------------------------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------:|
|
||||
| CLIP Guided Stable Diffusion | Doing CLIP guidance for text to image generation with Stable Diffusion | [CLIP Guided Stable Diffusion](#clip-guided-stable-diffusion) | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/CLIP_Guided_Stable_diffusion_with_diffusers.ipynb) | [Suraj Patil](https://github.com/patil-suraj/) |
|
||||
| One Step U-Net (Dummy) | Example showcasing of how to use Community Pipelines (see https://github.com/huggingface/diffusers/issues/841) | [One Step U-Net](#one-step-unet) | - | [Patrick von Platen](https://github.com/patrickvonplaten/) |
|
||||
| Stable Diffusion Interpolation | Interpolate the latent space of Stable Diffusion between different prompts/seeds | [Stable Diffusion Interpolation](#stable-diffusion-interpolation) | - | [Nate Raw](https://github.com/nateraw/) |
|
||||
| Stable Diffusion Mega | **One** Stable Diffusion Pipeline with all functionalities of [Text2Image](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py), [Image2Image](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py) and [Inpainting](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py) | [Stable Diffusion Mega](#stable-diffusion-mega) | - | [Patrick von Platen](https://github.com/patrickvonplaten/) |
|
||||
| Long Prompt Weighting Stable Diffusion | **One** Stable Diffusion Pipeline without tokens length limit, and support parsing weighting in prompt. | [Long Prompt Weighting Stable Diffusion](#long-prompt-weighting-stable-diffusion) | - | [SkyTNT](https://github.com/SkyTNT) |
|
||||
| Speech to Image | Using automatic-speech-recognition to transcribe text and Stable Diffusion to generate images | [Speech to Image](#speech-to-image) | - | [Mikail Duzenli](https://github.com/MikailINTech)
|
||||
|
||||
To load a custom pipeline you just need to pass the `custom_pipeline` argument to `DiffusionPipeline`, as one of the files in `diffusers/examples/community`. Feel free to send a PR with your own pipelines, we will merge them quickly.
|
||||
```py
|
||||
pipe = DiffusionPipeline.from_pretrained(
|
||||
"CompVis/stable-diffusion-v1-4", custom_pipeline="filename_in_the_community_folder"
|
||||
)
|
||||
```
|
||||
|
||||
## Example usages
|
||||
|
||||
### CLIP Guided Stable Diffusion
|
||||
|
||||
CLIP guided stable diffusion can help to generate more realistic images
|
||||
by guiding stable diffusion at every denoising step with an additional CLIP model.
|
||||
|
||||
The following code requires roughly 12GB of GPU RAM.
|
||||
|
||||
```python
|
||||
from diffusers import DiffusionPipeline
|
||||
from transformers import CLIPFeatureExtractor, CLIPModel
|
||||
import torch
|
||||
|
||||
|
||||
feature_extractor = CLIPFeatureExtractor.from_pretrained("laion/CLIP-ViT-B-32-laion2B-s34B-b79K")
|
||||
clip_model = CLIPModel.from_pretrained("laion/CLIP-ViT-B-32-laion2B-s34B-b79K", torch_dtype=torch.float16)
|
||||
|
||||
|
||||
guided_pipeline = DiffusionPipeline.from_pretrained(
|
||||
"CompVis/stable-diffusion-v1-4",
|
||||
custom_pipeline="clip_guided_stable_diffusion",
|
||||
clip_model=clip_model,
|
||||
feature_extractor=feature_extractor,
|
||||
revision="fp16",
|
||||
torch_dtype=torch.float16,
|
||||
)
|
||||
guided_pipeline.enable_attention_slicing()
|
||||
guided_pipeline = guided_pipeline.to("cuda")
|
||||
|
||||
prompt = "fantasy book cover, full moon, fantasy forest landscape, golden vector elements, fantasy magic, dark light night, intricate, elegant, sharp focus, illustration, highly detailed, digital painting, concept art, matte, art by WLOP and Artgerm and Albert Bierstadt, masterpiece"
|
||||
|
||||
generator = torch.Generator(device="cuda").manual_seed(0)
|
||||
images = []
|
||||
for i in range(4):
|
||||
image = guided_pipeline(
|
||||
prompt,
|
||||
num_inference_steps=50,
|
||||
guidance_scale=7.5,
|
||||
clip_guidance_scale=100,
|
||||
num_cutouts=4,
|
||||
use_cutouts=False,
|
||||
generator=generator,
|
||||
).images[0]
|
||||
images.append(image)
|
||||
|
||||
# save images locally
|
||||
for i, img in enumerate(images):
|
||||
img.save(f"./clip_guided_sd/image_{i}.png")
|
||||
```
|
||||
|
||||
The `images` list contains a list of PIL images that can be saved locally or displayed directly in a google colab.
|
||||
Generated images tend to be of higher qualtiy than natively using stable diffusion. E.g. the above script generates the following images:
|
||||
|
||||
.
|
||||
|
||||
### One Step Unet
|
||||
|
||||
The dummy "one-step-unet" can be run as follows:
|
||||
|
||||
```python
|
||||
from diffusers import DiffusionPipeline
|
||||
|
||||
pipe = DiffusionPipeline.from_pretrained("google/ddpm-cifar10-32", custom_pipeline="one_step_unet")
|
||||
pipe()
|
||||
```
|
||||
|
||||
**Note**: This community pipeline is not useful as a feature, but rather just serves as an example of how community pipelines can be added (see https://github.com/huggingface/diffusers/issues/841).
|
||||
|
||||
### Stable Diffusion Interpolation
|
||||
|
||||
The following code can be run on a GPU of at least 8GB VRAM and should take approximately 5 minutes.
|
||||
|
||||
```python
|
||||
from diffusers import DiffusionPipeline
|
||||
import torch
|
||||
|
||||
pipe = DiffusionPipeline.from_pretrained(
|
||||
"CompVis/stable-diffusion-v1-4",
|
||||
revision="fp16",
|
||||
torch_dtype=torch.float16,
|
||||
safety_checker=None, # Very important for videos...lots of false positives while interpolating
|
||||
custom_pipeline="interpolate_stable_diffusion",
|
||||
).to("cuda")
|
||||
pipe.enable_attention_slicing()
|
||||
|
||||
frame_filepaths = pipe.walk(
|
||||
prompts=["a dog", "a cat", "a horse"],
|
||||
seeds=[42, 1337, 1234],
|
||||
num_interpolation_steps=16,
|
||||
output_dir="./dreams",
|
||||
batch_size=4,
|
||||
height=512,
|
||||
width=512,
|
||||
guidance_scale=8.5,
|
||||
num_inference_steps=50,
|
||||
)
|
||||
```
|
||||
|
||||
The output of the `walk(...)` function returns a list of images saved under the folder as defined in `output_dir`. You can use these images to create videos of stable diffusion.
|
||||
|
||||
> **Please have a look at https://github.com/nateraw/stable-diffusion-videos for more in-detail information on how to create videos using stable diffusion as well as more feature-complete functionality.**
|
||||
|
||||
### Stable Diffusion Mega
|
||||
|
||||
The Stable Diffusion Mega Pipeline lets you use the main use cases of the stable diffusion pipeline in a single class.
|
||||
|
||||
```python
|
||||
#!/usr/bin/env python3
|
||||
from diffusers import DiffusionPipeline
|
||||
import PIL
|
||||
import requests
|
||||
from io import BytesIO
|
||||
import torch
|
||||
|
||||
|
||||
def download_image(url):
|
||||
response = requests.get(url)
|
||||
return PIL.Image.open(BytesIO(response.content)).convert("RGB")
|
||||
|
||||
|
||||
pipe = DiffusionPipeline.from_pretrained(
|
||||
"CompVis/stable-diffusion-v1-4",
|
||||
custom_pipeline="stable_diffusion_mega",
|
||||
torch_dtype=torch.float16,
|
||||
revision="fp16",
|
||||
)
|
||||
pipe.to("cuda")
|
||||
pipe.enable_attention_slicing()
|
||||
|
||||
|
||||
### Text-to-Image
|
||||
|
||||
images = pipe.text2img("An astronaut riding a horse").images
|
||||
|
||||
### Image-to-Image
|
||||
|
||||
init_image = download_image(
|
||||
"https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg"
|
||||
)
|
||||
|
||||
prompt = "A fantasy landscape, trending on artstation"
|
||||
|
||||
images = pipe.img2img(prompt=prompt, init_image=init_image, strength=0.75, guidance_scale=7.5).images
|
||||
|
||||
### Inpainting
|
||||
|
||||
img_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png"
|
||||
mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png"
|
||||
init_image = download_image(img_url).resize((512, 512))
|
||||
mask_image = download_image(mask_url).resize((512, 512))
|
||||
|
||||
prompt = "a cat sitting on a bench"
|
||||
images = pipe.inpaint(prompt=prompt, init_image=init_image, mask_image=mask_image, strength=0.75).images
|
||||
```
|
||||
|
||||
As shown above this one pipeline can run all both "text-to-image", "image-to-image", and "inpainting" in one pipeline.
|
||||
|
||||
### Long Prompt Weighting Stable Diffusion
|
||||
|
||||
The Pipeline lets you input prompt without 77 token length limit. And you can increase words weighting by using "()" or decrease words weighting by using "[]"
|
||||
The Pipeline also lets you use the main use cases of the stable diffusion pipeline in a single class.
|
||||
|
||||
#### pytorch
|
||||
|
||||
```python
|
||||
from diffusers import DiffusionPipeline
|
||||
import torch
|
||||
|
||||
pipe = DiffusionPipeline.from_pretrained(
|
||||
"hakurei/waifu-diffusion", custom_pipeline="lpw_stable_diffusion", revision="fp16", torch_dtype=torch.float16
|
||||
)
|
||||
pipe = pipe.to("cuda")
|
||||
|
||||
prompt = "best_quality (1girl:1.3) bow bride brown_hair closed_mouth frilled_bow frilled_hair_tubes frills (full_body:1.3) fox_ear hair_bow hair_tubes happy hood japanese_clothes kimono long_sleeves red_bow smile solo tabi uchikake white_kimono wide_sleeves cherry_blossoms"
|
||||
neg_prompt = "lowres, bad_anatomy, error_body, error_hair, error_arm, error_hands, bad_hands, error_fingers, bad_fingers, missing_fingers, error_legs, bad_legs, multiple_legs, missing_legs, error_lighting, error_shadow, error_reflection, text, error, extra_digit, fewer_digits, cropped, worst_quality, low_quality, normal_quality, jpeg_artifacts, signature, watermark, username, blurry"
|
||||
|
||||
pipe.text2img(prompt, negative_prompt=neg_prompt, width=512, height=512, max_embeddings_multiples=3).images[0]
|
||||
```
|
||||
|
||||
#### onnxruntime
|
||||
|
||||
```python
|
||||
from diffusers import DiffusionPipeline
|
||||
import torch
|
||||
|
||||
pipe = DiffusionPipeline.from_pretrained(
|
||||
"CompVis/stable-diffusion-v1-4",
|
||||
custom_pipeline="lpw_stable_diffusion_onnx",
|
||||
revision="onnx",
|
||||
provider="CUDAExecutionProvider",
|
||||
)
|
||||
|
||||
prompt = "a photo of an astronaut riding a horse on mars, best quality"
|
||||
neg_prompt = "lowres, bad anatomy, error body, error hair, error arm, error hands, bad hands, error fingers, bad fingers, missing fingers, error legs, bad legs, multiple legs, missing legs, error lighting, error shadow, error reflection, text, error, extra digit, fewer digits, cropped, worst quality, low quality, normal quality, jpeg artifacts, signature, watermark, username, blurry"
|
||||
|
||||
pipe.text2img(prompt, negative_prompt=neg_prompt, width=512, height=512, max_embeddings_multiples=3).images[0]
|
||||
```
|
||||
|
||||
if you see `Token indices sequence length is longer than the specified maximum sequence length for this model ( *** > 77 ) . Running this sequence through the model will result in indexing errors`. Do not worry, it is normal.
|
||||
|
||||
### Speech to Image
|
||||
|
||||
The following code can generate an image from an audio sample using pre-trained OpenAI whisper-small and Stable Diffusion.
|
||||
|
||||
```Python
|
||||
import torch
|
||||
|
||||
import matplotlib.pyplot as plt
|
||||
from datasets import load_dataset
|
||||
from diffusers import DiffusionPipeline
|
||||
from transformers import (
|
||||
WhisperForConditionalGeneration,
|
||||
WhisperProcessor,
|
||||
)
|
||||
|
||||
|
||||
device = "cuda" if torch.cuda.is_available() else "cpu"
|
||||
|
||||
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
|
||||
|
||||
audio_sample = ds[3]
|
||||
|
||||
text = audio_sample["text"].lower()
|
||||
speech_data = audio_sample["audio"]["array"]
|
||||
|
||||
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small").to(device)
|
||||
processor = WhisperProcessor.from_pretrained("openai/whisper-small")
|
||||
|
||||
diffuser_pipeline = DiffusionPipeline.from_pretrained(
|
||||
"CompVis/stable-diffusion-v1-4",
|
||||
custom_pipeline="speech_to_image_diffusion",
|
||||
speech_model=model,
|
||||
speech_processor=processor,
|
||||
revision="fp16",
|
||||
torch_dtype=torch.float16,
|
||||
)
|
||||
|
||||
diffuser_pipeline.enable_attention_slicing()
|
||||
diffuser_pipeline = diffuser_pipeline.to(device)
|
||||
|
||||
output = diffuser_pipeline(speech_data)
|
||||
plt.imshow(output.images[0])
|
||||
```
|
||||
This example produces the following image:
|
||||
|
||||

|
||||
@@ -10,7 +10,7 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Loading and Saving Custom Pipelines
|
||||
# Custom Pipelines
|
||||
|
||||
Diffusers allows you to conveniently load any custom pipeline from the Hugging Face Hub as well as any [official community pipeline](https://github.com/huggingface/diffusers/tree/main/examples/community)
|
||||
via the [`DiffusionPipeline`] class.
|
||||
@@ -58,7 +58,7 @@ feature_extractor = CLIPFeatureExtractor.from_pretrained(clip_model_id)
|
||||
clip_model = CLIPModel.from_pretrained(clip_model_id)
|
||||
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
"runwayml/stable-diffusion-v1-5",
|
||||
"CompVis/stable-diffusion-v1-4",
|
||||
custom_pipeline="clip_guided_stable_diffusion",
|
||||
clip_model=clip_model,
|
||||
feature_extractor=feature_extractor,
|
||||
@@ -25,7 +25,7 @@ from diffusers import StableDiffusionImg2ImgPipeline
|
||||
# load the pipeline
|
||||
device = "cuda"
|
||||
pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
|
||||
"runwayml/stable-diffusion-v1-5", revision="fp16", torch_dtype=torch.float16
|
||||
"CompVis/stable-diffusion-v1-4", revision="fp16", torch_dtype=torch.float16
|
||||
).to(device)
|
||||
|
||||
# let's download an initial image
|
||||
|
||||
@@ -12,20 +12,14 @@ specific language governing permissions and limitations under the License.
|
||||
|
||||
# Text-Guided Image-Inpainting
|
||||
|
||||
The [`StableDiffusionInpaintPipeline`] lets you edit specific parts of an image by providing a mask and a text prompt. It uses a version of Stable Diffusion specifically trained for in-painting tasks.
|
||||
|
||||
<Tip warning={true}>
|
||||
Note that this model is distributed separately from the regular Stable Diffusion model, so you have to accept its license even if you accepted the Stable Diffusion one in the past.
|
||||
|
||||
Please, visit the [model card](https://huggingface.co/runwayml/stable-diffusion-inpainting), read the license carefully and tick the checkbox if you agree. You have to be a registered user in 🤗 Hugging Face Hub, and you'll also need to use an access token for the code to work. For more information on access tokens, please refer to [this section](https://huggingface.co/docs/hub/security-tokens) of the documentation.
|
||||
</Tip>
|
||||
The [`StableDiffusionInpaintPipeline`] lets you edit specific parts of an image by providing a mask and text prompt.
|
||||
|
||||
```python
|
||||
import PIL
|
||||
import requests
|
||||
import torch
|
||||
from io import BytesIO
|
||||
|
||||
import requests
|
||||
import PIL
|
||||
|
||||
from diffusers import StableDiffusionInpaintPipeline
|
||||
|
||||
|
||||
@@ -40,24 +34,15 @@ mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data
|
||||
init_image = download_image(img_url).resize((512, 512))
|
||||
mask_image = download_image(mask_url).resize((512, 512))
|
||||
|
||||
device = "cuda"
|
||||
pipe = StableDiffusionInpaintPipeline.from_pretrained(
|
||||
"runwayml/stable-diffusion-inpainting",
|
||||
revision="fp16",
|
||||
torch_dtype=torch.float16,
|
||||
)
|
||||
pipe = pipe.to("cuda")
|
||||
"CompVis/stable-diffusion-v1-4", revision="fp16", torch_dtype=torch.float16
|
||||
).to(device)
|
||||
|
||||
prompt = "Face of a yellow cat, high resolution, sitting on a park bench"
|
||||
image = pipe(prompt=prompt, image=init_image, mask_image=mask_image).images[0]
|
||||
prompt = "a cat sitting on a bench"
|
||||
images = pipe(prompt=prompt, init_image=init_image, mask_image=mask_image, strength=0.75).images
|
||||
|
||||
images[0].save("cat_on_bench.png")
|
||||
```
|
||||
|
||||
`image` | `mask_image` | `prompt` | **Output** |
|
||||
:-------------------------:|:-------------------------:|:-------------------------:|-------------------------:|
|
||||
<img src="https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png" alt="drawing" width="250"/> | <img src="https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png" alt="drawing" width="250"/> | ***Face of a yellow cat, high resolution, sitting on a park bench*** | <img src="https://huggingface.co/datasets/patrickvonplaten/images/resolve/main/test.png" alt="drawing" width="250"/> |
|
||||
|
||||
|
||||
You can also run this example on colab [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/in_painting_with_stable_diffusion_using_diffusers.ipynb)
|
||||
|
||||
<Tip warning={true}>
|
||||
A previous experimental implementation of in-painting used a different, lower-quality process. To ensure backwards compatibility, loading a pretrained pipeline that doesn't contain the new model will still apply the old in-painting method.
|
||||
</Tip>
|
||||
@@ -12,23 +12,4 @@ specific language governing permissions and limitations under the License.
|
||||
|
||||
# Loading
|
||||
|
||||
The core functionality for saving and loading systems in `Diffusers` is the HuggingFace Hub.
|
||||
|
||||
[[autodoc]] modeling_utils.ModelMixin
|
||||
- from_pretrained
|
||||
- save_pretrained
|
||||
|
||||
[[autodoc]] pipeline_utils.DiffusionPipeline
|
||||
- from_pretrained
|
||||
- save_pretrained
|
||||
|
||||
[[autodoc]] modeling_flax_utils.FlaxModelMixin
|
||||
- from_pretrained
|
||||
- save_pretrained
|
||||
|
||||
[[autodoc]] pipeline_flax_utils.FlaxDiffusionPipeline
|
||||
- from_pretrained
|
||||
- save_pretrained
|
||||
|
||||
|
||||
Under further construction 🚧, open a [PR](https://github.com/huggingface/diffusers/compare) if you want to contribute!
|
||||
Under construction 🚧
|
||||
|
||||
@@ -16,7 +16,7 @@ limitations under the License.
|
||||
# 🧨 Diffusers Examples
|
||||
|
||||
Diffusers examples are a collection of scripts to demonstrate how to effectively use the `diffusers` library
|
||||
for a variety of use cases involving training or fine-tuning.
|
||||
for a variety of use cases.
|
||||
|
||||
**Note**: If you are looking for **official** examples on how to use `diffusers` for inference,
|
||||
please have a look at [src/diffusers/pipelines](https://github.com/huggingface/diffusers/tree/main/src/diffusers/pipelines)
|
||||
@@ -38,11 +38,7 @@ Training examples show how to pretrain or fine-tune diffusion models for a varie
|
||||
|
||||
| Task | 🤗 Accelerate | 🤗 Datasets | Colab
|
||||
|---|---|:---:|:---:|
|
||||
| [**Unconditional Image Generation**](./unconditional_training) | ✅ | ✅ | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/training_example.ipynb)
|
||||
| [**Text-to-Image fine-tuning**](./text2image) | ✅ | ✅ |
|
||||
| [**Textual Inversion**](./text_inversion) | ✅ | - | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/sd_textual_inversion_training.ipynb)
|
||||
| [**Dreambooth**](./dreambooth) | ✅ | - | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/sd_dreambooth_training.ipynb)
|
||||
|
||||
| [**Unconditional Image Generation**](https://github.com/huggingface/diffusers/blob/main/examples/unconditional_image_generation/train_unconditional.py) | ✅ | ✅ | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/training_example.ipynb)
|
||||
|
||||
## Community
|
||||
|
||||
|
||||
@@ -1,503 +1,7 @@
|
||||
# Community Examples
|
||||
|
||||
> **For more information about community pipelines, please have a look at [this issue](https://github.com/huggingface/diffusers/issues/841).**
|
||||
|
||||
**Community** examples consist of both inference and training examples that have been added by the community.
|
||||
Please have a look at the following table to get an overview of all community examples. Click on the **Code Example** to get a copy-and-paste ready code example that you can try out.
|
||||
If a community doesn't work as expected, please open an issue and ping the author on it.
|
||||
|
||||
| Example | Description | Code Example | Colab | Author |
|
||||
|:---------------------------------------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:------------------------------------------------------------------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------:|
|
||||
| CLIP Guided Stable Diffusion | Doing CLIP guidance for text to image generation with Stable Diffusion | [CLIP Guided Stable Diffusion](#clip-guided-stable-diffusion) | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/CLIP_Guided_Stable_diffusion_with_diffusers.ipynb) | [Suraj Patil](https://github.com/patil-suraj/) |
|
||||
| One Step U-Net (Dummy) | Example showcasing of how to use Community Pipelines (see https://github.com/huggingface/diffusers/issues/841) | [One Step U-Net](#one-step-unet) | - | [Patrick von Platen](https://github.com/patrickvonplaten/) |
|
||||
| Stable Diffusion Interpolation | Interpolate the latent space of Stable Diffusion between different prompts/seeds | [Stable Diffusion Interpolation](#stable-diffusion-interpolation) | - | [Nate Raw](https://github.com/nateraw/) |
|
||||
| Stable Diffusion Mega | **One** Stable Diffusion Pipeline with all functionalities of [Text2Image](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py), [Image2Image](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py) and [Inpainting](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py) | [Stable Diffusion Mega](#stable-diffusion-mega) | - | [Patrick von Platen](https://github.com/patrickvonplaten/) |
|
||||
| Long Prompt Weighting Stable Diffusion | **One** Stable Diffusion Pipeline without tokens length limit, and support parsing weighting in prompt. | [Long Prompt Weighting Stable Diffusion](#long-prompt-weighting-stable-diffusion) | - | [SkyTNT](https://github.com/SkyTNT) |
|
||||
| Speech to Image | Using automatic-speech-recognition to transcribe text and Stable Diffusion to generate images | [Speech to Image](#speech-to-image) | - | [Mikail Duzenli](https://github.com/MikailINTech)
|
||||
| Wild Card Stable Diffusion | Stable Diffusion Pipeline that supports prompts that contain wildcard terms (indicated by surrounding double underscores), with values instantiated randomly from a corresponding txt file or a dictionary of possible values | [Wildcard Stable Diffusion](#wildcard-stable-diffusion) | - | [Shyam Sudhakaran](https://github.com/shyamsn97) |
|
||||
| Composable Stable Diffusion| Stable Diffusion Pipeline that supports prompts that contain "|" in prompts (as an AND condition) and weights (separated by "|" as well) to positively / negatively weight prompts. | [Composable Stable Diffusion](#composable-stable-diffusion) | - | [Mark Rich](https://github.com/MarkRich) |
|
||||
| Seed Resizing Stable Diffusion| Stable Diffusion Pipeline that supports resizing an image and retaining the concepts of the 512 by 512 generation. | [Seed Resizing](#seed-resizing) | - | [Mark Rich](https://github.com/MarkRich) |
|
||||
|
||||
| Imagic Stable Diffusion | Stable Diffusion Pipeline that enables writing a text prompt to edit an existing image| [Imagic Stable Diffusion](#imagic-stable-diffusion) | - | [Mark Rich](https://github.com/MarkRich) |
|
||||
|
||||
|
||||
To load a custom pipeline you just need to pass the `custom_pipeline` argument to `DiffusionPipeline`, as one of the files in `diffusers/examples/community`. Feel free to send a PR with your own pipelines, we will merge them quickly.
|
||||
```py
|
||||
pipe = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", custom_pipeline="filename_in_the_community_folder")
|
||||
```
|
||||
|
||||
## Example usages
|
||||
|
||||
### CLIP Guided Stable Diffusion
|
||||
|
||||
CLIP guided stable diffusion can help to generate more realistic images
|
||||
by guiding stable diffusion at every denoising step with an additional CLIP model.
|
||||
|
||||
The following code requires roughly 12GB of GPU RAM.
|
||||
|
||||
```python
|
||||
from diffusers import DiffusionPipeline
|
||||
from transformers import CLIPFeatureExtractor, CLIPModel
|
||||
import torch
|
||||
|
||||
|
||||
feature_extractor = CLIPFeatureExtractor.from_pretrained("laion/CLIP-ViT-B-32-laion2B-s34B-b79K")
|
||||
clip_model = CLIPModel.from_pretrained("laion/CLIP-ViT-B-32-laion2B-s34B-b79K", torch_dtype=torch.float16)
|
||||
|
||||
|
||||
guided_pipeline = DiffusionPipeline.from_pretrained(
|
||||
"runwayml/stable-diffusion-v1-5",
|
||||
custom_pipeline="clip_guided_stable_diffusion",
|
||||
clip_model=clip_model,
|
||||
feature_extractor=feature_extractor,
|
||||
revision="fp16",
|
||||
torch_dtype=torch.float16,
|
||||
)
|
||||
guided_pipeline.enable_attention_slicing()
|
||||
guided_pipeline = guided_pipeline.to("cuda")
|
||||
|
||||
prompt = "fantasy book cover, full moon, fantasy forest landscape, golden vector elements, fantasy magic, dark light night, intricate, elegant, sharp focus, illustration, highly detailed, digital painting, concept art, matte, art by WLOP and Artgerm and Albert Bierstadt, masterpiece"
|
||||
|
||||
generator = torch.Generator(device="cuda").manual_seed(0)
|
||||
images = []
|
||||
for i in range(4):
|
||||
image = guided_pipeline(
|
||||
prompt,
|
||||
num_inference_steps=50,
|
||||
guidance_scale=7.5,
|
||||
clip_guidance_scale=100,
|
||||
num_cutouts=4,
|
||||
use_cutouts=False,
|
||||
generator=generator,
|
||||
).images[0]
|
||||
images.append(image)
|
||||
|
||||
# save images locally
|
||||
for i, img in enumerate(images):
|
||||
img.save(f"./clip_guided_sd/image_{i}.png")
|
||||
```
|
||||
|
||||
The `images` list contains a list of PIL images that can be saved locally or displayed directly in a google colab.
|
||||
Generated images tend to be of higher qualtiy than natively using stable diffusion. E.g. the above script generates the following images:
|
||||
|
||||
.
|
||||
|
||||
### One Step Unet
|
||||
|
||||
The dummy "one-step-unet" can be run as follows:
|
||||
|
||||
```python
|
||||
from diffusers import DiffusionPipeline
|
||||
|
||||
pipe = DiffusionPipeline.from_pretrained("google/ddpm-cifar10-32", custom_pipeline="one_step_unet")
|
||||
pipe()
|
||||
```
|
||||
|
||||
**Note**: This community pipeline is not useful as a feature, but rather just serves as an example of how community pipelines can be added (see https://github.com/huggingface/diffusers/issues/841).
|
||||
|
||||
### Stable Diffusion Interpolation
|
||||
|
||||
The following code can be run on a GPU of at least 8GB VRAM and should take approximately 5 minutes.
|
||||
|
||||
```python
|
||||
from diffusers import DiffusionPipeline
|
||||
import torch
|
||||
|
||||
pipe = DiffusionPipeline.from_pretrained(
|
||||
"CompVis/stable-diffusion-v1-4",
|
||||
revision='fp16',
|
||||
torch_dtype=torch.float16,
|
||||
safety_checker=None, # Very important for videos...lots of false positives while interpolating
|
||||
custom_pipeline="interpolate_stable_diffusion",
|
||||
).to('cuda')
|
||||
pipe.enable_attention_slicing()
|
||||
|
||||
frame_filepaths = pipe.walk(
|
||||
prompts=['a dog', 'a cat', 'a horse'],
|
||||
seeds=[42, 1337, 1234],
|
||||
num_interpolation_steps=16,
|
||||
output_dir='./dreams',
|
||||
batch_size=4,
|
||||
height=512,
|
||||
width=512,
|
||||
guidance_scale=8.5,
|
||||
num_inference_steps=50,
|
||||
)
|
||||
```
|
||||
|
||||
The output of the `walk(...)` function returns a list of images saved under the folder as defined in `output_dir`. You can use these images to create videos of stable diffusion.
|
||||
|
||||
> **Please have a look at https://github.com/nateraw/stable-diffusion-videos for more in-detail information on how to create videos using stable diffusion as well as more feature-complete functionality.**
|
||||
|
||||
### Stable Diffusion Mega
|
||||
|
||||
The Stable Diffusion Mega Pipeline lets you use the main use cases of the stable diffusion pipeline in a single class.
|
||||
|
||||
```python
|
||||
#!/usr/bin/env python3
|
||||
from diffusers import DiffusionPipeline
|
||||
import PIL
|
||||
import requests
|
||||
from io import BytesIO
|
||||
import torch
|
||||
|
||||
|
||||
def download_image(url):
|
||||
response = requests.get(url)
|
||||
return PIL.Image.open(BytesIO(response.content)).convert("RGB")
|
||||
|
||||
pipe = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", custom_pipeline="stable_diffusion_mega", torch_dtype=torch.float16, revision="fp16")
|
||||
pipe.to("cuda")
|
||||
pipe.enable_attention_slicing()
|
||||
|
||||
|
||||
### Text-to-Image
|
||||
|
||||
images = pipe.text2img("An astronaut riding a horse").images
|
||||
|
||||
### Image-to-Image
|
||||
|
||||
init_image = download_image("https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg")
|
||||
|
||||
prompt = "A fantasy landscape, trending on artstation"
|
||||
|
||||
images = pipe.img2img(prompt=prompt, init_image=init_image, strength=0.75, guidance_scale=7.5).images
|
||||
|
||||
### Inpainting
|
||||
|
||||
img_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png"
|
||||
mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png"
|
||||
init_image = download_image(img_url).resize((512, 512))
|
||||
mask_image = download_image(mask_url).resize((512, 512))
|
||||
|
||||
prompt = "a cat sitting on a bench"
|
||||
images = pipe.inpaint(prompt=prompt, init_image=init_image, mask_image=mask_image, strength=0.75).images
|
||||
```
|
||||
|
||||
As shown above this one pipeline can run all both "text-to-image", "image-to-image", and "inpainting" in one pipeline.
|
||||
|
||||
### Long Prompt Weighting Stable Diffusion
|
||||
|
||||
The Pipeline lets you input prompt without 77 token length limit. And you can increase words weighting by using "()" or decrease words weighting by using "[]"
|
||||
The Pipeline also lets you use the main use cases of the stable diffusion pipeline in a single class.
|
||||
|
||||
#### pytorch
|
||||
|
||||
```python
|
||||
from diffusers import DiffusionPipeline
|
||||
import torch
|
||||
|
||||
pipe = DiffusionPipeline.from_pretrained(
|
||||
'hakurei/waifu-diffusion',
|
||||
custom_pipeline="lpw_stable_diffusion",
|
||||
revision="fp16",
|
||||
torch_dtype=torch.float16
|
||||
)
|
||||
pipe=pipe.to("cuda")
|
||||
|
||||
prompt = "best_quality (1girl:1.3) bow bride brown_hair closed_mouth frilled_bow frilled_hair_tubes frills (full_body:1.3) fox_ear hair_bow hair_tubes happy hood japanese_clothes kimono long_sleeves red_bow smile solo tabi uchikake white_kimono wide_sleeves cherry_blossoms"
|
||||
neg_prompt = "lowres, bad_anatomy, error_body, error_hair, error_arm, error_hands, bad_hands, error_fingers, bad_fingers, missing_fingers, error_legs, bad_legs, multiple_legs, missing_legs, error_lighting, error_shadow, error_reflection, text, error, extra_digit, fewer_digits, cropped, worst_quality, low_quality, normal_quality, jpeg_artifacts, signature, watermark, username, blurry"
|
||||
|
||||
pipe.text2img(prompt, negative_prompt=neg_prompt, width=512,height=512,max_embeddings_multiples=3).images[0]
|
||||
|
||||
```
|
||||
|
||||
#### onnxruntime
|
||||
|
||||
```python
|
||||
from diffusers import DiffusionPipeline
|
||||
import torch
|
||||
|
||||
pipe = DiffusionPipeline.from_pretrained(
|
||||
'CompVis/stable-diffusion-v1-4',
|
||||
custom_pipeline="lpw_stable_diffusion_onnx",
|
||||
revision="onnx",
|
||||
provider="CUDAExecutionProvider"
|
||||
)
|
||||
|
||||
prompt = "a photo of an astronaut riding a horse on mars, best quality"
|
||||
neg_prompt = "lowres, bad anatomy, error body, error hair, error arm, error hands, bad hands, error fingers, bad fingers, missing fingers, error legs, bad legs, multiple legs, missing legs, error lighting, error shadow, error reflection, text, error, extra digit, fewer digits, cropped, worst quality, low quality, normal quality, jpeg artifacts, signature, watermark, username, blurry"
|
||||
|
||||
pipe.text2img(prompt,negative_prompt=neg_prompt, width=512, height=512, max_embeddings_multiples=3).images[0]
|
||||
|
||||
```
|
||||
|
||||
if you see `Token indices sequence length is longer than the specified maximum sequence length for this model ( *** > 77 ) . Running this sequence through the model will result in indexing errors`. Do not worry, it is normal.
|
||||
|
||||
### Speech to Image
|
||||
|
||||
The following code can generate an image from an audio sample using pre-trained OpenAI whisper-small and Stable Diffusion.
|
||||
|
||||
```Python
|
||||
import torch
|
||||
|
||||
import matplotlib.pyplot as plt
|
||||
from datasets import load_dataset
|
||||
from diffusers import DiffusionPipeline
|
||||
from transformers import (
|
||||
WhisperForConditionalGeneration,
|
||||
WhisperProcessor,
|
||||
)
|
||||
|
||||
|
||||
device = "cuda" if torch.cuda.is_available() else "cpu"
|
||||
|
||||
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
|
||||
|
||||
audio_sample = ds[3]
|
||||
|
||||
text = audio_sample["text"].lower()
|
||||
speech_data = audio_sample["audio"]["array"]
|
||||
|
||||
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small").to(device)
|
||||
processor = WhisperProcessor.from_pretrained("openai/whisper-small")
|
||||
|
||||
diffuser_pipeline = DiffusionPipeline.from_pretrained(
|
||||
"CompVis/stable-diffusion-v1-4",
|
||||
custom_pipeline="speech_to_image_diffusion",
|
||||
speech_model=model,
|
||||
speech_processor=processor,
|
||||
revision="fp16",
|
||||
torch_dtype=torch.float16,
|
||||
)
|
||||
|
||||
diffuser_pipeline.enable_attention_slicing()
|
||||
diffuser_pipeline = diffuser_pipeline.to(device)
|
||||
|
||||
output = diffuser_pipeline(speech_data)
|
||||
plt.imshow(output.images[0])
|
||||
```
|
||||
This example produces the following image:
|
||||
|
||||

|
||||
|
||||
### Wildcard Stable Diffusion
|
||||
Following the great examples from https://github.com/jtkelm2/stable-diffusion-webui-1/blob/master/scripts/wildcards.py and https://github.com/AUTOMATIC1111/stable-diffusion-webui/wiki/Custom-Scripts#wildcards, here's a minimal implementation that allows for users to add "wildcards", denoted by `__wildcard__` to prompts that are used as placeholders for randomly sampled values given by either a dictionary or a `.txt` file. For example:
|
||||
|
||||
Say we have a prompt:
|
||||
|
||||
```
|
||||
prompt = "__animal__ sitting on a __object__ wearing a __clothing__"
|
||||
```
|
||||
|
||||
We can then define possible values to be sampled for `animal`, `object`, and `clothing`. These can either be from a `.txt` with the same name as the category.
|
||||
|
||||
The possible values can also be defined / combined by using a dictionary like: `{"animal":["dog", "cat", mouse"]}`.
|
||||
|
||||
The actual pipeline works just like `StableDiffusionPipeline`, except the `__call__` method takes in:
|
||||
|
||||
`wildcard_files`: list of file paths for wild card replacement
|
||||
`wildcard_option_dict`: dict with key as `wildcard` and values as a list of possible replacements
|
||||
`num_prompt_samples`: number of prompts to sample, uniformly sampling wildcards
|
||||
|
||||
A full example:
|
||||
|
||||
create `animal.txt`, with contents like:
|
||||
|
||||
```
|
||||
dog
|
||||
cat
|
||||
mouse
|
||||
```
|
||||
|
||||
create `object.txt`, with contents like:
|
||||
|
||||
```
|
||||
chair
|
||||
sofa
|
||||
bench
|
||||
```
|
||||
|
||||
```python
|
||||
from diffusers import DiffusionPipeline
|
||||
import torch
|
||||
|
||||
pipe = DiffusionPipeline.from_pretrained(
|
||||
"CompVis/stable-diffusion-v1-4",
|
||||
custom_pipeline="wildcard_stable_diffusion",
|
||||
revision="fp16",
|
||||
torch_dtype=torch.float16,
|
||||
)
|
||||
prompt = "__animal__ sitting on a __object__ wearing a __clothing__"
|
||||
out = pipe(
|
||||
prompt,
|
||||
wildcard_option_dict={
|
||||
"clothing":["hat", "shirt", "scarf", "beret"]
|
||||
},
|
||||
wildcard_files=["object.txt", "animal.txt"],
|
||||
num_prompt_samples=1
|
||||
)
|
||||
```
|
||||
|
||||
|
||||
### Composable Stable diffusion
|
||||
|
||||
```python
|
||||
import torch as th
|
||||
import numpy as np
|
||||
import torchvision.utils as tvu
|
||||
from diffusers import DiffusionPipeline
|
||||
|
||||
has_cuda = th.cuda.is_available()
|
||||
device = th.device('cpu' if not has_cuda else 'cuda')
|
||||
|
||||
pipe = DiffusionPipeline.from_pretrained(
|
||||
"CompVis/stable-diffusion-v1-4",
|
||||
use_auth_token=True,
|
||||
custom_pipeline="composable_stable_diffusion",
|
||||
).to(device)
|
||||
|
||||
|
||||
def dummy(images, **kwargs):
|
||||
return images, False
|
||||
|
||||
pipe.safety_checker = dummy
|
||||
|
||||
images = []
|
||||
generator = th.Generator("cuda").manual_seed(0)
|
||||
|
||||
seed = 0
|
||||
prompt = "a forest | a camel"
|
||||
weights = " 1 | 1" # Equal weight to each prompt. Can be negative
|
||||
|
||||
images = []
|
||||
for i in range(4):
|
||||
res = pipe(
|
||||
prompt,
|
||||
guidance_scale=7.5,
|
||||
num_inference_steps=50,
|
||||
weights=weights,
|
||||
generator=generator)
|
||||
image = res.images[0]
|
||||
images.append(image)
|
||||
|
||||
for i, img in enumerate(images):
|
||||
img.save(f"./composable_diffusion/image_{i}.png")
|
||||
```
|
||||
|
||||
### Imagic Stable Diffusion
|
||||
Allows you to edit an image using stable diffusion.
|
||||
|
||||
```python
|
||||
import requests
|
||||
from PIL import Image
|
||||
from io import BytesIO
|
||||
import torch
|
||||
from diffusers import DiffusionPipeline, DDIMScheduler
|
||||
has_cuda = torch.cuda.is_available()
|
||||
device = torch.device('cpu' if not has_cuda else 'cuda')
|
||||
pipe = DiffusionPipeline.from_pretrained(
|
||||
"CompVis/stable-diffusion-v1-4",
|
||||
safety_checker=None,
|
||||
use_auth_token=True,
|
||||
custom_pipeline="imagic_stable_diffusion",
|
||||
scheduler = DDIMScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", clip_sample=False, set_alpha_to_one=False)
|
||||
).to(device)
|
||||
generator = th.Generator("cuda").manual_seed(0)
|
||||
seed = 0
|
||||
prompt = "A photo of Barack Obama smiling with a big grin"
|
||||
url = 'https://www.dropbox.com/s/6tlwzr73jd1r9yk/obama.png?dl=1'
|
||||
response = requests.get(url)
|
||||
init_image = Image.open(BytesIO(response.content)).convert("RGB")
|
||||
init_image = init_image.resize((512, 512))
|
||||
res = pipe.train(
|
||||
prompt,
|
||||
init_image,
|
||||
guidance_scale=7.5,
|
||||
num_inference_steps=50,
|
||||
generator=generator)
|
||||
res = pipe(alpha=1)
|
||||
image = res.images[0]
|
||||
image.save('./imagic/imagic_image_alpha_1.png')
|
||||
res = pipe(alpha=1.5)
|
||||
image = res.images[0]
|
||||
image.save('./imagic/imagic_image_alpha_1_5.png')
|
||||
res = pipe(alpha=2)
|
||||
image = res.images[0]
|
||||
image.save('./imagic/imagic_image_alpha_2.png')
|
||||
```
|
||||
|
||||
### Seed Resizing
|
||||
Test seed resizing. Originally generate an image in 512 by 512, then generate image with same seed at 512 by 592 using seed resizing. Finally, generate 512 by 592 using original stable diffusion pipeline.
|
||||
|
||||
```python
|
||||
import torch as th
|
||||
import numpy as np
|
||||
from diffusers import DiffusionPipeline
|
||||
|
||||
has_cuda = th.cuda.is_available()
|
||||
device = th.device('cpu' if not has_cuda else 'cuda')
|
||||
|
||||
pipe = DiffusionPipeline.from_pretrained(
|
||||
"CompVis/stable-diffusion-v1-4",
|
||||
use_auth_token=True,
|
||||
custom_pipeline="seed_resize_stable_diffusion"
|
||||
).to(device)
|
||||
|
||||
def dummy(images, **kwargs):
|
||||
return images, False
|
||||
|
||||
pipe.safety_checker = dummy
|
||||
|
||||
|
||||
images = []
|
||||
th.manual_seed(0)
|
||||
generator = th.Generator("cuda").manual_seed(0)
|
||||
|
||||
seed = 0
|
||||
prompt = "A painting of a futuristic cop"
|
||||
|
||||
width = 512
|
||||
height = 512
|
||||
|
||||
res = pipe(
|
||||
prompt,
|
||||
guidance_scale=7.5,
|
||||
num_inference_steps=50,
|
||||
height=height,
|
||||
width=width,
|
||||
generator=generator)
|
||||
image = res.images[0]
|
||||
image.save('./seed_resize/seed_resize_{w}_{h}_image.png'.format(w=width, h=height))
|
||||
|
||||
|
||||
th.manual_seed(0)
|
||||
generator = th.Generator("cuda").manual_seed(0)
|
||||
|
||||
pipe = DiffusionPipeline.from_pretrained(
|
||||
"CompVis/stable-diffusion-v1-4",
|
||||
use_auth_token=True,
|
||||
custom_pipeline="/home/mark/open_source/diffusers/examples/community/"
|
||||
).to(device)
|
||||
|
||||
width = 512
|
||||
height = 592
|
||||
|
||||
res = pipe(
|
||||
prompt,
|
||||
guidance_scale=7.5,
|
||||
num_inference_steps=50,
|
||||
height=height,
|
||||
width=width,
|
||||
generator=generator)
|
||||
image = res.images[0]
|
||||
image.save('./seed_resize/seed_resize_{w}_{h}_image.png'.format(w=width, h=height))
|
||||
|
||||
pipe_compare = DiffusionPipeline.from_pretrained(
|
||||
"CompVis/stable-diffusion-v1-4",
|
||||
use_auth_token=True,
|
||||
custom_pipeline="/home/mark/open_source/diffusers/examples/community/"
|
||||
).to(device)
|
||||
|
||||
res = pipe_compare(
|
||||
prompt,
|
||||
guidance_scale=7.5,
|
||||
num_inference_steps=50,
|
||||
height=height,
|
||||
width=width,
|
||||
generator=generator
|
||||
)
|
||||
|
||||
image = res.images[0]
|
||||
image.save('./seed_resize/seed_resize_{w}_{h}_image_compare.png'.format(w=width, h=height))
|
||||
```
|
||||
| Example | Description | Author | Colab |
|
||||
|:----------|:----------------------|:-----------------|----------:|
|
||||
| CLIP Guided Stable Diffusion | Doing CLIP guidance for text to image generation with Stable Diffusion| [Suraj Patil](https://github.com/patil-suraj/) | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/CLIP_Guided_Stable_diffusion_with_diffusers.ipynb) |
|
||||
|
||||
@@ -249,7 +249,7 @@ class CLIPGuidedStableDiffusion(DiffusionPipeline):
|
||||
latents_dtype = text_embeddings.dtype
|
||||
if latents is None:
|
||||
if self.device.type == "mps":
|
||||
# randn does not work reproducibly on mps
|
||||
# randn does not exist on mps
|
||||
latents = torch.randn(latents_shape, generator=generator, device="cpu", dtype=latents_dtype).to(
|
||||
self.device
|
||||
)
|
||||
|
||||
@@ -1,329 +0,0 @@
|
||||
"""
|
||||
modified based on diffusion library from Huggingface: https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
|
||||
"""
|
||||
import inspect
|
||||
import warnings
|
||||
from typing import List, Optional, Union
|
||||
|
||||
import torch
|
||||
|
||||
from diffusers.models import AutoencoderKL, UNet2DConditionModel
|
||||
from diffusers.pipeline_utils import DiffusionPipeline
|
||||
from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
|
||||
from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
|
||||
from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
|
||||
from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
|
||||
|
||||
|
||||
class ComposableStableDiffusionPipeline(DiffusionPipeline):
|
||||
r"""
|
||||
Pipeline for text-to-image generation using Stable Diffusion.
|
||||
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
|
||||
library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
|
||||
Args:
|
||||
vae ([`AutoencoderKL`]):
|
||||
Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
|
||||
text_encoder ([`CLIPTextModel`]):
|
||||
Frozen text-encoder. Stable Diffusion uses the text portion of
|
||||
[CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
|
||||
the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
|
||||
tokenizer (`CLIPTokenizer`):
|
||||
Tokenizer of class
|
||||
[CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
|
||||
unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
|
||||
scheduler ([`SchedulerMixin`]):
|
||||
A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of
|
||||
[`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
|
||||
safety_checker ([`StableDiffusionSafetyChecker`]):
|
||||
Classification module that estimates whether generated images could be considered offsensive or harmful.
|
||||
Please, refer to the [model card](https://huggingface.co/CompVis/stable-diffusion-v1-4) for details.
|
||||
feature_extractor ([`CLIPFeatureExtractor`]):
|
||||
Model that extracts features from generated images to be used as inputs for the `safety_checker`.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vae: AutoencoderKL,
|
||||
text_encoder: CLIPTextModel,
|
||||
tokenizer: CLIPTokenizer,
|
||||
unet: UNet2DConditionModel,
|
||||
scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
|
||||
safety_checker: StableDiffusionSafetyChecker,
|
||||
feature_extractor: CLIPFeatureExtractor,
|
||||
):
|
||||
super().__init__()
|
||||
self.register_modules(
|
||||
vae=vae,
|
||||
text_encoder=text_encoder,
|
||||
tokenizer=tokenizer,
|
||||
unet=unet,
|
||||
scheduler=scheduler,
|
||||
safety_checker=safety_checker,
|
||||
feature_extractor=feature_extractor,
|
||||
)
|
||||
|
||||
def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
|
||||
r"""
|
||||
Enable sliced attention computation.
|
||||
When this option is enabled, the attention module will split the input tensor in slices, to compute attention
|
||||
in several steps. This is useful to save some memory in exchange for a small speed decrease.
|
||||
Args:
|
||||
slice_size (`str` or `int`, *optional*, defaults to `"auto"`):
|
||||
When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If
|
||||
a number is provided, uses as many slices as `attention_head_dim // slice_size`. In this case,
|
||||
`attention_head_dim` must be a multiple of `slice_size`.
|
||||
"""
|
||||
if slice_size == "auto":
|
||||
# half the attention head size is usually a good trade-off between
|
||||
# speed and memory
|
||||
slice_size = self.unet.config.attention_head_dim // 2
|
||||
self.unet.set_attention_slice(slice_size)
|
||||
|
||||
def disable_attention_slicing(self):
|
||||
r"""
|
||||
Disable sliced attention computation. If `enable_attention_slicing` was previously invoked, this method will go
|
||||
back to computing attention in one step.
|
||||
"""
|
||||
# set slice_size = `None` to disable `attention slicing`
|
||||
self.enable_attention_slicing(None)
|
||||
|
||||
@torch.no_grad()
|
||||
def __call__(
|
||||
self,
|
||||
prompt: Union[str, List[str]],
|
||||
height: Optional[int] = 512,
|
||||
width: Optional[int] = 512,
|
||||
num_inference_steps: Optional[int] = 50,
|
||||
guidance_scale: Optional[float] = 7.5,
|
||||
eta: Optional[float] = 0.0,
|
||||
generator: Optional[torch.Generator] = None,
|
||||
latents: Optional[torch.FloatTensor] = None,
|
||||
output_type: Optional[str] = "pil",
|
||||
return_dict: bool = True,
|
||||
weights: Optional[str] = "",
|
||||
**kwargs,
|
||||
):
|
||||
r"""
|
||||
Function invoked when calling the pipeline for generation.
|
||||
Args:
|
||||
prompt (`str` or `List[str]`):
|
||||
The prompt or prompts to guide the image generation.
|
||||
height (`int`, *optional*, defaults to 512):
|
||||
The height in pixels of the generated image.
|
||||
width (`int`, *optional*, defaults to 512):
|
||||
The width in pixels of the generated image.
|
||||
num_inference_steps (`int`, *optional*, defaults to 50):
|
||||
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
|
||||
expense of slower inference.
|
||||
guidance_scale (`float`, *optional*, defaults to 7.5):
|
||||
Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
|
||||
`guidance_scale` is defined as `w` of equation 2. of [Imagen
|
||||
Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
|
||||
1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
|
||||
usually at the expense of lower image quality.
|
||||
eta (`float`, *optional*, defaults to 0.0):
|
||||
Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
|
||||
[`schedulers.DDIMScheduler`], will be ignored for others.
|
||||
generator (`torch.Generator`, *optional*):
|
||||
A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
|
||||
deterministic.
|
||||
latents (`torch.FloatTensor`, *optional*):
|
||||
Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
|
||||
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
|
||||
tensor will ge generated by sampling using the supplied random `generator`.
|
||||
output_type (`str`, *optional*, defaults to `"pil"`):
|
||||
The output format of the generate image. Choose between
|
||||
[PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
|
||||
return_dict (`bool`, *optional*, defaults to `True`):
|
||||
Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
|
||||
plain tuple.
|
||||
Returns:
|
||||
[`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
|
||||
[`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
|
||||
When returning a tuple, the first element is a list with the generated images, and the second element is a
|
||||
list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
|
||||
(nsfw) content, according to the `safety_checker`.
|
||||
"""
|
||||
|
||||
if "torch_device" in kwargs:
|
||||
device = kwargs.pop("torch_device")
|
||||
warnings.warn(
|
||||
"`torch_device` is deprecated as an input argument to `__call__` and will be removed in v0.3.0."
|
||||
" Consider using `pipe.to(torch_device)` instead."
|
||||
)
|
||||
|
||||
# Set device as before (to be removed in 0.3.0)
|
||||
if device is None:
|
||||
device = "cuda" if torch.cuda.is_available() else "cpu"
|
||||
self.to(device)
|
||||
|
||||
if isinstance(prompt, str):
|
||||
batch_size = 1
|
||||
elif isinstance(prompt, list):
|
||||
batch_size = len(prompt)
|
||||
else:
|
||||
raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
|
||||
|
||||
if height % 8 != 0 or width % 8 != 0:
|
||||
raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
|
||||
|
||||
if "|" in prompt:
|
||||
prompt = [x.strip() for x in prompt.split("|")]
|
||||
print(f"composing {prompt}...")
|
||||
|
||||
# get prompt text embeddings
|
||||
text_input = self.tokenizer(
|
||||
prompt,
|
||||
padding="max_length",
|
||||
max_length=self.tokenizer.model_max_length,
|
||||
truncation=True,
|
||||
return_tensors="pt",
|
||||
)
|
||||
text_embeddings = self.text_encoder(text_input.input_ids.to(self.device))[0]
|
||||
|
||||
if not weights:
|
||||
# specify weights for prompts (excluding the unconditional score)
|
||||
print("using equal weights for all prompts...")
|
||||
pos_weights = torch.tensor(
|
||||
[1 / (text_embeddings.shape[0] - 1)] * (text_embeddings.shape[0] - 1), device=self.device
|
||||
).reshape(-1, 1, 1, 1)
|
||||
neg_weights = torch.tensor([1.0], device=self.device).reshape(-1, 1, 1, 1)
|
||||
mask = torch.tensor([False] + [True] * pos_weights.shape[0], dtype=torch.bool)
|
||||
else:
|
||||
# set prompt weight for each
|
||||
num_prompts = len(prompt) if isinstance(prompt, list) else 1
|
||||
weights = [float(w.strip()) for w in weights.split("|")]
|
||||
if len(weights) < num_prompts:
|
||||
weights.append(1.0)
|
||||
weights = torch.tensor(weights, device=self.device)
|
||||
assert len(weights) == text_embeddings.shape[0], "weights specified are not equal to the number of prompts"
|
||||
pos_weights = []
|
||||
neg_weights = []
|
||||
mask = [] # first one is unconditional score
|
||||
for w in weights:
|
||||
if w > 0:
|
||||
pos_weights.append(w)
|
||||
mask.append(True)
|
||||
else:
|
||||
neg_weights.append(abs(w))
|
||||
mask.append(False)
|
||||
# normalize the weights
|
||||
pos_weights = torch.tensor(pos_weights, device=self.device).reshape(-1, 1, 1, 1)
|
||||
pos_weights = pos_weights / pos_weights.sum()
|
||||
neg_weights = torch.tensor(neg_weights, device=self.device).reshape(-1, 1, 1, 1)
|
||||
neg_weights = neg_weights / neg_weights.sum()
|
||||
mask = torch.tensor(mask, device=self.device, dtype=torch.bool)
|
||||
|
||||
# here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
|
||||
# of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
|
||||
# corresponds to doing no classifier free guidance.
|
||||
do_classifier_free_guidance = guidance_scale > 1.0
|
||||
# get unconditional embeddings for classifier free guidance
|
||||
if do_classifier_free_guidance:
|
||||
max_length = text_input.input_ids.shape[-1]
|
||||
|
||||
if torch.all(mask):
|
||||
# no negative prompts, so we use empty string as the negative prompt
|
||||
uncond_input = self.tokenizer(
|
||||
[""] * batch_size, padding="max_length", max_length=max_length, return_tensors="pt"
|
||||
)
|
||||
uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(self.device))[0]
|
||||
|
||||
# For classifier free guidance, we need to do two forward passes.
|
||||
# Here we concatenate the unconditional and text embeddings into a single batch
|
||||
# to avoid doing two forward passes
|
||||
text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
|
||||
|
||||
# update negative weights
|
||||
neg_weights = torch.tensor([1.0], device=self.device)
|
||||
mask = torch.tensor([False] + mask.detach().tolist(), device=self.device, dtype=torch.bool)
|
||||
|
||||
# get the initial random noise unless the user supplied it
|
||||
|
||||
# Unlike in other pipelines, latents need to be generated in the target device
|
||||
# for 1-to-1 results reproducibility with the CompVis implementation.
|
||||
# However this currently doesn't work in `mps`.
|
||||
latents_device = "cpu" if self.device.type == "mps" else self.device
|
||||
latents_shape = (batch_size, self.unet.in_channels, height // 8, width // 8)
|
||||
if latents is None:
|
||||
latents = torch.randn(
|
||||
latents_shape,
|
||||
generator=generator,
|
||||
device=latents_device,
|
||||
)
|
||||
else:
|
||||
if latents.shape != latents_shape:
|
||||
raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}")
|
||||
latents = latents.to(self.device)
|
||||
|
||||
# set timesteps
|
||||
accepts_offset = "offset" in set(inspect.signature(self.scheduler.set_timesteps).parameters.keys())
|
||||
extra_set_kwargs = {}
|
||||
if accepts_offset:
|
||||
extra_set_kwargs["offset"] = 1
|
||||
|
||||
self.scheduler.set_timesteps(num_inference_steps, **extra_set_kwargs)
|
||||
|
||||
# if we use LMSDiscreteScheduler, let's make sure latents are multiplied by sigmas
|
||||
if isinstance(self.scheduler, LMSDiscreteScheduler):
|
||||
latents = latents * self.scheduler.sigmas[0]
|
||||
|
||||
# prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
|
||||
# eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
|
||||
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
|
||||
# and should be between [0, 1]
|
||||
accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
|
||||
extra_step_kwargs = {}
|
||||
if accepts_eta:
|
||||
extra_step_kwargs["eta"] = eta
|
||||
|
||||
for i, t in enumerate(self.progress_bar(self.scheduler.timesteps)):
|
||||
# expand the latents if we are doing classifier free guidance
|
||||
latent_model_input = (
|
||||
torch.cat([latents] * text_embeddings.shape[0]) if do_classifier_free_guidance else latents
|
||||
)
|
||||
if isinstance(self.scheduler, LMSDiscreteScheduler):
|
||||
sigma = self.scheduler.sigmas[i]
|
||||
# the model input needs to be scaled to match the continuous ODE formulation in K-LMS
|
||||
latent_model_input = latent_model_input / ((sigma**2 + 1) ** 0.5)
|
||||
|
||||
# reduce memory by predicting each score sequentially
|
||||
noise_preds = []
|
||||
# predict the noise residual
|
||||
for latent_in, text_embedding_in in zip(
|
||||
torch.chunk(latent_model_input, chunks=latent_model_input.shape[0], dim=0),
|
||||
torch.chunk(text_embeddings, chunks=text_embeddings.shape[0], dim=0),
|
||||
):
|
||||
noise_preds.append(self.unet(latent_in, t, encoder_hidden_states=text_embedding_in).sample)
|
||||
noise_preds = torch.cat(noise_preds, dim=0)
|
||||
|
||||
# perform guidance
|
||||
if do_classifier_free_guidance:
|
||||
noise_pred_uncond = (noise_preds[~mask] * neg_weights).sum(dim=0, keepdims=True)
|
||||
noise_pred_text = (noise_preds[mask] * pos_weights).sum(dim=0, keepdims=True)
|
||||
noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
|
||||
|
||||
# compute the previous noisy sample x_t -> x_t-1
|
||||
if isinstance(self.scheduler, LMSDiscreteScheduler):
|
||||
latents = self.scheduler.step(noise_pred, i, latents, **extra_step_kwargs).prev_sample
|
||||
else:
|
||||
latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
|
||||
|
||||
# scale and decode the image latents with vae
|
||||
latents = 1 / 0.18215 * latents
|
||||
image = self.vae.decode(latents).sample
|
||||
|
||||
image = (image / 2 + 0.5).clamp(0, 1)
|
||||
image = image.cpu().permute(0, 2, 3, 1).numpy()
|
||||
|
||||
# run safety checker
|
||||
safety_cheker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(self.device)
|
||||
image, has_nsfw_concept = self.safety_checker(images=image, clip_input=safety_cheker_input.pixel_values)
|
||||
|
||||
if output_type == "pil":
|
||||
image = self.numpy_to_pil(image)
|
||||
|
||||
if not return_dict:
|
||||
return (image, has_nsfw_concept)
|
||||
|
||||
return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
|
||||
@@ -1,476 +0,0 @@
|
||||
"""
|
||||
modeled after the textual_inversion.py / train_dreambooth.py and the work
|
||||
of justinpinkney here: https://github.com/justinpinkney/stable-diffusion/blob/main/notebooks/imagic.ipynb
|
||||
"""
|
||||
import inspect
|
||||
import warnings
|
||||
from typing import List, Optional, Union
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
|
||||
import PIL
|
||||
from accelerate import Accelerator
|
||||
from diffusers.models import AutoencoderKL, UNet2DConditionModel
|
||||
from diffusers.pipeline_utils import DiffusionPipeline
|
||||
from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
|
||||
from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
|
||||
from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
|
||||
from diffusers.utils import logging
|
||||
from tqdm.auto import tqdm
|
||||
from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
|
||||
|
||||
|
||||
def preprocess(image):
|
||||
w, h = image.size
|
||||
w, h = map(lambda x: x - x % 32, (w, h)) # resize to integer multiple of 32
|
||||
image = image.resize((w, h), resample=PIL.Image.LANCZOS)
|
||||
image = np.array(image).astype(np.float32) / 255.0
|
||||
image = image[None].transpose(0, 3, 1, 2)
|
||||
image = torch.from_numpy(image)
|
||||
return 2.0 * image - 1.0
|
||||
|
||||
|
||||
class ImagicStableDiffusionPipeline(DiffusionPipeline):
|
||||
r"""
|
||||
Pipeline for imagic image editing.
|
||||
See paper here: https://arxiv.org/pdf/2210.09276.pdf
|
||||
|
||||
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
|
||||
library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
|
||||
Args:
|
||||
vae ([`AutoencoderKL`]):
|
||||
Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
|
||||
text_encoder ([`CLIPTextModel`]):
|
||||
Frozen text-encoder. Stable Diffusion uses the text portion of
|
||||
[CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
|
||||
the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
|
||||
tokenizer (`CLIPTokenizer`):
|
||||
Tokenizer of class
|
||||
[CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
|
||||
unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
|
||||
scheduler ([`SchedulerMixin`]):
|
||||
A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of
|
||||
[`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
|
||||
safety_checker ([`StableDiffusionSafetyChecker`]):
|
||||
Classification module that estimates whether generated images could be considered offsensive or harmful.
|
||||
Please, refer to the [model card](https://huggingface.co/CompVis/stable-diffusion-v1-4) for details.
|
||||
feature_extractor ([`CLIPFeatureExtractor`]):
|
||||
Model that extracts features from generated images to be used as inputs for the `safety_checker`.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vae: AutoencoderKL,
|
||||
text_encoder: CLIPTextModel,
|
||||
tokenizer: CLIPTokenizer,
|
||||
unet: UNet2DConditionModel,
|
||||
scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
|
||||
safety_checker: StableDiffusionSafetyChecker,
|
||||
feature_extractor: CLIPFeatureExtractor,
|
||||
):
|
||||
super().__init__()
|
||||
self.register_modules(
|
||||
vae=vae,
|
||||
text_encoder=text_encoder,
|
||||
tokenizer=tokenizer,
|
||||
unet=unet,
|
||||
scheduler=scheduler,
|
||||
safety_checker=safety_checker,
|
||||
feature_extractor=feature_extractor,
|
||||
)
|
||||
|
||||
def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
|
||||
r"""
|
||||
Enable sliced attention computation.
|
||||
When this option is enabled, the attention module will split the input tensor in slices, to compute attention
|
||||
in several steps. This is useful to save some memory in exchange for a small speed decrease.
|
||||
Args:
|
||||
slice_size (`str` or `int`, *optional*, defaults to `"auto"`):
|
||||
When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If
|
||||
a number is provided, uses as many slices as `attention_head_dim // slice_size`. In this case,
|
||||
`attention_head_dim` must be a multiple of `slice_size`.
|
||||
"""
|
||||
if slice_size == "auto":
|
||||
# half the attention head size is usually a good trade-off between
|
||||
# speed and memory
|
||||
slice_size = self.unet.config.attention_head_dim // 2
|
||||
self.unet.set_attention_slice(slice_size)
|
||||
|
||||
def disable_attention_slicing(self):
|
||||
r"""
|
||||
Disable sliced attention computation. If `enable_attention_slicing` was previously invoked, this method will go
|
||||
back to computing attention in one step.
|
||||
"""
|
||||
# set slice_size = `None` to disable `attention slicing`
|
||||
self.enable_attention_slicing(None)
|
||||
|
||||
def train(
|
||||
self,
|
||||
prompt: Union[str, List[str]],
|
||||
init_image: Union[torch.FloatTensor, PIL.Image.Image],
|
||||
height: Optional[int] = 512,
|
||||
width: Optional[int] = 512,
|
||||
generator: Optional[torch.Generator] = None,
|
||||
embedding_learning_rate: float = 0.001,
|
||||
diffusion_model_learning_rate: float = 2e-6,
|
||||
text_embedding_optimization_steps: int = 500,
|
||||
model_fine_tuning_optimization_steps: int = 1000,
|
||||
**kwargs,
|
||||
):
|
||||
r"""
|
||||
Function invoked when calling the pipeline for generation.
|
||||
Args:
|
||||
prompt (`str` or `List[str]`):
|
||||
The prompt or prompts to guide the image generation.
|
||||
height (`int`, *optional*, defaults to 512):
|
||||
The height in pixels of the generated image.
|
||||
width (`int`, *optional*, defaults to 512):
|
||||
The width in pixels of the generated image.
|
||||
num_inference_steps (`int`, *optional*, defaults to 50):
|
||||
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
|
||||
expense of slower inference.
|
||||
guidance_scale (`float`, *optional*, defaults to 7.5):
|
||||
Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
|
||||
`guidance_scale` is defined as `w` of equation 2. of [Imagen
|
||||
Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
|
||||
1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
|
||||
usually at the expense of lower image quality.
|
||||
eta (`float`, *optional*, defaults to 0.0):
|
||||
Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
|
||||
[`schedulers.DDIMScheduler`], will be ignored for others.
|
||||
generator (`torch.Generator`, *optional*):
|
||||
A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
|
||||
deterministic.
|
||||
latents (`torch.FloatTensor`, *optional*):
|
||||
Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
|
||||
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
|
||||
tensor will ge generated by sampling using the supplied random `generator`.
|
||||
output_type (`str`, *optional*, defaults to `"pil"`):
|
||||
The output format of the generate image. Choose between
|
||||
[PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `nd.array`.
|
||||
return_dict (`bool`, *optional*, defaults to `True`):
|
||||
Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
|
||||
plain tuple.
|
||||
Returns:
|
||||
[`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
|
||||
[`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
|
||||
When returning a tuple, the first element is a list with the generated images, and the second element is a
|
||||
list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
|
||||
(nsfw) content, according to the `safety_checker`.
|
||||
"""
|
||||
accelerator = Accelerator(
|
||||
gradient_accumulation_steps=1,
|
||||
mixed_precision="fp16",
|
||||
)
|
||||
|
||||
if "torch_device" in kwargs:
|
||||
device = kwargs.pop("torch_device")
|
||||
warnings.warn(
|
||||
"`torch_device` is deprecated as an input argument to `__call__` and will be removed in v0.3.0."
|
||||
" Consider using `pipe.to(torch_device)` instead."
|
||||
)
|
||||
|
||||
if device is None:
|
||||
device = "cuda" if torch.cuda.is_available() else "cpu"
|
||||
self.to(device)
|
||||
|
||||
if height % 8 != 0 or width % 8 != 0:
|
||||
raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
|
||||
|
||||
# Freeze vae and unet
|
||||
self.vae.requires_grad_(False)
|
||||
self.unet.requires_grad_(False)
|
||||
self.text_encoder.requires_grad_(False)
|
||||
self.unet.eval()
|
||||
self.vae.eval()
|
||||
self.text_encoder.eval()
|
||||
|
||||
if accelerator.is_main_process:
|
||||
accelerator.init_trackers(
|
||||
"imagic",
|
||||
config={
|
||||
"embedding_learning_rate": embedding_learning_rate,
|
||||
"text_embedding_optimization_steps": text_embedding_optimization_steps,
|
||||
},
|
||||
)
|
||||
|
||||
# get text embeddings for prompt
|
||||
text_input = self.tokenizer(
|
||||
prompt,
|
||||
padding="max_length",
|
||||
max_length=self.tokenizer.model_max_length,
|
||||
truncaton=True,
|
||||
return_tensors="pt",
|
||||
)
|
||||
text_embeddings = torch.nn.Parameter(
|
||||
self.text_encoder(text_input.input_ids.to(self.device))[0], requires_grad=True
|
||||
)
|
||||
text_embeddings = text_embeddings.detach()
|
||||
text_embeddings.requires_grad_()
|
||||
text_embeddings_orig = text_embeddings.clone()
|
||||
|
||||
# Initialize the optimizer
|
||||
optimizer = torch.optim.Adam(
|
||||
[text_embeddings], # only optimize the embeddings
|
||||
lr=embedding_learning_rate,
|
||||
)
|
||||
|
||||
if isinstance(init_image, PIL.Image.Image):
|
||||
init_image = preprocess(init_image)
|
||||
|
||||
latents_dtype = text_embeddings.dtype
|
||||
init_image = init_image.to(device=self.device, dtype=latents_dtype)
|
||||
init_latent_image_dist = self.vae.encode(init_image).latent_dist
|
||||
init_image_latents = init_latent_image_dist.sample(generator=generator)
|
||||
init_image_latents = 0.18215 * init_image_latents
|
||||
|
||||
progress_bar = tqdm(range(text_embedding_optimization_steps), disable=not accelerator.is_local_main_process)
|
||||
progress_bar.set_description("Steps")
|
||||
|
||||
global_step = 0
|
||||
|
||||
logger.info("First optimizing the text embedding to better reconstruct the init image")
|
||||
for _ in range(text_embedding_optimization_steps):
|
||||
with accelerator.accumulate(text_embeddings):
|
||||
# Sample noise that we'll add to the latents
|
||||
noise = torch.randn(init_image_latents.shape).to(init_image_latents.device)
|
||||
timesteps = torch.randint(1000, (1,), device=init_image_latents.device)
|
||||
|
||||
# Add noise to the latents according to the noise magnitude at each timestep
|
||||
# (this is the forward diffusion process)
|
||||
noisy_latents = self.scheduler.add_noise(init_image_latents, noise, timesteps)
|
||||
|
||||
# Predict the noise residual
|
||||
noise_pred = self.unet(noisy_latents, timesteps, text_embeddings).sample
|
||||
|
||||
loss = F.mse_loss(noise_pred, noise, reduction="none").mean([1, 2, 3]).mean()
|
||||
accelerator.backward(loss)
|
||||
|
||||
optimizer.step()
|
||||
optimizer.zero_grad()
|
||||
|
||||
# Checks if the accelerator has performed an optimization step behind the scenes
|
||||
if accelerator.sync_gradients:
|
||||
progress_bar.update(1)
|
||||
global_step += 1
|
||||
|
||||
logs = {"loss": loss.detach().item()} # , "lr": lr_scheduler.get_last_lr()[0]}
|
||||
progress_bar.set_postfix(**logs)
|
||||
accelerator.log(logs, step=global_step)
|
||||
|
||||
accelerator.wait_for_everyone()
|
||||
|
||||
text_embeddings.requires_grad_(False)
|
||||
|
||||
# Now we fine tune the unet to better reconstruct the image
|
||||
self.unet.requires_grad_(True)
|
||||
self.unet.train()
|
||||
optimizer = torch.optim.Adam(
|
||||
self.unet.parameters(), # only optimize unet
|
||||
lr=diffusion_model_learning_rate,
|
||||
)
|
||||
progress_bar = tqdm(range(model_fine_tuning_optimization_steps), disable=not accelerator.is_local_main_process)
|
||||
|
||||
logger.info("Next fine tuning the entire model to better reconstruct the init image")
|
||||
for _ in range(model_fine_tuning_optimization_steps):
|
||||
with accelerator.accumulate(self.unet.parameters()):
|
||||
# Sample noise that we'll add to the latents
|
||||
noise = torch.randn(init_image_latents.shape).to(init_image_latents.device)
|
||||
timesteps = torch.randint(1000, (1,), device=init_image_latents.device)
|
||||
|
||||
# Add noise to the latents according to the noise magnitude at each timestep
|
||||
# (this is the forward diffusion process)
|
||||
noisy_latents = self.scheduler.add_noise(init_image_latents, noise, timesteps)
|
||||
|
||||
# Predict the noise residual
|
||||
noise_pred = self.unet(noisy_latents, timesteps, text_embeddings).sample
|
||||
|
||||
loss = F.mse_loss(noise_pred, noise, reduction="none").mean([1, 2, 3]).mean()
|
||||
accelerator.backward(loss)
|
||||
|
||||
optimizer.step()
|
||||
optimizer.zero_grad()
|
||||
|
||||
# Checks if the accelerator has performed an optimization step behind the scenes
|
||||
if accelerator.sync_gradients:
|
||||
progress_bar.update(1)
|
||||
global_step += 1
|
||||
|
||||
logs = {"loss": loss.detach().item()} # , "lr": lr_scheduler.get_last_lr()[0]}
|
||||
progress_bar.set_postfix(**logs)
|
||||
accelerator.log(logs, step=global_step)
|
||||
|
||||
accelerator.wait_for_everyone()
|
||||
self.text_embeddings_orig = text_embeddings_orig
|
||||
self.text_embeddings = text_embeddings
|
||||
|
||||
@torch.no_grad()
|
||||
def __call__(
|
||||
self,
|
||||
alpha: float = 1.2,
|
||||
height: Optional[int] = 512,
|
||||
width: Optional[int] = 512,
|
||||
num_inference_steps: Optional[int] = 50,
|
||||
generator: Optional[torch.Generator] = None,
|
||||
output_type: Optional[str] = "pil",
|
||||
return_dict: bool = True,
|
||||
guidance_scale: float = 7.5,
|
||||
eta: float = 0.0,
|
||||
**kwargs,
|
||||
):
|
||||
r"""
|
||||
Function invoked when calling the pipeline for generation.
|
||||
Args:
|
||||
prompt (`str` or `List[str]`):
|
||||
The prompt or prompts to guide the image generation.
|
||||
height (`int`, *optional*, defaults to 512):
|
||||
The height in pixels of the generated image.
|
||||
width (`int`, *optional*, defaults to 512):
|
||||
The width in pixels of the generated image.
|
||||
num_inference_steps (`int`, *optional*, defaults to 50):
|
||||
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
|
||||
expense of slower inference.
|
||||
guidance_scale (`float`, *optional*, defaults to 7.5):
|
||||
Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
|
||||
`guidance_scale` is defined as `w` of equation 2. of [Imagen
|
||||
Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
|
||||
1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
|
||||
usually at the expense of lower image quality.
|
||||
eta (`float`, *optional*, defaults to 0.0):
|
||||
Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
|
||||
[`schedulers.DDIMScheduler`], will be ignored for others.
|
||||
generator (`torch.Generator`, *optional*):
|
||||
A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
|
||||
deterministic.
|
||||
latents (`torch.FloatTensor`, *optional*):
|
||||
Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
|
||||
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
|
||||
tensor will ge generated by sampling using the supplied random `generator`.
|
||||
output_type (`str`, *optional*, defaults to `"pil"`):
|
||||
The output format of the generate image. Choose between
|
||||
[PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `nd.array`.
|
||||
return_dict (`bool`, *optional*, defaults to `True`):
|
||||
Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
|
||||
plain tuple.
|
||||
Returns:
|
||||
[`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
|
||||
[`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
|
||||
When returning a tuple, the first element is a list with the generated images, and the second element is a
|
||||
list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
|
||||
(nsfw) content, according to the `safety_checker`.
|
||||
"""
|
||||
if height % 8 != 0 or width % 8 != 0:
|
||||
raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
|
||||
if self.text_embeddings is None:
|
||||
raise ValueError("Please run the pipe.train() before trying to generate an image.")
|
||||
if self.text_embeddings_orig is None:
|
||||
raise ValueError("Please run the pipe.train() before trying to generate an image.")
|
||||
|
||||
text_embeddings = alpha * self.text_embeddings_orig + (1 - alpha) * self.text_embeddings
|
||||
|
||||
# here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
|
||||
# of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
|
||||
# corresponds to doing no classifier free guidance.
|
||||
do_classifier_free_guidance = guidance_scale > 1.0
|
||||
# get unconditional embeddings for classifier free guidance
|
||||
if do_classifier_free_guidance:
|
||||
uncond_tokens = [""]
|
||||
max_length = self.tokenizer.model_max_length
|
||||
uncond_input = self.tokenizer(
|
||||
uncond_tokens,
|
||||
padding="max_length",
|
||||
max_length=max_length,
|
||||
truncation=True,
|
||||
return_tensors="pt",
|
||||
)
|
||||
uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(self.device))[0]
|
||||
|
||||
# duplicate unconditional embeddings for each generation per prompt, using mps friendly method
|
||||
seq_len = uncond_embeddings.shape[1]
|
||||
uncond_embeddings = uncond_embeddings.view(1, seq_len, -1)
|
||||
|
||||
# For classifier free guidance, we need to do two forward passes.
|
||||
# Here we concatenate the unconditional and text embeddings into a single batch
|
||||
# to avoid doing two forward passes
|
||||
text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
|
||||
|
||||
# get the initial random noise unless the user supplied it
|
||||
|
||||
# Unlike in other pipelines, latents need to be generated in the target device
|
||||
# for 1-to-1 results reproducibility with the CompVis implementation.
|
||||
# However this currently doesn't work in `mps`.
|
||||
latents_shape = (1, self.unet.in_channels, height // 8, width // 8)
|
||||
latents_dtype = text_embeddings.dtype
|
||||
if self.device.type == "mps":
|
||||
# randn does not exist on mps
|
||||
latents = torch.randn(latents_shape, generator=generator, device="cpu", dtype=latents_dtype).to(
|
||||
self.device
|
||||
)
|
||||
else:
|
||||
latents = torch.randn(latents_shape, generator=generator, device=self.device, dtype=latents_dtype)
|
||||
|
||||
# set timesteps
|
||||
self.scheduler.set_timesteps(num_inference_steps)
|
||||
|
||||
# Some schedulers like PNDM have timesteps as arrays
|
||||
# It's more optimized to move all timesteps to correct device beforehand
|
||||
timesteps_tensor = self.scheduler.timesteps.to(self.device)
|
||||
|
||||
# scale the initial noise by the standard deviation required by the scheduler
|
||||
latents = latents * self.scheduler.init_noise_sigma
|
||||
|
||||
# prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
|
||||
# eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
|
||||
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
|
||||
# and should be between [0, 1]
|
||||
accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
|
||||
extra_step_kwargs = {}
|
||||
if accepts_eta:
|
||||
extra_step_kwargs["eta"] = eta
|
||||
|
||||
for i, t in enumerate(self.progress_bar(timesteps_tensor)):
|
||||
# expand the latents if we are doing classifier free guidance
|
||||
latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
|
||||
latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
|
||||
|
||||
# predict the noise residual
|
||||
noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample
|
||||
|
||||
# perform guidance
|
||||
if do_classifier_free_guidance:
|
||||
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
|
||||
noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
|
||||
|
||||
# compute the previous noisy sample x_t -> x_t-1
|
||||
latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
|
||||
|
||||
latents = 1 / 0.18215 * latents
|
||||
image = self.vae.decode(latents).sample
|
||||
|
||||
image = (image / 2 + 0.5).clamp(0, 1)
|
||||
|
||||
# we always cast to float32 as this does not cause significant overhead and is compatible with bfloa16
|
||||
image = image.cpu().permute(0, 2, 3, 1).float().numpy()
|
||||
|
||||
if self.safety_checker is not None:
|
||||
safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(
|
||||
self.device
|
||||
)
|
||||
image, has_nsfw_concept = self.safety_checker(
|
||||
images=image, clip_input=safety_checker_input.pixel_values.to(text_embeddings.dtype)
|
||||
)
|
||||
else:
|
||||
has_nsfw_concept = None
|
||||
|
||||
if output_type == "pil":
|
||||
image = self.numpy_to_pil(image)
|
||||
|
||||
if not return_dict:
|
||||
return (image, has_nsfw_concept)
|
||||
|
||||
return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
|
||||
@@ -1,524 +0,0 @@
|
||||
import inspect
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Callable, List, Optional, Union
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
from diffusers.configuration_utils import FrozenDict
|
||||
from diffusers.models import AutoencoderKL, UNet2DConditionModel
|
||||
from diffusers.pipeline_utils import DiffusionPipeline
|
||||
from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
|
||||
from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
|
||||
from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
|
||||
from diffusers.utils import deprecate, logging
|
||||
from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
|
||||
|
||||
|
||||
def slerp(t, v0, v1, DOT_THRESHOLD=0.9995):
|
||||
"""helper function to spherically interpolate two arrays v1 v2"""
|
||||
|
||||
if not isinstance(v0, np.ndarray):
|
||||
inputs_are_torch = True
|
||||
input_device = v0.device
|
||||
v0 = v0.cpu().numpy()
|
||||
v1 = v1.cpu().numpy()
|
||||
|
||||
dot = np.sum(v0 * v1 / (np.linalg.norm(v0) * np.linalg.norm(v1)))
|
||||
if np.abs(dot) > DOT_THRESHOLD:
|
||||
v2 = (1 - t) * v0 + t * v1
|
||||
else:
|
||||
theta_0 = np.arccos(dot)
|
||||
sin_theta_0 = np.sin(theta_0)
|
||||
theta_t = theta_0 * t
|
||||
sin_theta_t = np.sin(theta_t)
|
||||
s0 = np.sin(theta_0 - theta_t) / sin_theta_0
|
||||
s1 = sin_theta_t / sin_theta_0
|
||||
v2 = s0 * v0 + s1 * v1
|
||||
|
||||
if inputs_are_torch:
|
||||
v2 = torch.from_numpy(v2).to(input_device)
|
||||
|
||||
return v2
|
||||
|
||||
|
||||
class StableDiffusionWalkPipeline(DiffusionPipeline):
|
||||
r"""
|
||||
Pipeline for text-to-image generation using Stable Diffusion.
|
||||
|
||||
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
|
||||
library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
|
||||
|
||||
Args:
|
||||
vae ([`AutoencoderKL`]):
|
||||
Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
|
||||
text_encoder ([`CLIPTextModel`]):
|
||||
Frozen text-encoder. Stable Diffusion uses the text portion of
|
||||
[CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
|
||||
the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
|
||||
tokenizer (`CLIPTokenizer`):
|
||||
Tokenizer of class
|
||||
[CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
|
||||
unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
|
||||
scheduler ([`SchedulerMixin`]):
|
||||
A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of
|
||||
[`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
|
||||
safety_checker ([`StableDiffusionSafetyChecker`]):
|
||||
Classification module that estimates whether generated images could be considered offensive or harmful.
|
||||
Please, refer to the [model card](https://huggingface.co/CompVis/stable-diffusion-v1-4) for details.
|
||||
feature_extractor ([`CLIPFeatureExtractor`]):
|
||||
Model that extracts features from generated images to be used as inputs for the `safety_checker`.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vae: AutoencoderKL,
|
||||
text_encoder: CLIPTextModel,
|
||||
tokenizer: CLIPTokenizer,
|
||||
unet: UNet2DConditionModel,
|
||||
scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
|
||||
safety_checker: StableDiffusionSafetyChecker,
|
||||
feature_extractor: CLIPFeatureExtractor,
|
||||
):
|
||||
super().__init__()
|
||||
|
||||
if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
|
||||
deprecation_message = (
|
||||
f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
|
||||
f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
|
||||
"to update the config accordingly as leaving `steps_offset` might led to incorrect results"
|
||||
" in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
|
||||
" it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
|
||||
" file"
|
||||
)
|
||||
deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
|
||||
new_config = dict(scheduler.config)
|
||||
new_config["steps_offset"] = 1
|
||||
scheduler._internal_dict = FrozenDict(new_config)
|
||||
|
||||
if safety_checker is None:
|
||||
logger.warn(
|
||||
f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
|
||||
" that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
|
||||
" results in services or applications open to the public. Both the diffusers team and Hugging Face"
|
||||
" strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
|
||||
" it only for use-cases that involve analyzing network behavior or auditing its results. For more"
|
||||
" information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
|
||||
)
|
||||
|
||||
self.register_modules(
|
||||
vae=vae,
|
||||
text_encoder=text_encoder,
|
||||
tokenizer=tokenizer,
|
||||
unet=unet,
|
||||
scheduler=scheduler,
|
||||
safety_checker=safety_checker,
|
||||
feature_extractor=feature_extractor,
|
||||
)
|
||||
|
||||
def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
|
||||
r"""
|
||||
Enable sliced attention computation.
|
||||
|
||||
When this option is enabled, the attention module will split the input tensor in slices, to compute attention
|
||||
in several steps. This is useful to save some memory in exchange for a small speed decrease.
|
||||
|
||||
Args:
|
||||
slice_size (`str` or `int`, *optional*, defaults to `"auto"`):
|
||||
When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If
|
||||
a number is provided, uses as many slices as `attention_head_dim // slice_size`. In this case,
|
||||
`attention_head_dim` must be a multiple of `slice_size`.
|
||||
"""
|
||||
if slice_size == "auto":
|
||||
# half the attention head size is usually a good trade-off between
|
||||
# speed and memory
|
||||
slice_size = self.unet.config.attention_head_dim // 2
|
||||
self.unet.set_attention_slice(slice_size)
|
||||
|
||||
def disable_attention_slicing(self):
|
||||
r"""
|
||||
Disable sliced attention computation. If `enable_attention_slicing` was previously invoked, this method will go
|
||||
back to computing attention in one step.
|
||||
"""
|
||||
# set slice_size = `None` to disable `attention slicing`
|
||||
self.enable_attention_slicing(None)
|
||||
|
||||
@torch.no_grad()
|
||||
def __call__(
|
||||
self,
|
||||
prompt: Optional[Union[str, List[str]]] = None,
|
||||
height: int = 512,
|
||||
width: int = 512,
|
||||
num_inference_steps: int = 50,
|
||||
guidance_scale: float = 7.5,
|
||||
negative_prompt: Optional[Union[str, List[str]]] = None,
|
||||
num_images_per_prompt: Optional[int] = 1,
|
||||
eta: float = 0.0,
|
||||
generator: Optional[torch.Generator] = None,
|
||||
latents: Optional[torch.FloatTensor] = None,
|
||||
output_type: Optional[str] = "pil",
|
||||
return_dict: bool = True,
|
||||
callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
|
||||
callback_steps: Optional[int] = 1,
|
||||
text_embeddings: Optional[torch.FloatTensor] = None,
|
||||
**kwargs,
|
||||
):
|
||||
r"""
|
||||
Function invoked when calling the pipeline for generation.
|
||||
|
||||
Args:
|
||||
prompt (`str` or `List[str]`, *optional*, defaults to `None`):
|
||||
The prompt or prompts to guide the image generation. If not provided, `text_embeddings` is required.
|
||||
height (`int`, *optional*, defaults to 512):
|
||||
The height in pixels of the generated image.
|
||||
width (`int`, *optional*, defaults to 512):
|
||||
The width in pixels of the generated image.
|
||||
num_inference_steps (`int`, *optional*, defaults to 50):
|
||||
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
|
||||
expense of slower inference.
|
||||
guidance_scale (`float`, *optional*, defaults to 7.5):
|
||||
Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
|
||||
`guidance_scale` is defined as `w` of equation 2. of [Imagen
|
||||
Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
|
||||
1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
|
||||
usually at the expense of lower image quality.
|
||||
negative_prompt (`str` or `List[str]`, *optional*):
|
||||
The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
|
||||
if `guidance_scale` is less than `1`).
|
||||
num_images_per_prompt (`int`, *optional*, defaults to 1):
|
||||
The number of images to generate per prompt.
|
||||
eta (`float`, *optional*, defaults to 0.0):
|
||||
Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
|
||||
[`schedulers.DDIMScheduler`], will be ignored for others.
|
||||
generator (`torch.Generator`, *optional*):
|
||||
A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
|
||||
deterministic.
|
||||
latents (`torch.FloatTensor`, *optional*):
|
||||
Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
|
||||
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
|
||||
tensor will ge generated by sampling using the supplied random `generator`.
|
||||
output_type (`str`, *optional*, defaults to `"pil"`):
|
||||
The output format of the generate image. Choose between
|
||||
[PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
|
||||
return_dict (`bool`, *optional*, defaults to `True`):
|
||||
Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
|
||||
plain tuple.
|
||||
callback (`Callable`, *optional*):
|
||||
A function that will be called every `callback_steps` steps during inference. The function will be
|
||||
called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
|
||||
callback_steps (`int`, *optional*, defaults to 1):
|
||||
The frequency at which the `callback` function will be called. If not specified, the callback will be
|
||||
called at every step.
|
||||
text_embeddings (`torch.FloatTensor`, *optional*, defaults to `None`):
|
||||
Pre-generated text embeddings to be used as inputs for image generation. Can be used in place of
|
||||
`prompt` to avoid re-computing the embeddings. If not provided, the embeddings will be generated from
|
||||
the supplied `prompt`.
|
||||
|
||||
Returns:
|
||||
[`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
|
||||
[`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
|
||||
When returning a tuple, the first element is a list with the generated images, and the second element is a
|
||||
list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
|
||||
(nsfw) content, according to the `safety_checker`.
|
||||
"""
|
||||
|
||||
if height % 8 != 0 or width % 8 != 0:
|
||||
raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
|
||||
|
||||
if (callback_steps is None) or (
|
||||
callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
|
||||
):
|
||||
raise ValueError(
|
||||
f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
|
||||
f" {type(callback_steps)}."
|
||||
)
|
||||
|
||||
if text_embeddings is None:
|
||||
if isinstance(prompt, str):
|
||||
batch_size = 1
|
||||
elif isinstance(prompt, list):
|
||||
batch_size = len(prompt)
|
||||
else:
|
||||
raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
|
||||
|
||||
# get prompt text embeddings
|
||||
text_inputs = self.tokenizer(
|
||||
prompt,
|
||||
padding="max_length",
|
||||
max_length=self.tokenizer.model_max_length,
|
||||
return_tensors="pt",
|
||||
)
|
||||
text_input_ids = text_inputs.input_ids
|
||||
|
||||
if text_input_ids.shape[-1] > self.tokenizer.model_max_length:
|
||||
removed_text = self.tokenizer.batch_decode(text_input_ids[:, self.tokenizer.model_max_length :])
|
||||
print(
|
||||
"The following part of your input was truncated because CLIP can only handle sequences up to"
|
||||
f" {self.tokenizer.model_max_length} tokens: {removed_text}"
|
||||
)
|
||||
text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
|
||||
text_embeddings = self.text_encoder(text_input_ids.to(self.device))[0]
|
||||
else:
|
||||
batch_size = text_embeddings.shape[0]
|
||||
|
||||
# duplicate text embeddings for each generation per prompt, using mps friendly method
|
||||
bs_embed, seq_len, _ = text_embeddings.shape
|
||||
text_embeddings = text_embeddings.repeat(1, num_images_per_prompt, 1)
|
||||
text_embeddings = text_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)
|
||||
|
||||
# here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
|
||||
# of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
|
||||
# corresponds to doing no classifier free guidance.
|
||||
do_classifier_free_guidance = guidance_scale > 1.0
|
||||
# get unconditional embeddings for classifier free guidance
|
||||
if do_classifier_free_guidance:
|
||||
uncond_tokens: List[str]
|
||||
if negative_prompt is None:
|
||||
uncond_tokens = [""] * batch_size
|
||||
elif type(prompt) is not type(negative_prompt):
|
||||
raise TypeError(
|
||||
f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
|
||||
f" {type(prompt)}."
|
||||
)
|
||||
elif isinstance(negative_prompt, str):
|
||||
uncond_tokens = [negative_prompt]
|
||||
elif batch_size != len(negative_prompt):
|
||||
raise ValueError(
|
||||
f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
|
||||
f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
|
||||
" the batch size of `prompt`."
|
||||
)
|
||||
else:
|
||||
uncond_tokens = negative_prompt
|
||||
|
||||
max_length = self.tokenizer.model_max_length
|
||||
uncond_input = self.tokenizer(
|
||||
uncond_tokens,
|
||||
padding="max_length",
|
||||
max_length=max_length,
|
||||
truncation=True,
|
||||
return_tensors="pt",
|
||||
)
|
||||
uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(self.device))[0]
|
||||
|
||||
# duplicate unconditional embeddings for each generation per prompt, using mps friendly method
|
||||
seq_len = uncond_embeddings.shape[1]
|
||||
uncond_embeddings = uncond_embeddings.repeat(1, num_images_per_prompt, 1)
|
||||
uncond_embeddings = uncond_embeddings.view(batch_size * num_images_per_prompt, seq_len, -1)
|
||||
|
||||
# For classifier free guidance, we need to do two forward passes.
|
||||
# Here we concatenate the unconditional and text embeddings into a single batch
|
||||
# to avoid doing two forward passes
|
||||
text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
|
||||
|
||||
# get the initial random noise unless the user supplied it
|
||||
|
||||
# Unlike in other pipelines, latents need to be generated in the target device
|
||||
# for 1-to-1 results reproducibility with the CompVis implementation.
|
||||
# However this currently doesn't work in `mps`.
|
||||
latents_shape = (batch_size * num_images_per_prompt, self.unet.in_channels, height // 8, width // 8)
|
||||
latents_dtype = text_embeddings.dtype
|
||||
if latents is None:
|
||||
if self.device.type == "mps":
|
||||
# randn does not work reproducibly on mps
|
||||
latents = torch.randn(latents_shape, generator=generator, device="cpu", dtype=latents_dtype).to(
|
||||
self.device
|
||||
)
|
||||
else:
|
||||
latents = torch.randn(latents_shape, generator=generator, device=self.device, dtype=latents_dtype)
|
||||
else:
|
||||
if latents.shape != latents_shape:
|
||||
raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}")
|
||||
latents = latents.to(self.device)
|
||||
|
||||
# set timesteps
|
||||
self.scheduler.set_timesteps(num_inference_steps)
|
||||
|
||||
# Some schedulers like PNDM have timesteps as arrays
|
||||
# It's more optimized to move all timesteps to correct device beforehand
|
||||
timesteps_tensor = self.scheduler.timesteps.to(self.device)
|
||||
|
||||
# scale the initial noise by the standard deviation required by the scheduler
|
||||
latents = latents * self.scheduler.init_noise_sigma
|
||||
|
||||
# prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
|
||||
# eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
|
||||
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
|
||||
# and should be between [0, 1]
|
||||
accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
|
||||
extra_step_kwargs = {}
|
||||
if accepts_eta:
|
||||
extra_step_kwargs["eta"] = eta
|
||||
|
||||
for i, t in enumerate(self.progress_bar(timesteps_tensor)):
|
||||
# expand the latents if we are doing classifier free guidance
|
||||
latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
|
||||
latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
|
||||
|
||||
# predict the noise residual
|
||||
noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample
|
||||
|
||||
# perform guidance
|
||||
if do_classifier_free_guidance:
|
||||
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
|
||||
noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
|
||||
|
||||
# compute the previous noisy sample x_t -> x_t-1
|
||||
latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
|
||||
|
||||
# call the callback, if provided
|
||||
if callback is not None and i % callback_steps == 0:
|
||||
callback(i, t, latents)
|
||||
|
||||
latents = 1 / 0.18215 * latents
|
||||
image = self.vae.decode(latents).sample
|
||||
|
||||
image = (image / 2 + 0.5).clamp(0, 1)
|
||||
|
||||
# we always cast to float32 as this does not cause significant overhead and is compatible with bfloa16
|
||||
image = image.cpu().permute(0, 2, 3, 1).float().numpy()
|
||||
|
||||
if self.safety_checker is not None:
|
||||
safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(
|
||||
self.device
|
||||
)
|
||||
image, has_nsfw_concept = self.safety_checker(
|
||||
images=image, clip_input=safety_checker_input.pixel_values.to(text_embeddings.dtype)
|
||||
)
|
||||
else:
|
||||
has_nsfw_concept = None
|
||||
|
||||
if output_type == "pil":
|
||||
image = self.numpy_to_pil(image)
|
||||
|
||||
if not return_dict:
|
||||
return (image, has_nsfw_concept)
|
||||
|
||||
return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
|
||||
|
||||
def embed_text(self, text):
|
||||
"""takes in text and turns it into text embeddings"""
|
||||
text_input = self.tokenizer(
|
||||
text,
|
||||
padding="max_length",
|
||||
max_length=self.tokenizer.model_max_length,
|
||||
truncation=True,
|
||||
return_tensors="pt",
|
||||
)
|
||||
with torch.no_grad():
|
||||
embed = self.text_encoder(text_input.input_ids.to(self.device))[0]
|
||||
return embed
|
||||
|
||||
def get_noise(self, seed, dtype=torch.float32, height=512, width=512):
|
||||
"""Takes in random seed and returns corresponding noise vector"""
|
||||
return torch.randn(
|
||||
(1, self.unet.in_channels, height // 8, width // 8),
|
||||
generator=torch.Generator(device=self.device).manual_seed(seed),
|
||||
device=self.device,
|
||||
dtype=dtype,
|
||||
)
|
||||
|
||||
def walk(
|
||||
self,
|
||||
prompts: List[str],
|
||||
seeds: List[int],
|
||||
num_interpolation_steps: Optional[int] = 6,
|
||||
output_dir: Optional[str] = "./dreams",
|
||||
name: Optional[str] = None,
|
||||
batch_size: Optional[int] = 1,
|
||||
height: Optional[int] = 512,
|
||||
width: Optional[int] = 512,
|
||||
guidance_scale: Optional[float] = 7.5,
|
||||
num_inference_steps: Optional[int] = 50,
|
||||
eta: Optional[float] = 0.0,
|
||||
) -> List[str]:
|
||||
"""
|
||||
Walks through a series of prompts and seeds, interpolating between them and saving the results to disk.
|
||||
|
||||
Args:
|
||||
prompts (`List[str]`):
|
||||
List of prompts to generate images for.
|
||||
seeds (`List[int]`):
|
||||
List of seeds corresponding to provided prompts. Must be the same length as prompts.
|
||||
num_interpolation_steps (`int`, *optional*, defaults to 6):
|
||||
Number of interpolation steps to take between prompts.
|
||||
output_dir (`str`, *optional*, defaults to `./dreams`):
|
||||
Directory to save the generated images to.
|
||||
name (`str`, *optional*, defaults to `None`):
|
||||
Subdirectory of `output_dir` to save the generated images to. If `None`, the name will
|
||||
be the current time.
|
||||
batch_size (`int`, *optional*, defaults to 1):
|
||||
Number of images to generate at once.
|
||||
height (`int`, *optional*, defaults to 512):
|
||||
Height of the generated images.
|
||||
width (`int`, *optional*, defaults to 512):
|
||||
Width of the generated images.
|
||||
guidance_scale (`float`, *optional*, defaults to 7.5):
|
||||
Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
|
||||
`guidance_scale` is defined as `w` of equation 2. of [Imagen
|
||||
Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
|
||||
1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
|
||||
usually at the expense of lower image quality.
|
||||
num_inference_steps (`int`, *optional*, defaults to 50):
|
||||
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
|
||||
expense of slower inference.
|
||||
eta (`float`, *optional*, defaults to 0.0):
|
||||
Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
|
||||
[`schedulers.DDIMScheduler`], will be ignored for others.
|
||||
|
||||
Returns:
|
||||
`List[str]`: List of paths to the generated images.
|
||||
"""
|
||||
if not len(prompts) == len(seeds):
|
||||
raise ValueError(
|
||||
f"Number of prompts and seeds must be equalGot {len(prompts)} prompts and {len(seeds)} seeds"
|
||||
)
|
||||
|
||||
name = name or time.strftime("%Y%m%d-%H%M%S")
|
||||
save_path = Path(output_dir) / name
|
||||
save_path.mkdir(exist_ok=True, parents=True)
|
||||
|
||||
frame_idx = 0
|
||||
frame_filepaths = []
|
||||
for prompt_a, prompt_b, seed_a, seed_b in zip(prompts, prompts[1:], seeds, seeds[1:]):
|
||||
# Embed Text
|
||||
embed_a = self.embed_text(prompt_a)
|
||||
embed_b = self.embed_text(prompt_b)
|
||||
|
||||
# Get Noise
|
||||
noise_dtype = embed_a.dtype
|
||||
noise_a = self.get_noise(seed_a, noise_dtype, height, width)
|
||||
noise_b = self.get_noise(seed_b, noise_dtype, height, width)
|
||||
|
||||
noise_batch, embeds_batch = None, None
|
||||
T = np.linspace(0.0, 1.0, num_interpolation_steps)
|
||||
for i, t in enumerate(T):
|
||||
noise = slerp(float(t), noise_a, noise_b)
|
||||
embed = torch.lerp(embed_a, embed_b, t)
|
||||
|
||||
noise_batch = noise if noise_batch is None else torch.cat([noise_batch, noise], dim=0)
|
||||
embeds_batch = embed if embeds_batch is None else torch.cat([embeds_batch, embed], dim=0)
|
||||
|
||||
batch_is_ready = embeds_batch.shape[0] == batch_size or i + 1 == T.shape[0]
|
||||
if batch_is_ready:
|
||||
outputs = self(
|
||||
latents=noise_batch,
|
||||
text_embeddings=embeds_batch,
|
||||
height=height,
|
||||
width=width,
|
||||
guidance_scale=guidance_scale,
|
||||
eta=eta,
|
||||
num_inference_steps=num_inference_steps,
|
||||
)
|
||||
noise_batch, embeds_batch = None, None
|
||||
|
||||
for image in outputs["images"]:
|
||||
frame_filepath = str(save_path / f"frame_{frame_idx:06d}.png")
|
||||
image.save(frame_filepath)
|
||||
frame_filepaths.append(frame_filepath)
|
||||
frame_idx += 1
|
||||
return frame_filepaths
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,992 +0,0 @@
|
||||
import inspect
|
||||
import re
|
||||
from typing import Callable, List, Optional, Union
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
import PIL
|
||||
from diffusers.onnx_utils import OnnxRuntimeModel
|
||||
from diffusers.pipeline_utils import DiffusionPipeline
|
||||
from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
|
||||
from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
|
||||
from diffusers.utils import logging
|
||||
from transformers import CLIPFeatureExtractor, CLIPTokenizer
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
|
||||
|
||||
re_attention = re.compile(
|
||||
r"""
|
||||
\\\(|
|
||||
\\\)|
|
||||
\\\[|
|
||||
\\]|
|
||||
\\\\|
|
||||
\\|
|
||||
\(|
|
||||
\[|
|
||||
:([+-]?[.\d]+)\)|
|
||||
\)|
|
||||
]|
|
||||
[^\\()\[\]:]+|
|
||||
:
|
||||
""",
|
||||
re.X,
|
||||
)
|
||||
|
||||
|
||||
def parse_prompt_attention(text):
|
||||
"""
|
||||
Parses a string with attention tokens and returns a list of pairs: text and its associated weight.
|
||||
Accepted tokens are:
|
||||
(abc) - increases attention to abc by a multiplier of 1.1
|
||||
(abc:3.12) - increases attention to abc by a multiplier of 3.12
|
||||
[abc] - decreases attention to abc by a multiplier of 1.1
|
||||
\( - literal character '('
|
||||
\[ - literal character '['
|
||||
\) - literal character ')'
|
||||
\] - literal character ']'
|
||||
\\ - literal character '\'
|
||||
anything else - just text
|
||||
>>> parse_prompt_attention('normal text')
|
||||
[['normal text', 1.0]]
|
||||
>>> parse_prompt_attention('an (important) word')
|
||||
[['an ', 1.0], ['important', 1.1], [' word', 1.0]]
|
||||
>>> parse_prompt_attention('(unbalanced')
|
||||
[['unbalanced', 1.1]]
|
||||
>>> parse_prompt_attention('\(literal\]')
|
||||
[['(literal]', 1.0]]
|
||||
>>> parse_prompt_attention('(unnecessary)(parens)')
|
||||
[['unnecessaryparens', 1.1]]
|
||||
>>> parse_prompt_attention('a (((house:1.3)) [on] a (hill:0.5), sun, (((sky))).')
|
||||
[['a ', 1.0],
|
||||
['house', 1.5730000000000004],
|
||||
[' ', 1.1],
|
||||
['on', 1.0],
|
||||
[' a ', 1.1],
|
||||
['hill', 0.55],
|
||||
[', sun, ', 1.1],
|
||||
['sky', 1.4641000000000006],
|
||||
['.', 1.1]]
|
||||
"""
|
||||
|
||||
res = []
|
||||
round_brackets = []
|
||||
square_brackets = []
|
||||
|
||||
round_bracket_multiplier = 1.1
|
||||
square_bracket_multiplier = 1 / 1.1
|
||||
|
||||
def multiply_range(start_position, multiplier):
|
||||
for p in range(start_position, len(res)):
|
||||
res[p][1] *= multiplier
|
||||
|
||||
for m in re_attention.finditer(text):
|
||||
text = m.group(0)
|
||||
weight = m.group(1)
|
||||
|
||||
if text.startswith("\\"):
|
||||
res.append([text[1:], 1.0])
|
||||
elif text == "(":
|
||||
round_brackets.append(len(res))
|
||||
elif text == "[":
|
||||
square_brackets.append(len(res))
|
||||
elif weight is not None and len(round_brackets) > 0:
|
||||
multiply_range(round_brackets.pop(), float(weight))
|
||||
elif text == ")" and len(round_brackets) > 0:
|
||||
multiply_range(round_brackets.pop(), round_bracket_multiplier)
|
||||
elif text == "]" and len(square_brackets) > 0:
|
||||
multiply_range(square_brackets.pop(), square_bracket_multiplier)
|
||||
else:
|
||||
res.append([text, 1.0])
|
||||
|
||||
for pos in round_brackets:
|
||||
multiply_range(pos, round_bracket_multiplier)
|
||||
|
||||
for pos in square_brackets:
|
||||
multiply_range(pos, square_bracket_multiplier)
|
||||
|
||||
if len(res) == 0:
|
||||
res = [["", 1.0]]
|
||||
|
||||
# merge runs of identical weights
|
||||
i = 0
|
||||
while i + 1 < len(res):
|
||||
if res[i][1] == res[i + 1][1]:
|
||||
res[i][0] += res[i + 1][0]
|
||||
res.pop(i + 1)
|
||||
else:
|
||||
i += 1
|
||||
|
||||
return res
|
||||
|
||||
|
||||
def get_prompts_with_weights(pipe, prompt: List[str], max_length: int):
|
||||
r"""
|
||||
Tokenize a list of prompts and return its tokens with weights of each token.
|
||||
|
||||
No padding, starting or ending token is included.
|
||||
"""
|
||||
tokens = []
|
||||
weights = []
|
||||
truncated = False
|
||||
for text in prompt:
|
||||
texts_and_weights = parse_prompt_attention(text)
|
||||
text_token = []
|
||||
text_weight = []
|
||||
for word, weight in texts_and_weights:
|
||||
# tokenize and discard the starting and the ending token
|
||||
token = pipe.tokenizer(word, return_tensors="np").input_ids[0, 1:-1]
|
||||
text_token += list(token)
|
||||
# copy the weight by length of token
|
||||
text_weight += [weight] * len(token)
|
||||
# stop if the text is too long (longer than truncation limit)
|
||||
if len(text_token) > max_length:
|
||||
truncated = True
|
||||
break
|
||||
# truncate
|
||||
if len(text_token) > max_length:
|
||||
truncated = True
|
||||
text_token = text_token[:max_length]
|
||||
text_weight = text_weight[:max_length]
|
||||
tokens.append(text_token)
|
||||
weights.append(text_weight)
|
||||
if truncated:
|
||||
logger.warning("Prompt was truncated. Try to shorten the prompt or increase max_embeddings_multiples")
|
||||
return tokens, weights
|
||||
|
||||
|
||||
def pad_tokens_and_weights(tokens, weights, max_length, bos, eos, no_boseos_middle=True, chunk_length=77):
|
||||
r"""
|
||||
Pad the tokens (with starting and ending tokens) and weights (with 1.0) to max_length.
|
||||
"""
|
||||
max_embeddings_multiples = (max_length - 2) // (chunk_length - 2)
|
||||
weights_length = max_length if no_boseos_middle else max_embeddings_multiples * chunk_length
|
||||
for i in range(len(tokens)):
|
||||
tokens[i] = [bos] + tokens[i] + [eos] * (max_length - 1 - len(tokens[i]))
|
||||
if no_boseos_middle:
|
||||
weights[i] = [1.0] + weights[i] + [1.0] * (max_length - 1 - len(weights[i]))
|
||||
else:
|
||||
w = []
|
||||
if len(weights[i]) == 0:
|
||||
w = [1.0] * weights_length
|
||||
else:
|
||||
for j in range(max_embeddings_multiples):
|
||||
w.append(1.0) # weight for starting token in this chunk
|
||||
w += weights[i][j * (chunk_length - 2) : min(len(weights[i]), (j + 1) * (chunk_length - 2))]
|
||||
w.append(1.0) # weight for ending token in this chunk
|
||||
w += [1.0] * (weights_length - len(w))
|
||||
weights[i] = w[:]
|
||||
|
||||
return tokens, weights
|
||||
|
||||
|
||||
def get_unweighted_text_embeddings(
|
||||
pipe,
|
||||
text_input: np.array,
|
||||
chunk_length: int,
|
||||
no_boseos_middle: Optional[bool] = True,
|
||||
):
|
||||
"""
|
||||
When the length of tokens is a multiple of the capacity of the text encoder,
|
||||
it should be split into chunks and sent to the text encoder individually.
|
||||
"""
|
||||
max_embeddings_multiples = (text_input.shape[1] - 2) // (chunk_length - 2)
|
||||
if max_embeddings_multiples > 1:
|
||||
text_embeddings = []
|
||||
for i in range(max_embeddings_multiples):
|
||||
# extract the i-th chunk
|
||||
text_input_chunk = text_input[:, i * (chunk_length - 2) : (i + 1) * (chunk_length - 2) + 2].copy()
|
||||
|
||||
# cover the head and the tail by the starting and the ending tokens
|
||||
text_input_chunk[:, 0] = text_input[0, 0]
|
||||
text_input_chunk[:, -1] = text_input[0, -1]
|
||||
|
||||
text_embedding = pipe.text_encoder(input_ids=text_input_chunk)[0]
|
||||
|
||||
if no_boseos_middle:
|
||||
if i == 0:
|
||||
# discard the ending token
|
||||
text_embedding = text_embedding[:, :-1]
|
||||
elif i == max_embeddings_multiples - 1:
|
||||
# discard the starting token
|
||||
text_embedding = text_embedding[:, 1:]
|
||||
else:
|
||||
# discard both starting and ending tokens
|
||||
text_embedding = text_embedding[:, 1:-1]
|
||||
|
||||
text_embeddings.append(text_embedding)
|
||||
text_embeddings = np.concatenate(text_embeddings, axis=1)
|
||||
else:
|
||||
text_embeddings = pipe.text_encoder(input_ids=text_input)[0]
|
||||
return text_embeddings
|
||||
|
||||
|
||||
def get_weighted_text_embeddings(
|
||||
pipe,
|
||||
prompt: Union[str, List[str]],
|
||||
uncond_prompt: Optional[Union[str, List[str]]] = None,
|
||||
max_embeddings_multiples: Optional[int] = 4,
|
||||
no_boseos_middle: Optional[bool] = False,
|
||||
skip_parsing: Optional[bool] = False,
|
||||
skip_weighting: Optional[bool] = False,
|
||||
**kwargs,
|
||||
):
|
||||
r"""
|
||||
Prompts can be assigned with local weights using brackets. For example,
|
||||
prompt 'A (very beautiful) masterpiece' highlights the words 'very beautiful',
|
||||
and the embedding tokens corresponding to the words get multiplied by a constant, 1.1.
|
||||
|
||||
Also, to regularize of the embedding, the weighted embedding would be scaled to preserve the original mean.
|
||||
|
||||
Args:
|
||||
pipe (`DiffusionPipeline`):
|
||||
Pipe to provide access to the tokenizer and the text encoder.
|
||||
prompt (`str` or `List[str]`):
|
||||
The prompt or prompts to guide the image generation.
|
||||
uncond_prompt (`str` or `List[str]`):
|
||||
The unconditional prompt or prompts for guide the image generation. If unconditional prompt
|
||||
is provided, the embeddings of prompt and uncond_prompt are concatenated.
|
||||
max_embeddings_multiples (`int`, *optional*, defaults to `1`):
|
||||
The max multiple length of prompt embeddings compared to the max output length of text encoder.
|
||||
no_boseos_middle (`bool`, *optional*, defaults to `False`):
|
||||
If the length of text token is multiples of the capacity of text encoder, whether reserve the starting and
|
||||
ending token in each of the chunk in the middle.
|
||||
skip_parsing (`bool`, *optional*, defaults to `False`):
|
||||
Skip the parsing of brackets.
|
||||
skip_weighting (`bool`, *optional*, defaults to `False`):
|
||||
Skip the weighting. When the parsing is skipped, it is forced True.
|
||||
"""
|
||||
max_length = (pipe.tokenizer.model_max_length - 2) * max_embeddings_multiples + 2
|
||||
if isinstance(prompt, str):
|
||||
prompt = [prompt]
|
||||
|
||||
if not skip_parsing:
|
||||
prompt_tokens, prompt_weights = get_prompts_with_weights(pipe, prompt, max_length - 2)
|
||||
if uncond_prompt is not None:
|
||||
if isinstance(uncond_prompt, str):
|
||||
uncond_prompt = [uncond_prompt]
|
||||
uncond_tokens, uncond_weights = get_prompts_with_weights(pipe, uncond_prompt, max_length - 2)
|
||||
else:
|
||||
prompt_tokens = [
|
||||
token[1:-1]
|
||||
for token in pipe.tokenizer(prompt, max_length=max_length, truncation=True, return_tensors="np").input_ids
|
||||
]
|
||||
prompt_weights = [[1.0] * len(token) for token in prompt_tokens]
|
||||
if uncond_prompt is not None:
|
||||
if isinstance(uncond_prompt, str):
|
||||
uncond_prompt = [uncond_prompt]
|
||||
uncond_tokens = [
|
||||
token[1:-1]
|
||||
for token in pipe.tokenizer(
|
||||
uncond_prompt,
|
||||
max_length=max_length,
|
||||
truncation=True,
|
||||
return_tensors="np",
|
||||
).input_ids
|
||||
]
|
||||
uncond_weights = [[1.0] * len(token) for token in uncond_tokens]
|
||||
|
||||
# round up the longest length of tokens to a multiple of (model_max_length - 2)
|
||||
max_length = max([len(token) for token in prompt_tokens])
|
||||
if uncond_prompt is not None:
|
||||
max_length = max(max_length, max([len(token) for token in uncond_tokens]))
|
||||
|
||||
max_embeddings_multiples = min(
|
||||
max_embeddings_multiples,
|
||||
(max_length - 1) // (pipe.tokenizer.model_max_length - 2) + 1,
|
||||
)
|
||||
max_embeddings_multiples = max(1, max_embeddings_multiples)
|
||||
max_length = (pipe.tokenizer.model_max_length - 2) * max_embeddings_multiples + 2
|
||||
|
||||
# pad the length of tokens and weights
|
||||
bos = pipe.tokenizer.bos_token_id
|
||||
eos = pipe.tokenizer.eos_token_id
|
||||
prompt_tokens, prompt_weights = pad_tokens_and_weights(
|
||||
prompt_tokens,
|
||||
prompt_weights,
|
||||
max_length,
|
||||
bos,
|
||||
eos,
|
||||
no_boseos_middle=no_boseos_middle,
|
||||
chunk_length=pipe.tokenizer.model_max_length,
|
||||
)
|
||||
prompt_tokens = np.array(prompt_tokens, dtype=np.int32)
|
||||
if uncond_prompt is not None:
|
||||
uncond_tokens, uncond_weights = pad_tokens_and_weights(
|
||||
uncond_tokens,
|
||||
uncond_weights,
|
||||
max_length,
|
||||
bos,
|
||||
eos,
|
||||
no_boseos_middle=no_boseos_middle,
|
||||
chunk_length=pipe.tokenizer.model_max_length,
|
||||
)
|
||||
uncond_tokens = np.array(uncond_tokens, dtype=np.int32)
|
||||
|
||||
# get the embeddings
|
||||
text_embeddings = get_unweighted_text_embeddings(
|
||||
pipe,
|
||||
prompt_tokens,
|
||||
pipe.tokenizer.model_max_length,
|
||||
no_boseos_middle=no_boseos_middle,
|
||||
)
|
||||
prompt_weights = np.array(prompt_weights, dtype=text_embeddings.dtype)
|
||||
if uncond_prompt is not None:
|
||||
uncond_embeddings = get_unweighted_text_embeddings(
|
||||
pipe,
|
||||
uncond_tokens,
|
||||
pipe.tokenizer.model_max_length,
|
||||
no_boseos_middle=no_boseos_middle,
|
||||
)
|
||||
uncond_weights = np.array(uncond_weights, dtype=uncond_embeddings.dtype)
|
||||
|
||||
# assign weights to the prompts and normalize in the sense of mean
|
||||
# TODO: should we normalize by chunk or in a whole (current implementation)?
|
||||
if (not skip_parsing) and (not skip_weighting):
|
||||
previous_mean = text_embeddings.mean(axis=(-2, -1))
|
||||
text_embeddings *= prompt_weights[:, :, None]
|
||||
text_embeddings *= (previous_mean / text_embeddings.mean(axis=(-2, -1)))[:, None, None]
|
||||
if uncond_prompt is not None:
|
||||
previous_mean = uncond_embeddings.mean(axis=(-2, -1))
|
||||
uncond_embeddings *= uncond_weights[:, :, None]
|
||||
uncond_embeddings *= (previous_mean / uncond_embeddings.mean(axis=(-2, -1)))[:, None, None]
|
||||
|
||||
# For classifier free guidance, we need to do two forward passes.
|
||||
# Here we concatenate the unconditional and text embeddings into a single batch
|
||||
# to avoid doing two forward passes
|
||||
if uncond_prompt is not None:
|
||||
return text_embeddings, uncond_embeddings
|
||||
|
||||
return text_embeddings
|
||||
|
||||
|
||||
def preprocess_image(image):
|
||||
w, h = image.size
|
||||
w, h = map(lambda x: x - x % 32, (w, h)) # resize to integer multiple of 32
|
||||
image = image.resize((w, h), resample=PIL.Image.LANCZOS)
|
||||
image = np.array(image).astype(np.float32) / 255.0
|
||||
image = image[None].transpose(0, 3, 1, 2)
|
||||
return 2.0 * image - 1.0
|
||||
|
||||
|
||||
def preprocess_mask(mask):
|
||||
mask = mask.convert("L")
|
||||
w, h = mask.size
|
||||
w, h = map(lambda x: x - x % 32, (w, h)) # resize to integer multiple of 32
|
||||
mask = mask.resize((w // 8, h // 8), resample=PIL.Image.NEAREST)
|
||||
mask = np.array(mask).astype(np.float32) / 255.0
|
||||
mask = np.tile(mask, (4, 1, 1))
|
||||
mask = mask[None].transpose(0, 1, 2, 3) # what does this step do?
|
||||
mask = 1 - mask # repaint white, keep black
|
||||
return mask
|
||||
|
||||
|
||||
class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
|
||||
r"""
|
||||
Pipeline for text-to-image generation using Stable Diffusion without tokens length limit, and support parsing
|
||||
weighting in prompt.
|
||||
|
||||
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
|
||||
library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vae_encoder: OnnxRuntimeModel,
|
||||
vae_decoder: OnnxRuntimeModel,
|
||||
text_encoder: OnnxRuntimeModel,
|
||||
tokenizer: CLIPTokenizer,
|
||||
unet: OnnxRuntimeModel,
|
||||
scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
|
||||
safety_checker: OnnxRuntimeModel,
|
||||
feature_extractor: CLIPFeatureExtractor,
|
||||
):
|
||||
super().__init__()
|
||||
self.register_modules(
|
||||
vae_encoder=vae_encoder,
|
||||
vae_decoder=vae_decoder,
|
||||
text_encoder=text_encoder,
|
||||
tokenizer=tokenizer,
|
||||
unet=unet,
|
||||
scheduler=scheduler,
|
||||
safety_checker=safety_checker,
|
||||
feature_extractor=feature_extractor,
|
||||
)
|
||||
|
||||
@torch.no_grad()
|
||||
def __call__(
|
||||
self,
|
||||
prompt: Union[str, List[str]],
|
||||
negative_prompt: Optional[Union[str, List[str]]] = None,
|
||||
init_image: Union[np.ndarray, PIL.Image.Image] = None,
|
||||
mask_image: Union[np.ndarray, PIL.Image.Image] = None,
|
||||
height: int = 512,
|
||||
width: int = 512,
|
||||
num_inference_steps: int = 50,
|
||||
guidance_scale: float = 7.5,
|
||||
strength: float = 0.8,
|
||||
num_images_per_prompt: Optional[int] = 1,
|
||||
eta: float = 0.0,
|
||||
generator: Optional[np.random.RandomState] = None,
|
||||
latents: Optional[np.ndarray] = None,
|
||||
max_embeddings_multiples: Optional[int] = 3,
|
||||
output_type: Optional[str] = "pil",
|
||||
return_dict: bool = True,
|
||||
callback: Optional[Callable[[int, int, np.ndarray], None]] = None,
|
||||
is_cancelled_callback: Optional[Callable[[], bool]] = None,
|
||||
callback_steps: Optional[int] = 1,
|
||||
**kwargs,
|
||||
):
|
||||
r"""
|
||||
Function invoked when calling the pipeline for generation.
|
||||
|
||||
Args:
|
||||
prompt (`str` or `List[str]`):
|
||||
The prompt or prompts to guide the image generation.
|
||||
negative_prompt (`str` or `List[str]`, *optional*):
|
||||
The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
|
||||
if `guidance_scale` is less than `1`).
|
||||
init_image (`np.ndarray` or `PIL.Image.Image`):
|
||||
`Image`, or tensor representing an image batch, that will be used as the starting point for the
|
||||
process.
|
||||
mask_image (`np.ndarray` or `PIL.Image.Image`):
|
||||
`Image`, or tensor representing an image batch, to mask `init_image`. White pixels in the mask will be
|
||||
replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a
|
||||
PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should
|
||||
contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`.
|
||||
height (`int`, *optional*, defaults to 512):
|
||||
The height in pixels of the generated image.
|
||||
width (`int`, *optional*, defaults to 512):
|
||||
The width in pixels of the generated image.
|
||||
num_inference_steps (`int`, *optional*, defaults to 50):
|
||||
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
|
||||
expense of slower inference.
|
||||
guidance_scale (`float`, *optional*, defaults to 7.5):
|
||||
Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
|
||||
`guidance_scale` is defined as `w` of equation 2. of [Imagen
|
||||
Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
|
||||
1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
|
||||
usually at the expense of lower image quality.
|
||||
strength (`float`, *optional*, defaults to 0.8):
|
||||
Conceptually, indicates how much to transform the reference `init_image`. Must be between 0 and 1.
|
||||
`init_image` will be used as a starting point, adding more noise to it the larger the `strength`. The
|
||||
number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added
|
||||
noise will be maximum and the denoising process will run for the full number of iterations specified in
|
||||
`num_inference_steps`. A value of 1, therefore, essentially ignores `init_image`.
|
||||
num_images_per_prompt (`int`, *optional*, defaults to 1):
|
||||
The number of images to generate per prompt.
|
||||
eta (`float`, *optional*, defaults to 0.0):
|
||||
Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
|
||||
[`schedulers.DDIMScheduler`], will be ignored for others.
|
||||
generator (`np.random.RandomState`, *optional*):
|
||||
A np.random.RandomState to make generation deterministic.
|
||||
latents (`np.ndarray`, *optional*):
|
||||
Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
|
||||
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
|
||||
tensor will ge generated by sampling using the supplied random `generator`.
|
||||
max_embeddings_multiples (`int`, *optional*, defaults to `3`):
|
||||
The max multiple length of prompt embeddings compared to the max output length of text encoder.
|
||||
output_type (`str`, *optional*, defaults to `"pil"`):
|
||||
The output format of the generate image. Choose between
|
||||
[PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
|
||||
return_dict (`bool`, *optional*, defaults to `True`):
|
||||
Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
|
||||
plain tuple.
|
||||
callback (`Callable`, *optional*):
|
||||
A function that will be called every `callback_steps` steps during inference. The function will be
|
||||
called with the following arguments: `callback(step: int, timestep: int, latents: np.ndarray)`.
|
||||
is_cancelled_callback (`Callable`, *optional*):
|
||||
A function that will be called every `callback_steps` steps during inference. If the function returns
|
||||
`True`, the inference will be cancelled.
|
||||
callback_steps (`int`, *optional*, defaults to 1):
|
||||
The frequency at which the `callback` function will be called. If not specified, the callback will be
|
||||
called at every step.
|
||||
|
||||
Returns:
|
||||
`None` if cancelled by `is_cancelled_callback`,
|
||||
[`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
|
||||
[`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
|
||||
When returning a tuple, the first element is a list with the generated images, and the second element is a
|
||||
list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
|
||||
(nsfw) content, according to the `safety_checker`.
|
||||
"""
|
||||
|
||||
if isinstance(prompt, str):
|
||||
batch_size = 1
|
||||
prompt = [prompt]
|
||||
elif isinstance(prompt, list):
|
||||
batch_size = len(prompt)
|
||||
else:
|
||||
raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
|
||||
|
||||
if strength < 0 or strength > 1:
|
||||
raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
|
||||
|
||||
if height % 8 != 0 or width % 8 != 0:
|
||||
raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
|
||||
|
||||
if (callback_steps is None) or (
|
||||
callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
|
||||
):
|
||||
raise ValueError(
|
||||
f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
|
||||
f" {type(callback_steps)}."
|
||||
)
|
||||
|
||||
# get prompt text embeddings
|
||||
|
||||
# here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
|
||||
# of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
|
||||
# corresponds to doing no classifier free guidance.
|
||||
do_classifier_free_guidance = guidance_scale > 1.0
|
||||
# get unconditional embeddings for classifier free guidance
|
||||
if negative_prompt is None:
|
||||
negative_prompt = [""] * batch_size
|
||||
elif isinstance(negative_prompt, str):
|
||||
negative_prompt = [negative_prompt] * batch_size
|
||||
if batch_size != len(negative_prompt):
|
||||
raise ValueError(
|
||||
f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
|
||||
f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
|
||||
" the batch size of `prompt`."
|
||||
)
|
||||
|
||||
if generator is None:
|
||||
generator = np.random
|
||||
|
||||
text_embeddings, uncond_embeddings = get_weighted_text_embeddings(
|
||||
pipe=self,
|
||||
prompt=prompt,
|
||||
uncond_prompt=negative_prompt if do_classifier_free_guidance else None,
|
||||
max_embeddings_multiples=max_embeddings_multiples,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
text_embeddings = text_embeddings.repeat(num_images_per_prompt, 0)
|
||||
if do_classifier_free_guidance:
|
||||
uncond_embeddings = uncond_embeddings.repeat(num_images_per_prompt, 0)
|
||||
text_embeddings = np.concatenate([uncond_embeddings, text_embeddings])
|
||||
|
||||
# set timesteps
|
||||
self.scheduler.set_timesteps(num_inference_steps)
|
||||
|
||||
latents_dtype = text_embeddings.dtype
|
||||
init_latents_orig = None
|
||||
mask = None
|
||||
noise = None
|
||||
|
||||
if init_image is None:
|
||||
latents_shape = (
|
||||
batch_size * num_images_per_prompt,
|
||||
4,
|
||||
height // 8,
|
||||
width // 8,
|
||||
)
|
||||
|
||||
if latents is None:
|
||||
latents = generator.randn(*latents_shape).astype(latents_dtype)
|
||||
elif latents.shape != latents_shape:
|
||||
raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}")
|
||||
|
||||
timesteps = self.scheduler.timesteps.to(self.device)
|
||||
|
||||
# scale the initial noise by the standard deviation required by the scheduler
|
||||
latents = latents * self.scheduler.init_noise_sigma
|
||||
else:
|
||||
if isinstance(init_image, PIL.Image.Image):
|
||||
init_image = preprocess_image(init_image)
|
||||
# encode the init image into latents and scale the latents
|
||||
init_image = init_image.astype(latents_dtype)
|
||||
init_latents = self.vae_encoder(sample=init_image)[0]
|
||||
init_latents = 0.18215 * init_latents
|
||||
init_latents = np.concatenate([init_latents] * batch_size * num_images_per_prompt)
|
||||
init_latents_orig = init_latents
|
||||
|
||||
# preprocess mask
|
||||
if mask_image is not None:
|
||||
if isinstance(mask_image, PIL.Image.Image):
|
||||
mask_image = preprocess_mask(mask_image)
|
||||
mask_image = mask_image.astype(latents_dtype)
|
||||
mask = np.concatenate([mask_image] * batch_size * num_images_per_prompt)
|
||||
|
||||
# check sizes
|
||||
if not mask.shape == init_latents.shape:
|
||||
print(mask.shape, init_latents.shape)
|
||||
raise ValueError("The mask and init_image should be the same size!")
|
||||
|
||||
# get the original timestep using init_timestep
|
||||
offset = self.scheduler.config.get("steps_offset", 0)
|
||||
init_timestep = int(num_inference_steps * strength) + offset
|
||||
init_timestep = min(init_timestep, num_inference_steps)
|
||||
|
||||
timesteps = self.scheduler.timesteps[-init_timestep]
|
||||
timesteps = torch.tensor([timesteps] * batch_size * num_images_per_prompt)
|
||||
|
||||
# add noise to latents using the timesteps
|
||||
noise = generator.randn(*init_latents.shape).astype(latents_dtype)
|
||||
latents = self.scheduler.add_noise(
|
||||
torch.from_numpy(init_latents), torch.from_numpy(noise), timesteps
|
||||
).numpy()
|
||||
|
||||
t_start = max(num_inference_steps - init_timestep + offset, 0)
|
||||
timesteps = self.scheduler.timesteps[t_start:]
|
||||
|
||||
# prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
|
||||
# eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
|
||||
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
|
||||
# and should be between [0, 1]
|
||||
accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
|
||||
extra_step_kwargs = {}
|
||||
if accepts_eta:
|
||||
extra_step_kwargs["eta"] = eta
|
||||
|
||||
for i, t in enumerate(self.progress_bar(timesteps)):
|
||||
# expand the latents if we are doing classifier free guidance
|
||||
latent_model_input = np.concatenate([latents] * 2) if do_classifier_free_guidance else latents
|
||||
latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
|
||||
|
||||
# predict the noise residual
|
||||
noise_pred = self.unet(
|
||||
sample=latent_model_input,
|
||||
timestep=np.array([t]),
|
||||
encoder_hidden_states=text_embeddings,
|
||||
)
|
||||
noise_pred = noise_pred[0]
|
||||
|
||||
# perform guidance
|
||||
if do_classifier_free_guidance:
|
||||
noise_pred_uncond, noise_pred_text = np.split(noise_pred, 2)
|
||||
noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
|
||||
|
||||
# compute the previous noisy sample x_t -> x_t-1
|
||||
latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample.numpy()
|
||||
|
||||
if mask is not None:
|
||||
# masking
|
||||
init_latents_proper = self.scheduler.add_noise(
|
||||
torch.from_numpy(init_latents_orig),
|
||||
torch.from_numpy(noise),
|
||||
torch.tensor([t]),
|
||||
).numpy()
|
||||
latents = (init_latents_proper * mask) + (latents * (1 - mask))
|
||||
|
||||
# call the callback, if provided
|
||||
if i % callback_steps == 0:
|
||||
if callback is not None:
|
||||
callback(i, t, latents)
|
||||
if is_cancelled_callback is not None and is_cancelled_callback():
|
||||
return None
|
||||
|
||||
latents = 1 / 0.18215 * latents
|
||||
# image = self.vae_decoder(latent_sample=latents)[0]
|
||||
# it seems likes there is a problem for using half-precision vae decoder if batchsize>1
|
||||
image = []
|
||||
for i in range(latents.shape[0]):
|
||||
image.append(self.vae_decoder(latent_sample=latents[i : i + 1])[0])
|
||||
image = np.concatenate(image)
|
||||
|
||||
image = np.clip(image / 2 + 0.5, 0, 1)
|
||||
image = image.transpose((0, 2, 3, 1))
|
||||
|
||||
if self.safety_checker is not None:
|
||||
safety_checker_input = self.feature_extractor(
|
||||
self.numpy_to_pil(image), return_tensors="np"
|
||||
).pixel_values.astype(image.dtype)
|
||||
# There will throw an error if use safety_checker directly and batchsize>1
|
||||
images, has_nsfw_concept = [], []
|
||||
for i in range(image.shape[0]):
|
||||
image_i, has_nsfw_concept_i = self.safety_checker(
|
||||
clip_input=safety_checker_input[i : i + 1], images=image[i : i + 1]
|
||||
)
|
||||
images.append(image_i)
|
||||
has_nsfw_concept.append(has_nsfw_concept_i)
|
||||
image = np.concatenate(images)
|
||||
else:
|
||||
has_nsfw_concept = None
|
||||
|
||||
if output_type == "pil":
|
||||
image = self.numpy_to_pil(image)
|
||||
|
||||
if not return_dict:
|
||||
return (image, has_nsfw_concept)
|
||||
|
||||
return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
|
||||
|
||||
def text2img(
|
||||
self,
|
||||
prompt: Union[str, List[str]],
|
||||
negative_prompt: Optional[Union[str, List[str]]] = None,
|
||||
height: int = 512,
|
||||
width: int = 512,
|
||||
num_inference_steps: int = 50,
|
||||
guidance_scale: float = 7.5,
|
||||
num_images_per_prompt: Optional[int] = 1,
|
||||
eta: float = 0.0,
|
||||
generator: Optional[np.random.RandomState] = None,
|
||||
latents: Optional[np.ndarray] = None,
|
||||
max_embeddings_multiples: Optional[int] = 3,
|
||||
output_type: Optional[str] = "pil",
|
||||
return_dict: bool = True,
|
||||
callback: Optional[Callable[[int, int, np.ndarray], None]] = None,
|
||||
callback_steps: Optional[int] = 1,
|
||||
**kwargs,
|
||||
):
|
||||
r"""
|
||||
Function for text-to-image generation.
|
||||
Args:
|
||||
prompt (`str` or `List[str]`):
|
||||
The prompt or prompts to guide the image generation.
|
||||
negative_prompt (`str` or `List[str]`, *optional*):
|
||||
The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
|
||||
if `guidance_scale` is less than `1`).
|
||||
height (`int`, *optional*, defaults to 512):
|
||||
The height in pixels of the generated image.
|
||||
width (`int`, *optional*, defaults to 512):
|
||||
The width in pixels of the generated image.
|
||||
num_inference_steps (`int`, *optional*, defaults to 50):
|
||||
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
|
||||
expense of slower inference.
|
||||
guidance_scale (`float`, *optional*, defaults to 7.5):
|
||||
Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
|
||||
`guidance_scale` is defined as `w` of equation 2. of [Imagen
|
||||
Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
|
||||
1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
|
||||
usually at the expense of lower image quality.
|
||||
num_images_per_prompt (`int`, *optional*, defaults to 1):
|
||||
The number of images to generate per prompt.
|
||||
eta (`float`, *optional*, defaults to 0.0):
|
||||
Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
|
||||
[`schedulers.DDIMScheduler`], will be ignored for others.
|
||||
generator (`np.random.RandomState`, *optional*):
|
||||
A np.random.RandomState to make generation deterministic.
|
||||
latents (`np.ndarray`, *optional*):
|
||||
Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
|
||||
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
|
||||
tensor will ge generated by sampling using the supplied random `generator`.
|
||||
max_embeddings_multiples (`int`, *optional*, defaults to `3`):
|
||||
The max multiple length of prompt embeddings compared to the max output length of text encoder.
|
||||
output_type (`str`, *optional*, defaults to `"pil"`):
|
||||
The output format of the generate image. Choose between
|
||||
[PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
|
||||
return_dict (`bool`, *optional*, defaults to `True`):
|
||||
Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
|
||||
plain tuple.
|
||||
callback (`Callable`, *optional*):
|
||||
A function that will be called every `callback_steps` steps during inference. The function will be
|
||||
called with the following arguments: `callback(step: int, timestep: int, latents: np.ndarray)`.
|
||||
callback_steps (`int`, *optional*, defaults to 1):
|
||||
The frequency at which the `callback` function will be called. If not specified, the callback will be
|
||||
called at every step.
|
||||
Returns:
|
||||
[`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
|
||||
[`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
|
||||
When returning a tuple, the first element is a list with the generated images, and the second element is a
|
||||
list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
|
||||
(nsfw) content, according to the `safety_checker`.
|
||||
"""
|
||||
return self.__call__(
|
||||
prompt=prompt,
|
||||
negative_prompt=negative_prompt,
|
||||
height=height,
|
||||
width=width,
|
||||
num_inference_steps=num_inference_steps,
|
||||
guidance_scale=guidance_scale,
|
||||
num_images_per_prompt=num_images_per_prompt,
|
||||
eta=eta,
|
||||
generator=generator,
|
||||
latents=latents,
|
||||
max_embeddings_multiples=max_embeddings_multiples,
|
||||
output_type=output_type,
|
||||
return_dict=return_dict,
|
||||
callback=callback,
|
||||
callback_steps=callback_steps,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
def img2img(
|
||||
self,
|
||||
init_image: Union[np.ndarray, PIL.Image.Image],
|
||||
prompt: Union[str, List[str]],
|
||||
negative_prompt: Optional[Union[str, List[str]]] = None,
|
||||
strength: float = 0.8,
|
||||
num_inference_steps: Optional[int] = 50,
|
||||
guidance_scale: Optional[float] = 7.5,
|
||||
num_images_per_prompt: Optional[int] = 1,
|
||||
eta: Optional[float] = 0.0,
|
||||
generator: Optional[np.random.RandomState] = None,
|
||||
max_embeddings_multiples: Optional[int] = 3,
|
||||
output_type: Optional[str] = "pil",
|
||||
return_dict: bool = True,
|
||||
callback: Optional[Callable[[int, int, np.ndarray], None]] = None,
|
||||
callback_steps: Optional[int] = 1,
|
||||
**kwargs,
|
||||
):
|
||||
r"""
|
||||
Function for image-to-image generation.
|
||||
Args:
|
||||
init_image (`np.ndarray` or `PIL.Image.Image`):
|
||||
`Image`, or ndarray representing an image batch, that will be used as the starting point for the
|
||||
process.
|
||||
prompt (`str` or `List[str]`):
|
||||
The prompt or prompts to guide the image generation.
|
||||
negative_prompt (`str` or `List[str]`, *optional*):
|
||||
The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
|
||||
if `guidance_scale` is less than `1`).
|
||||
strength (`float`, *optional*, defaults to 0.8):
|
||||
Conceptually, indicates how much to transform the reference `init_image`. Must be between 0 and 1.
|
||||
`init_image` will be used as a starting point, adding more noise to it the larger the `strength`. The
|
||||
number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added
|
||||
noise will be maximum and the denoising process will run for the full number of iterations specified in
|
||||
`num_inference_steps`. A value of 1, therefore, essentially ignores `init_image`.
|
||||
num_inference_steps (`int`, *optional*, defaults to 50):
|
||||
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
|
||||
expense of slower inference. This parameter will be modulated by `strength`.
|
||||
guidance_scale (`float`, *optional*, defaults to 7.5):
|
||||
Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
|
||||
`guidance_scale` is defined as `w` of equation 2. of [Imagen
|
||||
Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
|
||||
1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
|
||||
usually at the expense of lower image quality.
|
||||
num_images_per_prompt (`int`, *optional*, defaults to 1):
|
||||
The number of images to generate per prompt.
|
||||
eta (`float`, *optional*, defaults to 0.0):
|
||||
Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
|
||||
[`schedulers.DDIMScheduler`], will be ignored for others.
|
||||
generator (`np.random.RandomState`, *optional*):
|
||||
A np.random.RandomState to make generation deterministic.
|
||||
max_embeddings_multiples (`int`, *optional*, defaults to `3`):
|
||||
The max multiple length of prompt embeddings compared to the max output length of text encoder.
|
||||
output_type (`str`, *optional*, defaults to `"pil"`):
|
||||
The output format of the generate image. Choose between
|
||||
[PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
|
||||
return_dict (`bool`, *optional*, defaults to `True`):
|
||||
Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
|
||||
plain tuple.
|
||||
callback (`Callable`, *optional*):
|
||||
A function that will be called every `callback_steps` steps during inference. The function will be
|
||||
called with the following arguments: `callback(step: int, timestep: int, latents: np.ndarray)`.
|
||||
callback_steps (`int`, *optional*, defaults to 1):
|
||||
The frequency at which the `callback` function will be called. If not specified, the callback will be
|
||||
called at every step.
|
||||
Returns:
|
||||
[`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
|
||||
[`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
|
||||
When returning a tuple, the first element is a list with the generated images, and the second element is a
|
||||
list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
|
||||
(nsfw) content, according to the `safety_checker`.
|
||||
"""
|
||||
return self.__call__(
|
||||
prompt=prompt,
|
||||
negative_prompt=negative_prompt,
|
||||
init_image=init_image,
|
||||
num_inference_steps=num_inference_steps,
|
||||
guidance_scale=guidance_scale,
|
||||
strength=strength,
|
||||
num_images_per_prompt=num_images_per_prompt,
|
||||
eta=eta,
|
||||
generator=generator,
|
||||
max_embeddings_multiples=max_embeddings_multiples,
|
||||
output_type=output_type,
|
||||
return_dict=return_dict,
|
||||
callback=callback,
|
||||
callback_steps=callback_steps,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
def inpaint(
|
||||
self,
|
||||
init_image: Union[np.ndarray, PIL.Image.Image],
|
||||
mask_image: Union[np.ndarray, PIL.Image.Image],
|
||||
prompt: Union[str, List[str]],
|
||||
negative_prompt: Optional[Union[str, List[str]]] = None,
|
||||
strength: float = 0.8,
|
||||
num_inference_steps: Optional[int] = 50,
|
||||
guidance_scale: Optional[float] = 7.5,
|
||||
num_images_per_prompt: Optional[int] = 1,
|
||||
eta: Optional[float] = 0.0,
|
||||
generator: Optional[np.random.RandomState] = None,
|
||||
max_embeddings_multiples: Optional[int] = 3,
|
||||
output_type: Optional[str] = "pil",
|
||||
return_dict: bool = True,
|
||||
callback: Optional[Callable[[int, int, np.ndarray], None]] = None,
|
||||
callback_steps: Optional[int] = 1,
|
||||
**kwargs,
|
||||
):
|
||||
r"""
|
||||
Function for inpaint.
|
||||
Args:
|
||||
init_image (`np.ndarray` or `PIL.Image.Image`):
|
||||
`Image`, or tensor representing an image batch, that will be used as the starting point for the
|
||||
process. This is the image whose masked region will be inpainted.
|
||||
mask_image (`np.ndarray` or `PIL.Image.Image`):
|
||||
`Image`, or tensor representing an image batch, to mask `init_image`. White pixels in the mask will be
|
||||
replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a
|
||||
PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should
|
||||
contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`.
|
||||
prompt (`str` or `List[str]`):
|
||||
The prompt or prompts to guide the image generation.
|
||||
negative_prompt (`str` or `List[str]`, *optional*):
|
||||
The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
|
||||
if `guidance_scale` is less than `1`).
|
||||
strength (`float`, *optional*, defaults to 0.8):
|
||||
Conceptually, indicates how much to inpaint the masked area. Must be between 0 and 1. When `strength`
|
||||
is 1, the denoising process will be run on the masked area for the full number of iterations specified
|
||||
in `num_inference_steps`. `init_image` will be used as a reference for the masked area, adding more
|
||||
noise to that region the larger the `strength`. If `strength` is 0, no inpainting will occur.
|
||||
num_inference_steps (`int`, *optional*, defaults to 50):
|
||||
The reference number of denoising steps. More denoising steps usually lead to a higher quality image at
|
||||
the expense of slower inference. This parameter will be modulated by `strength`, as explained above.
|
||||
guidance_scale (`float`, *optional*, defaults to 7.5):
|
||||
Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
|
||||
`guidance_scale` is defined as `w` of equation 2. of [Imagen
|
||||
Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
|
||||
1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
|
||||
usually at the expense of lower image quality.
|
||||
num_images_per_prompt (`int`, *optional*, defaults to 1):
|
||||
The number of images to generate per prompt.
|
||||
eta (`float`, *optional*, defaults to 0.0):
|
||||
Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
|
||||
[`schedulers.DDIMScheduler`], will be ignored for others.
|
||||
generator (`np.random.RandomState`, *optional*):
|
||||
A np.random.RandomState to make generation deterministic.
|
||||
max_embeddings_multiples (`int`, *optional*, defaults to `3`):
|
||||
The max multiple length of prompt embeddings compared to the max output length of text encoder.
|
||||
output_type (`str`, *optional*, defaults to `"pil"`):
|
||||
The output format of the generate image. Choose between
|
||||
[PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
|
||||
return_dict (`bool`, *optional*, defaults to `True`):
|
||||
Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
|
||||
plain tuple.
|
||||
callback (`Callable`, *optional*):
|
||||
A function that will be called every `callback_steps` steps during inference. The function will be
|
||||
called with the following arguments: `callback(step: int, timestep: int, latents: np.ndarray)`.
|
||||
callback_steps (`int`, *optional*, defaults to 1):
|
||||
The frequency at which the `callback` function will be called. If not specified, the callback will be
|
||||
called at every step.
|
||||
Returns:
|
||||
[`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
|
||||
[`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
|
||||
When returning a tuple, the first element is a list with the generated images, and the second element is a
|
||||
list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
|
||||
(nsfw) content, according to the `safety_checker`.
|
||||
"""
|
||||
return self.__call__(
|
||||
prompt=prompt,
|
||||
negative_prompt=negative_prompt,
|
||||
init_image=init_image,
|
||||
mask_image=mask_image,
|
||||
num_inference_steps=num_inference_steps,
|
||||
guidance_scale=guidance_scale,
|
||||
strength=strength,
|
||||
num_images_per_prompt=num_images_per_prompt,
|
||||
eta=eta,
|
||||
generator=generator,
|
||||
max_embeddings_multiples=max_embeddings_multiples,
|
||||
output_type=output_type,
|
||||
return_dict=return_dict,
|
||||
callback=callback,
|
||||
callback_steps=callback_steps,
|
||||
**kwargs,
|
||||
)
|
||||
@@ -1,22 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
import torch
|
||||
|
||||
from diffusers import DiffusionPipeline
|
||||
|
||||
|
||||
class UnetSchedulerOneForwardPipeline(DiffusionPipeline):
|
||||
def __init__(self, unet, scheduler):
|
||||
super().__init__()
|
||||
|
||||
self.register_modules(unet=unet, scheduler=scheduler)
|
||||
|
||||
def __call__(self):
|
||||
image = torch.randn(
|
||||
(1, self.unet.in_channels, self.unet.sample_size, self.unet.sample_size),
|
||||
)
|
||||
timestep = 1
|
||||
|
||||
model_output = self.unet(image, timestep).sample
|
||||
scheduler_output = self.scheduler.step(model_output, timestep, image).prev_sample
|
||||
|
||||
return scheduler_output
|
||||
@@ -1,366 +0,0 @@
|
||||
"""
|
||||
modified based on diffusion library from Huggingface: https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
|
||||
"""
|
||||
import inspect
|
||||
from typing import Callable, List, Optional, Union
|
||||
|
||||
import torch
|
||||
|
||||
from diffusers.models import AutoencoderKL, UNet2DConditionModel
|
||||
from diffusers.pipeline_utils import DiffusionPipeline
|
||||
from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
|
||||
from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
|
||||
from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
|
||||
from diffusers.utils import logging
|
||||
from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
|
||||
|
||||
|
||||
class SeedResizeStableDiffusionPipeline(DiffusionPipeline):
|
||||
r"""
|
||||
Pipeline for text-to-image generation using Stable Diffusion.
|
||||
|
||||
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
|
||||
library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
|
||||
|
||||
Args:
|
||||
vae ([`AutoencoderKL`]):
|
||||
Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
|
||||
text_encoder ([`CLIPTextModel`]):
|
||||
Frozen text-encoder. Stable Diffusion uses the text portion of
|
||||
[CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
|
||||
the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
|
||||
tokenizer (`CLIPTokenizer`):
|
||||
Tokenizer of class
|
||||
[CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
|
||||
unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
|
||||
scheduler ([`SchedulerMixin`]):
|
||||
A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of
|
||||
[`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
|
||||
safety_checker ([`StableDiffusionSafetyChecker`]):
|
||||
Classification module that estimates whether generated images could be considered offensive or harmful.
|
||||
Please, refer to the [model card](https://huggingface.co/CompVis/stable-diffusion-v1-4) for details.
|
||||
feature_extractor ([`CLIPFeatureExtractor`]):
|
||||
Model that extracts features from generated images to be used as inputs for the `safety_checker`.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vae: AutoencoderKL,
|
||||
text_encoder: CLIPTextModel,
|
||||
tokenizer: CLIPTokenizer,
|
||||
unet: UNet2DConditionModel,
|
||||
scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
|
||||
safety_checker: StableDiffusionSafetyChecker,
|
||||
feature_extractor: CLIPFeatureExtractor,
|
||||
):
|
||||
super().__init__()
|
||||
self.register_modules(
|
||||
vae=vae,
|
||||
text_encoder=text_encoder,
|
||||
tokenizer=tokenizer,
|
||||
unet=unet,
|
||||
scheduler=scheduler,
|
||||
safety_checker=safety_checker,
|
||||
feature_extractor=feature_extractor,
|
||||
)
|
||||
|
||||
def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
|
||||
r"""
|
||||
Enable sliced attention computation.
|
||||
|
||||
When this option is enabled, the attention module will split the input tensor in slices, to compute attention
|
||||
in several steps. This is useful to save some memory in exchange for a small speed decrease.
|
||||
|
||||
Args:
|
||||
slice_size (`str` or `int`, *optional*, defaults to `"auto"`):
|
||||
When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If
|
||||
a number is provided, uses as many slices as `attention_head_dim // slice_size`. In this case,
|
||||
`attention_head_dim` must be a multiple of `slice_size`.
|
||||
"""
|
||||
if slice_size == "auto":
|
||||
# half the attention head size is usually a good trade-off between
|
||||
# speed and memory
|
||||
slice_size = self.unet.config.attention_head_dim // 2
|
||||
self.unet.set_attention_slice(slice_size)
|
||||
|
||||
def disable_attention_slicing(self):
|
||||
r"""
|
||||
Disable sliced attention computation. If `enable_attention_slicing` was previously invoked, this method will go
|
||||
back to computing attention in one step.
|
||||
"""
|
||||
# set slice_size = `None` to disable `attention slicing`
|
||||
self.enable_attention_slicing(None)
|
||||
|
||||
@torch.no_grad()
|
||||
def __call__(
|
||||
self,
|
||||
prompt: Union[str, List[str]],
|
||||
height: int = 512,
|
||||
width: int = 512,
|
||||
num_inference_steps: int = 50,
|
||||
guidance_scale: float = 7.5,
|
||||
negative_prompt: Optional[Union[str, List[str]]] = None,
|
||||
num_images_per_prompt: Optional[int] = 1,
|
||||
eta: float = 0.0,
|
||||
generator: Optional[torch.Generator] = None,
|
||||
latents: Optional[torch.FloatTensor] = None,
|
||||
output_type: Optional[str] = "pil",
|
||||
return_dict: bool = True,
|
||||
callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
|
||||
callback_steps: Optional[int] = 1,
|
||||
text_embeddings: Optional[torch.FloatTensor] = None,
|
||||
**kwargs,
|
||||
):
|
||||
r"""
|
||||
Function invoked when calling the pipeline for generation.
|
||||
|
||||
Args:
|
||||
prompt (`str` or `List[str]`):
|
||||
The prompt or prompts to guide the image generation.
|
||||
height (`int`, *optional*, defaults to 512):
|
||||
The height in pixels of the generated image.
|
||||
width (`int`, *optional*, defaults to 512):
|
||||
The width in pixels of the generated image.
|
||||
num_inference_steps (`int`, *optional*, defaults to 50):
|
||||
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
|
||||
expense of slower inference.
|
||||
guidance_scale (`float`, *optional*, defaults to 7.5):
|
||||
Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
|
||||
`guidance_scale` is defined as `w` of equation 2. of [Imagen
|
||||
Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
|
||||
1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
|
||||
usually at the expense of lower image quality.
|
||||
negative_prompt (`str` or `List[str]`, *optional*):
|
||||
The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
|
||||
if `guidance_scale` is less than `1`).
|
||||
num_images_per_prompt (`int`, *optional*, defaults to 1):
|
||||
The number of images to generate per prompt.
|
||||
eta (`float`, *optional*, defaults to 0.0):
|
||||
Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
|
||||
[`schedulers.DDIMScheduler`], will be ignored for others.
|
||||
generator (`torch.Generator`, *optional*):
|
||||
A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
|
||||
deterministic.
|
||||
latents (`torch.FloatTensor`, *optional*):
|
||||
Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
|
||||
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
|
||||
tensor will ge generated by sampling using the supplied random `generator`.
|
||||
output_type (`str`, *optional*, defaults to `"pil"`):
|
||||
The output format of the generate image. Choose between
|
||||
[PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
|
||||
return_dict (`bool`, *optional*, defaults to `True`):
|
||||
Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
|
||||
plain tuple.
|
||||
callback (`Callable`, *optional*):
|
||||
A function that will be called every `callback_steps` steps during inference. The function will be
|
||||
called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
|
||||
callback_steps (`int`, *optional*, defaults to 1):
|
||||
The frequency at which the `callback` function will be called. If not specified, the callback will be
|
||||
called at every step.
|
||||
|
||||
Returns:
|
||||
[`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
|
||||
[`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
|
||||
When returning a tuple, the first element is a list with the generated images, and the second element is a
|
||||
list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
|
||||
(nsfw) content, according to the `safety_checker`.
|
||||
"""
|
||||
|
||||
if isinstance(prompt, str):
|
||||
batch_size = 1
|
||||
elif isinstance(prompt, list):
|
||||
batch_size = len(prompt)
|
||||
else:
|
||||
raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
|
||||
|
||||
if height % 8 != 0 or width % 8 != 0:
|
||||
raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
|
||||
|
||||
if (callback_steps is None) or (
|
||||
callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
|
||||
):
|
||||
raise ValueError(
|
||||
f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
|
||||
f" {type(callback_steps)}."
|
||||
)
|
||||
|
||||
# get prompt text embeddings
|
||||
text_inputs = self.tokenizer(
|
||||
prompt,
|
||||
padding="max_length",
|
||||
max_length=self.tokenizer.model_max_length,
|
||||
return_tensors="pt",
|
||||
)
|
||||
text_input_ids = text_inputs.input_ids
|
||||
|
||||
if text_input_ids.shape[-1] > self.tokenizer.model_max_length:
|
||||
removed_text = self.tokenizer.batch_decode(text_input_ids[:, self.tokenizer.model_max_length :])
|
||||
logger.warning(
|
||||
"The following part of your input was truncated because CLIP can only handle sequences up to"
|
||||
f" {self.tokenizer.model_max_length} tokens: {removed_text}"
|
||||
)
|
||||
text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
|
||||
|
||||
if text_embeddings is None:
|
||||
text_embeddings = self.text_encoder(text_input_ids.to(self.device))[0]
|
||||
|
||||
# duplicate text embeddings for each generation per prompt, using mps friendly method
|
||||
bs_embed, seq_len, _ = text_embeddings.shape
|
||||
text_embeddings = text_embeddings.repeat(1, num_images_per_prompt, 1)
|
||||
text_embeddings = text_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)
|
||||
|
||||
# here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
|
||||
# of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
|
||||
# corresponds to doing no classifier free guidance.
|
||||
do_classifier_free_guidance = guidance_scale > 1.0
|
||||
# get unconditional embeddings for classifier free guidance
|
||||
if do_classifier_free_guidance:
|
||||
uncond_tokens: List[str]
|
||||
if negative_prompt is None:
|
||||
uncond_tokens = [""]
|
||||
elif type(prompt) is not type(negative_prompt):
|
||||
raise TypeError(
|
||||
f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
|
||||
f" {type(prompt)}."
|
||||
)
|
||||
elif isinstance(negative_prompt, str):
|
||||
uncond_tokens = [negative_prompt]
|
||||
elif batch_size != len(negative_prompt):
|
||||
raise ValueError(
|
||||
f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
|
||||
f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
|
||||
" the batch size of `prompt`."
|
||||
)
|
||||
else:
|
||||
uncond_tokens = negative_prompt
|
||||
|
||||
max_length = text_input_ids.shape[-1]
|
||||
uncond_input = self.tokenizer(
|
||||
uncond_tokens,
|
||||
padding="max_length",
|
||||
max_length=max_length,
|
||||
truncation=True,
|
||||
return_tensors="pt",
|
||||
)
|
||||
uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(self.device))[0]
|
||||
|
||||
# duplicate unconditional embeddings for each generation per prompt, using mps friendly method
|
||||
seq_len = uncond_embeddings.shape[1]
|
||||
uncond_embeddings = uncond_embeddings.repeat(batch_size, num_images_per_prompt, 1)
|
||||
uncond_embeddings = uncond_embeddings.view(batch_size * num_images_per_prompt, seq_len, -1)
|
||||
|
||||
# For classifier free guidance, we need to do two forward passes.
|
||||
# Here we concatenate the unconditional and text embeddings into a single batch
|
||||
# to avoid doing two forward passes
|
||||
text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
|
||||
|
||||
# get the initial random noise unless the user supplied it
|
||||
|
||||
# Unlike in other pipelines, latents need to be generated in the target device
|
||||
# for 1-to-1 results reproducibility with the CompVis implementation.
|
||||
# However this currently doesn't work in `mps`.
|
||||
latents_shape = (batch_size * num_images_per_prompt, self.unet.in_channels, height // 8, width // 8)
|
||||
latents_shape_reference = (batch_size * num_images_per_prompt, self.unet.in_channels, 64, 64)
|
||||
latents_dtype = text_embeddings.dtype
|
||||
if latents is None:
|
||||
if self.device.type == "mps":
|
||||
# randn does not exist on mps
|
||||
latents_reference = torch.randn(
|
||||
latents_shape_reference, generator=generator, device="cpu", dtype=latents_dtype
|
||||
).to(self.device)
|
||||
latents = torch.randn(latents_shape, generator=generator, device="cpu", dtype=latents_dtype).to(
|
||||
self.device
|
||||
)
|
||||
else:
|
||||
latents_reference = torch.randn(
|
||||
latents_shape_reference, generator=generator, device=self.device, dtype=latents_dtype
|
||||
)
|
||||
latents = torch.randn(latents_shape, generator=generator, device=self.device, dtype=latents_dtype)
|
||||
else:
|
||||
if latents_reference.shape != latents_shape:
|
||||
raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}")
|
||||
latents_reference = latents_reference.to(self.device)
|
||||
latents = latents.to(self.device)
|
||||
|
||||
# This is the key part of the pipeline where we
|
||||
# try to ensure that the generated images w/ the same seed
|
||||
# but different sizes actually result in similar images
|
||||
dx = (latents_shape[3] - latents_shape_reference[3]) // 2
|
||||
dy = (latents_shape[2] - latents_shape_reference[2]) // 2
|
||||
w = latents_shape_reference[3] if dx >= 0 else latents_shape_reference[3] + 2 * dx
|
||||
h = latents_shape_reference[2] if dy >= 0 else latents_shape_reference[2] + 2 * dy
|
||||
tx = 0 if dx < 0 else dx
|
||||
ty = 0 if dy < 0 else dy
|
||||
dx = max(-dx, 0)
|
||||
dy = max(-dy, 0)
|
||||
# import pdb
|
||||
# pdb.set_trace()
|
||||
latents[:, :, ty : ty + h, tx : tx + w] = latents_reference[:, :, dy : dy + h, dx : dx + w]
|
||||
|
||||
# set timesteps
|
||||
self.scheduler.set_timesteps(num_inference_steps)
|
||||
|
||||
# Some schedulers like PNDM have timesteps as arrays
|
||||
# It's more optimized to move all timesteps to correct device beforehand
|
||||
timesteps_tensor = self.scheduler.timesteps.to(self.device)
|
||||
|
||||
# scale the initial noise by the standard deviation required by the scheduler
|
||||
latents = latents * self.scheduler.init_noise_sigma
|
||||
|
||||
# prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
|
||||
# eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
|
||||
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
|
||||
# and should be between [0, 1]
|
||||
accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
|
||||
extra_step_kwargs = {}
|
||||
if accepts_eta:
|
||||
extra_step_kwargs["eta"] = eta
|
||||
|
||||
for i, t in enumerate(self.progress_bar(timesteps_tensor)):
|
||||
# expand the latents if we are doing classifier free guidance
|
||||
latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
|
||||
latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
|
||||
|
||||
# predict the noise residual
|
||||
noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample
|
||||
|
||||
# perform guidance
|
||||
if do_classifier_free_guidance:
|
||||
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
|
||||
noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
|
||||
|
||||
# compute the previous noisy sample x_t -> x_t-1
|
||||
latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
|
||||
|
||||
# call the callback, if provided
|
||||
if callback is not None and i % callback_steps == 0:
|
||||
callback(i, t, latents)
|
||||
|
||||
latents = 1 / 0.18215 * latents
|
||||
image = self.vae.decode(latents).sample
|
||||
|
||||
image = (image / 2 + 0.5).clamp(0, 1)
|
||||
|
||||
# we always cast to float32 as this does not cause significant overhead and is compatible with bfloa16
|
||||
image = image.cpu().permute(0, 2, 3, 1).float().numpy()
|
||||
|
||||
if self.safety_checker is not None:
|
||||
safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(
|
||||
self.device
|
||||
)
|
||||
image, has_nsfw_concept = self.safety_checker(
|
||||
images=image, clip_input=safety_checker_input.pixel_values.to(text_embeddings.dtype)
|
||||
)
|
||||
else:
|
||||
has_nsfw_concept = None
|
||||
|
||||
if output_type == "pil":
|
||||
image = self.numpy_to_pil(image)
|
||||
|
||||
if not return_dict:
|
||||
return (image, has_nsfw_concept)
|
||||
|
||||
return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
|
||||
@@ -1,261 +0,0 @@
|
||||
import inspect
|
||||
from typing import Callable, List, Optional, Union
|
||||
|
||||
import torch
|
||||
|
||||
from diffusers import (
|
||||
AutoencoderKL,
|
||||
DDIMScheduler,
|
||||
DiffusionPipeline,
|
||||
LMSDiscreteScheduler,
|
||||
PNDMScheduler,
|
||||
UNet2DConditionModel,
|
||||
)
|
||||
from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import StableDiffusionPipelineOutput
|
||||
from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
|
||||
from diffusers.utils import logging
|
||||
from transformers import (
|
||||
CLIPFeatureExtractor,
|
||||
CLIPTextModel,
|
||||
CLIPTokenizer,
|
||||
WhisperForConditionalGeneration,
|
||||
WhisperProcessor,
|
||||
)
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
|
||||
|
||||
|
||||
class SpeechToImagePipeline(DiffusionPipeline):
|
||||
def __init__(
|
||||
self,
|
||||
speech_model: WhisperForConditionalGeneration,
|
||||
speech_processor: WhisperProcessor,
|
||||
vae: AutoencoderKL,
|
||||
text_encoder: CLIPTextModel,
|
||||
tokenizer: CLIPTokenizer,
|
||||
unet: UNet2DConditionModel,
|
||||
scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
|
||||
safety_checker: StableDiffusionSafetyChecker,
|
||||
feature_extractor: CLIPFeatureExtractor,
|
||||
):
|
||||
super().__init__()
|
||||
|
||||
if safety_checker is None:
|
||||
logger.warn(
|
||||
f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
|
||||
" that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
|
||||
" results in services or applications open to the public. Both the diffusers team and Hugging Face"
|
||||
" strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
|
||||
" it only for use-cases that involve analyzing network behavior or auditing its results. For more"
|
||||
" information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
|
||||
)
|
||||
|
||||
self.register_modules(
|
||||
speech_model=speech_model,
|
||||
speech_processor=speech_processor,
|
||||
vae=vae,
|
||||
text_encoder=text_encoder,
|
||||
tokenizer=tokenizer,
|
||||
unet=unet,
|
||||
scheduler=scheduler,
|
||||
feature_extractor=feature_extractor,
|
||||
)
|
||||
|
||||
def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
|
||||
if slice_size == "auto":
|
||||
slice_size = self.unet.config.attention_head_dim // 2
|
||||
self.unet.set_attention_slice(slice_size)
|
||||
|
||||
def disable_attention_slicing(self):
|
||||
self.enable_attention_slicing(None)
|
||||
|
||||
@torch.no_grad()
|
||||
def __call__(
|
||||
self,
|
||||
audio,
|
||||
sampling_rate=16_000,
|
||||
height: int = 512,
|
||||
width: int = 512,
|
||||
num_inference_steps: int = 50,
|
||||
guidance_scale: float = 7.5,
|
||||
negative_prompt: Optional[Union[str, List[str]]] = None,
|
||||
num_images_per_prompt: Optional[int] = 1,
|
||||
eta: float = 0.0,
|
||||
generator: Optional[torch.Generator] = None,
|
||||
latents: Optional[torch.FloatTensor] = None,
|
||||
output_type: Optional[str] = "pil",
|
||||
return_dict: bool = True,
|
||||
callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
|
||||
callback_steps: Optional[int] = 1,
|
||||
**kwargs,
|
||||
):
|
||||
inputs = self.speech_processor.feature_extractor(
|
||||
audio, return_tensors="pt", sampling_rate=sampling_rate
|
||||
).input_features.to(self.device)
|
||||
predicted_ids = self.speech_model.generate(inputs, max_length=480_000)
|
||||
|
||||
prompt = self.speech_processor.tokenizer.batch_decode(predicted_ids, skip_special_tokens=True, normalize=True)[
|
||||
0
|
||||
]
|
||||
|
||||
if isinstance(prompt, str):
|
||||
batch_size = 1
|
||||
elif isinstance(prompt, list):
|
||||
batch_size = len(prompt)
|
||||
else:
|
||||
raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
|
||||
|
||||
if height % 8 != 0 or width % 8 != 0:
|
||||
raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
|
||||
|
||||
if (callback_steps is None) or (
|
||||
callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
|
||||
):
|
||||
raise ValueError(
|
||||
f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
|
||||
f" {type(callback_steps)}."
|
||||
)
|
||||
|
||||
# get prompt text embeddings
|
||||
text_inputs = self.tokenizer(
|
||||
prompt,
|
||||
padding="max_length",
|
||||
max_length=self.tokenizer.model_max_length,
|
||||
return_tensors="pt",
|
||||
)
|
||||
text_input_ids = text_inputs.input_ids
|
||||
|
||||
if text_input_ids.shape[-1] > self.tokenizer.model_max_length:
|
||||
removed_text = self.tokenizer.batch_decode(text_input_ids[:, self.tokenizer.model_max_length :])
|
||||
logger.warning(
|
||||
"The following part of your input was truncated because CLIP can only handle sequences up to"
|
||||
f" {self.tokenizer.model_max_length} tokens: {removed_text}"
|
||||
)
|
||||
text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
|
||||
text_embeddings = self.text_encoder(text_input_ids.to(self.device))[0]
|
||||
|
||||
# duplicate text embeddings for each generation per prompt, using mps friendly method
|
||||
bs_embed, seq_len, _ = text_embeddings.shape
|
||||
text_embeddings = text_embeddings.repeat(1, num_images_per_prompt, 1)
|
||||
text_embeddings = text_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)
|
||||
|
||||
# here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
|
||||
# of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
|
||||
# corresponds to doing no classifier free guidance.
|
||||
do_classifier_free_guidance = guidance_scale > 1.0
|
||||
# get unconditional embeddings for classifier free guidance
|
||||
if do_classifier_free_guidance:
|
||||
uncond_tokens: List[str]
|
||||
if negative_prompt is None:
|
||||
uncond_tokens = [""] * batch_size
|
||||
elif type(prompt) is not type(negative_prompt):
|
||||
raise TypeError(
|
||||
f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
|
||||
f" {type(prompt)}."
|
||||
)
|
||||
elif isinstance(negative_prompt, str):
|
||||
uncond_tokens = [negative_prompt]
|
||||
elif batch_size != len(negative_prompt):
|
||||
raise ValueError(
|
||||
f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
|
||||
f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
|
||||
" the batch size of `prompt`."
|
||||
)
|
||||
else:
|
||||
uncond_tokens = negative_prompt
|
||||
|
||||
max_length = text_input_ids.shape[-1]
|
||||
uncond_input = self.tokenizer(
|
||||
uncond_tokens,
|
||||
padding="max_length",
|
||||
max_length=max_length,
|
||||
truncation=True,
|
||||
return_tensors="pt",
|
||||
)
|
||||
uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(self.device))[0]
|
||||
|
||||
# duplicate unconditional embeddings for each generation per prompt, using mps friendly method
|
||||
seq_len = uncond_embeddings.shape[1]
|
||||
uncond_embeddings = uncond_embeddings.repeat(1, num_images_per_prompt, 1)
|
||||
uncond_embeddings = uncond_embeddings.view(batch_size * num_images_per_prompt, seq_len, -1)
|
||||
|
||||
# For classifier free guidance, we need to do two forward passes.
|
||||
# Here we concatenate the unconditional and text embeddings into a single batch
|
||||
# to avoid doing two forward passes
|
||||
text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
|
||||
|
||||
# get the initial random noise unless the user supplied it
|
||||
|
||||
# Unlike in other pipelines, latents need to be generated in the target device
|
||||
# for 1-to-1 results reproducibility with the CompVis implementation.
|
||||
# However this currently doesn't work in `mps`.
|
||||
latents_shape = (batch_size * num_images_per_prompt, self.unet.in_channels, height // 8, width // 8)
|
||||
latents_dtype = text_embeddings.dtype
|
||||
if latents is None:
|
||||
if self.device.type == "mps":
|
||||
# randn does not exist on mps
|
||||
latents = torch.randn(latents_shape, generator=generator, device="cpu", dtype=latents_dtype).to(
|
||||
self.device
|
||||
)
|
||||
else:
|
||||
latents = torch.randn(latents_shape, generator=generator, device=self.device, dtype=latents_dtype)
|
||||
else:
|
||||
if latents.shape != latents_shape:
|
||||
raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}")
|
||||
latents = latents.to(self.device)
|
||||
|
||||
# set timesteps
|
||||
self.scheduler.set_timesteps(num_inference_steps)
|
||||
|
||||
# Some schedulers like PNDM have timesteps as arrays
|
||||
# It's more optimized to move all timesteps to correct device beforehand
|
||||
timesteps_tensor = self.scheduler.timesteps.to(self.device)
|
||||
|
||||
# scale the initial noise by the standard deviation required by the scheduler
|
||||
latents = latents * self.scheduler.init_noise_sigma
|
||||
|
||||
# prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
|
||||
# eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
|
||||
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
|
||||
# and should be between [0, 1]
|
||||
accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
|
||||
extra_step_kwargs = {}
|
||||
if accepts_eta:
|
||||
extra_step_kwargs["eta"] = eta
|
||||
|
||||
for i, t in enumerate(self.progress_bar(timesteps_tensor)):
|
||||
# expand the latents if we are doing classifier free guidance
|
||||
latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
|
||||
latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
|
||||
|
||||
# predict the noise residual
|
||||
noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample
|
||||
|
||||
# perform guidance
|
||||
if do_classifier_free_guidance:
|
||||
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
|
||||
noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
|
||||
|
||||
# compute the previous noisy sample x_t -> x_t-1
|
||||
latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
|
||||
|
||||
# call the callback, if provided
|
||||
if callback is not None and i % callback_steps == 0:
|
||||
callback(i, t, latents)
|
||||
|
||||
latents = 1 / 0.18215 * latents
|
||||
image = self.vae.decode(latents).sample
|
||||
|
||||
image = (image / 2 + 0.5).clamp(0, 1)
|
||||
|
||||
# we always cast to float32 as this does not cause significant overhead and is compatible with bfloa16
|
||||
image = image.cpu().permute(0, 2, 3, 1).float().numpy()
|
||||
|
||||
if output_type == "pil":
|
||||
image = self.numpy_to_pil(image)
|
||||
|
||||
if not return_dict:
|
||||
return image
|
||||
|
||||
return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=None)
|
||||
@@ -1,224 +0,0 @@
|
||||
from typing import Any, Callable, Dict, List, Optional, Union
|
||||
|
||||
import torch
|
||||
|
||||
import PIL.Image
|
||||
from diffusers import (
|
||||
AutoencoderKL,
|
||||
DDIMScheduler,
|
||||
DiffusionPipeline,
|
||||
LMSDiscreteScheduler,
|
||||
PNDMScheduler,
|
||||
StableDiffusionImg2ImgPipeline,
|
||||
StableDiffusionInpaintPipelineLegacy,
|
||||
StableDiffusionPipeline,
|
||||
UNet2DConditionModel,
|
||||
)
|
||||
from diffusers.configuration_utils import FrozenDict
|
||||
from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
|
||||
from diffusers.utils import deprecate, logging
|
||||
from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
|
||||
|
||||
|
||||
class StableDiffusionMegaPipeline(DiffusionPipeline):
|
||||
r"""
|
||||
Pipeline for text-to-image generation using Stable Diffusion.
|
||||
|
||||
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
|
||||
library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
|
||||
|
||||
Args:
|
||||
vae ([`AutoencoderKL`]):
|
||||
Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
|
||||
text_encoder ([`CLIPTextModel`]):
|
||||
Frozen text-encoder. Stable Diffusion uses the text portion of
|
||||
[CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
|
||||
the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
|
||||
tokenizer (`CLIPTokenizer`):
|
||||
Tokenizer of class
|
||||
[CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
|
||||
unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
|
||||
scheduler ([`SchedulerMixin`]):
|
||||
A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of
|
||||
[`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
|
||||
safety_checker ([`StableDiffusionMegaSafetyChecker`]):
|
||||
Classification module that estimates whether generated images could be considered offensive or harmful.
|
||||
Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
|
||||
feature_extractor ([`CLIPFeatureExtractor`]):
|
||||
Model that extracts features from generated images to be used as inputs for the `safety_checker`.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vae: AutoencoderKL,
|
||||
text_encoder: CLIPTextModel,
|
||||
tokenizer: CLIPTokenizer,
|
||||
unet: UNet2DConditionModel,
|
||||
scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
|
||||
safety_checker: StableDiffusionSafetyChecker,
|
||||
feature_extractor: CLIPFeatureExtractor,
|
||||
):
|
||||
super().__init__()
|
||||
if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
|
||||
deprecation_message = (
|
||||
f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
|
||||
f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
|
||||
"to update the config accordingly as leaving `steps_offset` might led to incorrect results"
|
||||
" in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
|
||||
" it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
|
||||
" file"
|
||||
)
|
||||
deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
|
||||
new_config = dict(scheduler.config)
|
||||
new_config["steps_offset"] = 1
|
||||
scheduler._internal_dict = FrozenDict(new_config)
|
||||
|
||||
self.register_modules(
|
||||
vae=vae,
|
||||
text_encoder=text_encoder,
|
||||
tokenizer=tokenizer,
|
||||
unet=unet,
|
||||
scheduler=scheduler,
|
||||
safety_checker=safety_checker,
|
||||
feature_extractor=feature_extractor,
|
||||
)
|
||||
|
||||
@property
|
||||
def components(self) -> Dict[str, Any]:
|
||||
return {k: getattr(self, k) for k in self.config.keys() if not k.startswith("_")}
|
||||
|
||||
def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
|
||||
r"""
|
||||
Enable sliced attention computation.
|
||||
|
||||
When this option is enabled, the attention module will split the input tensor in slices, to compute attention
|
||||
in several steps. This is useful to save some memory in exchange for a small speed decrease.
|
||||
|
||||
Args:
|
||||
slice_size (`str` or `int`, *optional*, defaults to `"auto"`):
|
||||
When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If
|
||||
a number is provided, uses as many slices as `attention_head_dim // slice_size`. In this case,
|
||||
`attention_head_dim` must be a multiple of `slice_size`.
|
||||
"""
|
||||
if slice_size == "auto":
|
||||
# half the attention head size is usually a good trade-off between
|
||||
# speed and memory
|
||||
slice_size = self.unet.config.attention_head_dim // 2
|
||||
self.unet.set_attention_slice(slice_size)
|
||||
|
||||
def disable_attention_slicing(self):
|
||||
r"""
|
||||
Disable sliced attention computation. If `enable_attention_slicing` was previously invoked, this method will go
|
||||
back to computing attention in one step.
|
||||
"""
|
||||
# set slice_size = `None` to disable `attention slicing`
|
||||
self.enable_attention_slicing(None)
|
||||
|
||||
@torch.no_grad()
|
||||
def inpaint(
|
||||
self,
|
||||
prompt: Union[str, List[str]],
|
||||
init_image: Union[torch.FloatTensor, PIL.Image.Image],
|
||||
mask_image: Union[torch.FloatTensor, PIL.Image.Image],
|
||||
strength: float = 0.8,
|
||||
num_inference_steps: Optional[int] = 50,
|
||||
guidance_scale: Optional[float] = 7.5,
|
||||
negative_prompt: Optional[Union[str, List[str]]] = None,
|
||||
num_images_per_prompt: Optional[int] = 1,
|
||||
eta: Optional[float] = 0.0,
|
||||
generator: Optional[torch.Generator] = None,
|
||||
output_type: Optional[str] = "pil",
|
||||
return_dict: bool = True,
|
||||
callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
|
||||
callback_steps: Optional[int] = 1,
|
||||
):
|
||||
# For more information on how this function works, please see: https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion#diffusers.StableDiffusionImg2ImgPipeline
|
||||
return StableDiffusionInpaintPipelineLegacy(**self.components)(
|
||||
prompt=prompt,
|
||||
init_image=init_image,
|
||||
mask_image=mask_image,
|
||||
strength=strength,
|
||||
num_inference_steps=num_inference_steps,
|
||||
guidance_scale=guidance_scale,
|
||||
negative_prompt=negative_prompt,
|
||||
num_images_per_prompt=num_images_per_prompt,
|
||||
eta=eta,
|
||||
generator=generator,
|
||||
output_type=output_type,
|
||||
return_dict=return_dict,
|
||||
callback=callback,
|
||||
)
|
||||
|
||||
@torch.no_grad()
|
||||
def img2img(
|
||||
self,
|
||||
prompt: Union[str, List[str]],
|
||||
init_image: Union[torch.FloatTensor, PIL.Image.Image],
|
||||
strength: float = 0.8,
|
||||
num_inference_steps: Optional[int] = 50,
|
||||
guidance_scale: Optional[float] = 7.5,
|
||||
negative_prompt: Optional[Union[str, List[str]]] = None,
|
||||
num_images_per_prompt: Optional[int] = 1,
|
||||
eta: Optional[float] = 0.0,
|
||||
generator: Optional[torch.Generator] = None,
|
||||
output_type: Optional[str] = "pil",
|
||||
return_dict: bool = True,
|
||||
callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
|
||||
callback_steps: Optional[int] = 1,
|
||||
**kwargs,
|
||||
):
|
||||
# For more information on how this function works, please see: https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion#diffusers.StableDiffusionImg2ImgPipeline
|
||||
return StableDiffusionImg2ImgPipeline(**self.components)(
|
||||
prompt=prompt,
|
||||
init_image=init_image,
|
||||
strength=strength,
|
||||
num_inference_steps=num_inference_steps,
|
||||
guidance_scale=guidance_scale,
|
||||
negative_prompt=negative_prompt,
|
||||
num_images_per_prompt=num_images_per_prompt,
|
||||
eta=eta,
|
||||
generator=generator,
|
||||
output_type=output_type,
|
||||
return_dict=return_dict,
|
||||
callback=callback,
|
||||
callback_steps=callback_steps,
|
||||
)
|
||||
|
||||
@torch.no_grad()
|
||||
def text2img(
|
||||
self,
|
||||
prompt: Union[str, List[str]],
|
||||
height: int = 512,
|
||||
width: int = 512,
|
||||
num_inference_steps: int = 50,
|
||||
guidance_scale: float = 7.5,
|
||||
negative_prompt: Optional[Union[str, List[str]]] = None,
|
||||
num_images_per_prompt: Optional[int] = 1,
|
||||
eta: float = 0.0,
|
||||
generator: Optional[torch.Generator] = None,
|
||||
latents: Optional[torch.FloatTensor] = None,
|
||||
output_type: Optional[str] = "pil",
|
||||
return_dict: bool = True,
|
||||
callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
|
||||
callback_steps: Optional[int] = 1,
|
||||
):
|
||||
# For more information on how this function https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion#diffusers.StableDiffusionPipeline
|
||||
return StableDiffusionPipeline(**self.components)(
|
||||
prompt=prompt,
|
||||
height=height,
|
||||
width=width,
|
||||
num_inference_steps=num_inference_steps,
|
||||
guidance_scale=guidance_scale,
|
||||
negative_prompt=negative_prompt,
|
||||
num_images_per_prompt=num_images_per_prompt,
|
||||
eta=eta,
|
||||
generator=generator,
|
||||
latents=latents,
|
||||
output_type=output_type,
|
||||
return_dict=return_dict,
|
||||
callback=callback,
|
||||
callback_steps=callback_steps,
|
||||
)
|
||||
@@ -1,418 +0,0 @@
|
||||
import inspect
|
||||
import os
|
||||
import random
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
from typing import Callable, Dict, List, Optional, Union
|
||||
|
||||
import torch
|
||||
|
||||
from diffusers.configuration_utils import FrozenDict
|
||||
from diffusers.models import AutoencoderKL, UNet2DConditionModel
|
||||
from diffusers.pipeline_utils import DiffusionPipeline
|
||||
from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import StableDiffusionPipelineOutput
|
||||
from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
|
||||
from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
|
||||
from diffusers.utils import deprecate, logging
|
||||
from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
|
||||
|
||||
global_re_wildcard = re.compile(r"__([^_]*)__")
|
||||
|
||||
|
||||
def get_filename(path: str):
|
||||
# this doesn't work on Windows
|
||||
return os.path.basename(path).split(".txt")[0]
|
||||
|
||||
|
||||
def read_wildcard_values(path: str):
|
||||
with open(path, encoding="utf8") as f:
|
||||
return f.read().splitlines()
|
||||
|
||||
|
||||
def grab_wildcard_values(wildcard_option_dict: Dict[str, List[str]] = {}, wildcard_files: List[str] = []):
|
||||
for wildcard_file in wildcard_files:
|
||||
filename = get_filename(wildcard_file)
|
||||
read_values = read_wildcard_values(wildcard_file)
|
||||
if filename not in wildcard_option_dict:
|
||||
wildcard_option_dict[filename] = []
|
||||
wildcard_option_dict[filename].extend(read_values)
|
||||
return wildcard_option_dict
|
||||
|
||||
|
||||
def replace_prompt_with_wildcards(
|
||||
prompt: str, wildcard_option_dict: Dict[str, List[str]] = {}, wildcard_files: List[str] = []
|
||||
):
|
||||
new_prompt = prompt
|
||||
|
||||
# get wildcard options
|
||||
wildcard_option_dict = grab_wildcard_values(wildcard_option_dict, wildcard_files)
|
||||
|
||||
for m in global_re_wildcard.finditer(new_prompt):
|
||||
wildcard_value = m.group()
|
||||
replace_value = random.choice(wildcard_option_dict[wildcard_value.strip("__")])
|
||||
new_prompt = new_prompt.replace(wildcard_value, replace_value, 1)
|
||||
|
||||
return new_prompt
|
||||
|
||||
|
||||
@dataclass
|
||||
class WildcardStableDiffusionOutput(StableDiffusionPipelineOutput):
|
||||
prompts: List[str]
|
||||
|
||||
|
||||
class WildcardStableDiffusionPipeline(DiffusionPipeline):
|
||||
r"""
|
||||
Example Usage:
|
||||
pipe = WildcardStableDiffusionPipeline.from_pretrained(
|
||||
"CompVis/stable-diffusion-v1-4",
|
||||
revision="fp16",
|
||||
torch_dtype=torch.float16,
|
||||
)
|
||||
prompt = "__animal__ sitting on a __object__ wearing a __clothing__"
|
||||
out = pipe(
|
||||
prompt,
|
||||
wildcard_option_dict={
|
||||
"clothing":["hat", "shirt", "scarf", "beret"]
|
||||
},
|
||||
wildcard_files=["object.txt", "animal.txt"],
|
||||
num_prompt_samples=1
|
||||
)
|
||||
|
||||
|
||||
Pipeline for text-to-image generation with wild cards using Stable Diffusion.
|
||||
|
||||
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
|
||||
library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
|
||||
|
||||
Args:
|
||||
vae ([`AutoencoderKL`]):
|
||||
Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
|
||||
text_encoder ([`CLIPTextModel`]):
|
||||
Frozen text-encoder. Stable Diffusion uses the text portion of
|
||||
[CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
|
||||
the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
|
||||
tokenizer (`CLIPTokenizer`):
|
||||
Tokenizer of class
|
||||
[CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
|
||||
unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
|
||||
scheduler ([`SchedulerMixin`]):
|
||||
A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of
|
||||
[`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
|
||||
safety_checker ([`StableDiffusionSafetyChecker`]):
|
||||
Classification module that estimates whether generated images could be considered offensive or harmful.
|
||||
Please, refer to the [model card](https://huggingface.co/CompVis/stable-diffusion-v1-4) for details.
|
||||
feature_extractor ([`CLIPFeatureExtractor`]):
|
||||
Model that extracts features from generated images to be used as inputs for the `safety_checker`.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vae: AutoencoderKL,
|
||||
text_encoder: CLIPTextModel,
|
||||
tokenizer: CLIPTokenizer,
|
||||
unet: UNet2DConditionModel,
|
||||
scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
|
||||
safety_checker: StableDiffusionSafetyChecker,
|
||||
feature_extractor: CLIPFeatureExtractor,
|
||||
):
|
||||
super().__init__()
|
||||
|
||||
if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
|
||||
deprecation_message = (
|
||||
f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
|
||||
f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
|
||||
"to update the config accordingly as leaving `steps_offset` might led to incorrect results"
|
||||
" in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
|
||||
" it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
|
||||
" file"
|
||||
)
|
||||
deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
|
||||
new_config = dict(scheduler.config)
|
||||
new_config["steps_offset"] = 1
|
||||
scheduler._internal_dict = FrozenDict(new_config)
|
||||
|
||||
if safety_checker is None:
|
||||
logger.warn(
|
||||
f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
|
||||
" that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
|
||||
" results in services or applications open to the public. Both the diffusers team and Hugging Face"
|
||||
" strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
|
||||
" it only for use-cases that involve analyzing network behavior or auditing its results. For more"
|
||||
" information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
|
||||
)
|
||||
|
||||
self.register_modules(
|
||||
vae=vae,
|
||||
text_encoder=text_encoder,
|
||||
tokenizer=tokenizer,
|
||||
unet=unet,
|
||||
scheduler=scheduler,
|
||||
safety_checker=safety_checker,
|
||||
feature_extractor=feature_extractor,
|
||||
)
|
||||
|
||||
@torch.no_grad()
|
||||
def __call__(
|
||||
self,
|
||||
prompt: Union[str, List[str]],
|
||||
height: int = 512,
|
||||
width: int = 512,
|
||||
num_inference_steps: int = 50,
|
||||
guidance_scale: float = 7.5,
|
||||
negative_prompt: Optional[Union[str, List[str]]] = None,
|
||||
num_images_per_prompt: Optional[int] = 1,
|
||||
eta: float = 0.0,
|
||||
generator: Optional[torch.Generator] = None,
|
||||
latents: Optional[torch.FloatTensor] = None,
|
||||
output_type: Optional[str] = "pil",
|
||||
return_dict: bool = True,
|
||||
callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
|
||||
callback_steps: Optional[int] = 1,
|
||||
wildcard_option_dict: Dict[str, List[str]] = {},
|
||||
wildcard_files: List[str] = [],
|
||||
num_prompt_samples: Optional[int] = 1,
|
||||
**kwargs,
|
||||
):
|
||||
r"""
|
||||
Function invoked when calling the pipeline for generation.
|
||||
|
||||
Args:
|
||||
prompt (`str` or `List[str]`):
|
||||
The prompt or prompts to guide the image generation.
|
||||
height (`int`, *optional*, defaults to 512):
|
||||
The height in pixels of the generated image.
|
||||
width (`int`, *optional*, defaults to 512):
|
||||
The width in pixels of the generated image.
|
||||
num_inference_steps (`int`, *optional*, defaults to 50):
|
||||
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
|
||||
expense of slower inference.
|
||||
guidance_scale (`float`, *optional*, defaults to 7.5):
|
||||
Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
|
||||
`guidance_scale` is defined as `w` of equation 2. of [Imagen
|
||||
Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
|
||||
1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
|
||||
usually at the expense of lower image quality.
|
||||
negative_prompt (`str` or `List[str]`, *optional*):
|
||||
The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
|
||||
if `guidance_scale` is less than `1`).
|
||||
num_images_per_prompt (`int`, *optional*, defaults to 1):
|
||||
The number of images to generate per prompt.
|
||||
eta (`float`, *optional*, defaults to 0.0):
|
||||
Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
|
||||
[`schedulers.DDIMScheduler`], will be ignored for others.
|
||||
generator (`torch.Generator`, *optional*):
|
||||
A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
|
||||
deterministic.
|
||||
latents (`torch.FloatTensor`, *optional*):
|
||||
Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
|
||||
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
|
||||
tensor will ge generated by sampling using the supplied random `generator`.
|
||||
output_type (`str`, *optional*, defaults to `"pil"`):
|
||||
The output format of the generate image. Choose between
|
||||
[PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
|
||||
return_dict (`bool`, *optional*, defaults to `True`):
|
||||
Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
|
||||
plain tuple.
|
||||
callback (`Callable`, *optional*):
|
||||
A function that will be called every `callback_steps` steps during inference. The function will be
|
||||
called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
|
||||
callback_steps (`int`, *optional*, defaults to 1):
|
||||
The frequency at which the `callback` function will be called. If not specified, the callback will be
|
||||
called at every step.
|
||||
wildcard_option_dict (Dict[str, List[str]]):
|
||||
dict with key as `wildcard` and values as a list of possible replacements. For example if a prompt, "A __animal__ sitting on a chair". A wildcard_option_dict can provide possible values for "animal" like this: {"animal":["dog", "cat", "fox"]}
|
||||
wildcard_files: (List[str])
|
||||
List of filenames of txt files for wildcard replacements. For example if a prompt, "A __animal__ sitting on a chair". A file can be provided ["animal.txt"]
|
||||
num_prompt_samples: int
|
||||
Number of times to sample wildcards for each prompt provided
|
||||
|
||||
Returns:
|
||||
[`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
|
||||
[`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
|
||||
When returning a tuple, the first element is a list with the generated images, and the second element is a
|
||||
list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
|
||||
(nsfw) content, according to the `safety_checker`.
|
||||
"""
|
||||
|
||||
if isinstance(prompt, str):
|
||||
prompt = [
|
||||
replace_prompt_with_wildcards(prompt, wildcard_option_dict, wildcard_files)
|
||||
for i in range(num_prompt_samples)
|
||||
]
|
||||
batch_size = len(prompt)
|
||||
elif isinstance(prompt, list):
|
||||
prompt_list = []
|
||||
for p in prompt:
|
||||
for i in range(num_prompt_samples):
|
||||
prompt_list.append(replace_prompt_with_wildcards(p, wildcard_option_dict, wildcard_files))
|
||||
prompt = prompt_list
|
||||
batch_size = len(prompt)
|
||||
else:
|
||||
raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
|
||||
|
||||
if height % 8 != 0 or width % 8 != 0:
|
||||
raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
|
||||
|
||||
if (callback_steps is None) or (
|
||||
callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
|
||||
):
|
||||
raise ValueError(
|
||||
f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
|
||||
f" {type(callback_steps)}."
|
||||
)
|
||||
|
||||
# get prompt text embeddings
|
||||
text_inputs = self.tokenizer(
|
||||
prompt,
|
||||
padding="max_length",
|
||||
max_length=self.tokenizer.model_max_length,
|
||||
return_tensors="pt",
|
||||
)
|
||||
text_input_ids = text_inputs.input_ids
|
||||
|
||||
if text_input_ids.shape[-1] > self.tokenizer.model_max_length:
|
||||
removed_text = self.tokenizer.batch_decode(text_input_ids[:, self.tokenizer.model_max_length :])
|
||||
logger.warning(
|
||||
"The following part of your input was truncated because CLIP can only handle sequences up to"
|
||||
f" {self.tokenizer.model_max_length} tokens: {removed_text}"
|
||||
)
|
||||
text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
|
||||
text_embeddings = self.text_encoder(text_input_ids.to(self.device))[0]
|
||||
|
||||
# duplicate text embeddings for each generation per prompt, using mps friendly method
|
||||
bs_embed, seq_len, _ = text_embeddings.shape
|
||||
text_embeddings = text_embeddings.repeat(1, num_images_per_prompt, 1)
|
||||
text_embeddings = text_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)
|
||||
|
||||
# here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
|
||||
# of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
|
||||
# corresponds to doing no classifier free guidance.
|
||||
do_classifier_free_guidance = guidance_scale > 1.0
|
||||
# get unconditional embeddings for classifier free guidance
|
||||
if do_classifier_free_guidance:
|
||||
uncond_tokens: List[str]
|
||||
if negative_prompt is None:
|
||||
uncond_tokens = [""] * batch_size
|
||||
elif type(prompt) is not type(negative_prompt):
|
||||
raise TypeError(
|
||||
f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
|
||||
f" {type(prompt)}."
|
||||
)
|
||||
elif isinstance(negative_prompt, str):
|
||||
uncond_tokens = [negative_prompt]
|
||||
elif batch_size != len(negative_prompt):
|
||||
raise ValueError(
|
||||
f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
|
||||
f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
|
||||
" the batch size of `prompt`."
|
||||
)
|
||||
else:
|
||||
uncond_tokens = negative_prompt
|
||||
|
||||
max_length = text_input_ids.shape[-1]
|
||||
uncond_input = self.tokenizer(
|
||||
uncond_tokens,
|
||||
padding="max_length",
|
||||
max_length=max_length,
|
||||
truncation=True,
|
||||
return_tensors="pt",
|
||||
)
|
||||
uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(self.device))[0]
|
||||
|
||||
# duplicate unconditional embeddings for each generation per prompt, using mps friendly method
|
||||
seq_len = uncond_embeddings.shape[1]
|
||||
uncond_embeddings = uncond_embeddings.repeat(1, num_images_per_prompt, 1)
|
||||
uncond_embeddings = uncond_embeddings.view(batch_size * num_images_per_prompt, seq_len, -1)
|
||||
|
||||
# For classifier free guidance, we need to do two forward passes.
|
||||
# Here we concatenate the unconditional and text embeddings into a single batch
|
||||
# to avoid doing two forward passes
|
||||
text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
|
||||
|
||||
# get the initial random noise unless the user supplied it
|
||||
|
||||
# Unlike in other pipelines, latents need to be generated in the target device
|
||||
# for 1-to-1 results reproducibility with the CompVis implementation.
|
||||
# However this currently doesn't work in `mps`.
|
||||
latents_shape = (batch_size * num_images_per_prompt, self.unet.in_channels, height // 8, width // 8)
|
||||
latents_dtype = text_embeddings.dtype
|
||||
if latents is None:
|
||||
if self.device.type == "mps":
|
||||
# randn does not exist on mps
|
||||
latents = torch.randn(latents_shape, generator=generator, device="cpu", dtype=latents_dtype).to(
|
||||
self.device
|
||||
)
|
||||
else:
|
||||
latents = torch.randn(latents_shape, generator=generator, device=self.device, dtype=latents_dtype)
|
||||
else:
|
||||
if latents.shape != latents_shape:
|
||||
raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}")
|
||||
latents = latents.to(self.device)
|
||||
|
||||
# set timesteps
|
||||
self.scheduler.set_timesteps(num_inference_steps)
|
||||
|
||||
# Some schedulers like PNDM have timesteps as arrays
|
||||
# It's more optimized to move all timesteps to correct device beforehand
|
||||
timesteps_tensor = self.scheduler.timesteps.to(self.device)
|
||||
|
||||
# scale the initial noise by the standard deviation required by the scheduler
|
||||
latents = latents * self.scheduler.init_noise_sigma
|
||||
|
||||
# prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
|
||||
# eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
|
||||
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
|
||||
# and should be between [0, 1]
|
||||
accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
|
||||
extra_step_kwargs = {}
|
||||
if accepts_eta:
|
||||
extra_step_kwargs["eta"] = eta
|
||||
|
||||
for i, t in enumerate(self.progress_bar(timesteps_tensor)):
|
||||
# expand the latents if we are doing classifier free guidance
|
||||
latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
|
||||
latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
|
||||
|
||||
# predict the noise residual
|
||||
noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample
|
||||
|
||||
# perform guidance
|
||||
if do_classifier_free_guidance:
|
||||
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
|
||||
noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
|
||||
|
||||
# compute the previous noisy sample x_t -> x_t-1
|
||||
latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
|
||||
|
||||
# call the callback, if provided
|
||||
if callback is not None and i % callback_steps == 0:
|
||||
callback(i, t, latents)
|
||||
|
||||
latents = 1 / 0.18215 * latents
|
||||
image = self.vae.decode(latents).sample
|
||||
|
||||
image = (image / 2 + 0.5).clamp(0, 1)
|
||||
|
||||
# we always cast to float32 as this does not cause significant overhead and is compatible with bfloa16
|
||||
image = image.cpu().permute(0, 2, 3, 1).float().numpy()
|
||||
|
||||
if self.safety_checker is not None:
|
||||
safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(
|
||||
self.device
|
||||
)
|
||||
image, has_nsfw_concept = self.safety_checker(
|
||||
images=image, clip_input=safety_checker_input.pixel_values.to(text_embeddings.dtype)
|
||||
)
|
||||
else:
|
||||
has_nsfw_concept = None
|
||||
|
||||
if output_type == "pil":
|
||||
image = self.numpy_to_pil(image)
|
||||
|
||||
if not return_dict:
|
||||
return (image, has_nsfw_concept)
|
||||
|
||||
return WildcardStableDiffusionOutput(images=image, nsfw_content_detected=has_nsfw_concept, prompts=prompt)
|
||||
@@ -4,12 +4,13 @@
|
||||
The `train_dreambooth.py` script shows how to implement the training procedure and adapt it for stable diffusion.
|
||||
|
||||
|
||||
## Running locally with PyTorch
|
||||
## Running locally
|
||||
### Installing the dependencies
|
||||
|
||||
Before running the scripts, make sure to install the library's training dependencies:
|
||||
|
||||
```bash
|
||||
pip install git+https://github.com/huggingface/diffusers.git
|
||||
pip install -U -r requirements.txt
|
||||
```
|
||||
|
||||
@@ -87,7 +88,6 @@ accelerate launch train_dreambooth.py \
|
||||
--max_train_steps=800
|
||||
```
|
||||
|
||||
|
||||
### Training on a 16GB GPU:
|
||||
|
||||
With the help of gradient checkpointing and the 8-bit optimizer from bitsandbytes it's possible to run train dreambooth on a 16GB GPU.
|
||||
@@ -142,7 +142,7 @@ export CLASS_DIR="path-to-class-images"
|
||||
export OUTPUT_DIR="path-to-save-model"
|
||||
|
||||
accelerate launch train_dreambooth.py \
|
||||
--pretrained_model_name_or_path=$MODEL_NAME \
|
||||
--pretrained_model_name_or_path=$MODEL_NAME --use_auth_token \
|
||||
--instance_data_dir=$INSTANCE_DIR \
|
||||
--class_data_dir=$CLASS_DIR \
|
||||
--output_dir=$OUTPUT_DIR \
|
||||
@@ -151,7 +151,6 @@ accelerate launch train_dreambooth.py \
|
||||
--class_prompt="a photo of dog" \
|
||||
--resolution=512 \
|
||||
--train_batch_size=1 \
|
||||
--sample_batch_size=1 \
|
||||
--gradient_accumulation_steps=1 --gradient_checkpointing \
|
||||
--learning_rate=5e-6 \
|
||||
--lr_scheduler="constant" \
|
||||
@@ -161,40 +160,7 @@ accelerate launch train_dreambooth.py \
|
||||
--mixed_precision=fp16
|
||||
```
|
||||
|
||||
### Fine-tune text encoder with the UNet.
|
||||
|
||||
The script also allows to fine-tune the `text_encoder` along with the `unet`. It's been observed experimentally that fine-tuning `text_encoder` gives much better results especially on faces.
|
||||
Pass the `--train_text_encoder` argument to the script to enable training `text_encoder`.
|
||||
|
||||
___Note: Training text encoder requires more memory, with this option the training won't fit on 16GB GPU. It needs at least 24GB VRAM.___
|
||||
|
||||
```bash
|
||||
export MODEL_NAME="CompVis/stable-diffusion-v1-4"
|
||||
export INSTANCE_DIR="path-to-instance-images"
|
||||
export CLASS_DIR="path-to-class-images"
|
||||
export OUTPUT_DIR="path-to-save-model"
|
||||
|
||||
accelerate launch train_dreambooth.py \
|
||||
--pretrained_model_name_or_path=$MODEL_NAME \
|
||||
--train_text_encoder \
|
||||
--instance_data_dir=$INSTANCE_DIR \
|
||||
--class_data_dir=$CLASS_DIR \
|
||||
--output_dir=$OUTPUT_DIR \
|
||||
--with_prior_preservation --prior_loss_weight=1.0 \
|
||||
--instance_prompt="a photo of sks dog" \
|
||||
--class_prompt="a photo of dog" \
|
||||
--resolution=512 \
|
||||
--train_batch_size=1 \
|
||||
--use_8bit_adam \
|
||||
--gradient_checkpointing \
|
||||
--learning_rate=2e-6 \
|
||||
--lr_scheduler="constant" \
|
||||
--lr_warmup_steps=0 \
|
||||
--num_class_images=200 \
|
||||
--max_train_steps=800
|
||||
```
|
||||
|
||||
### Inference
|
||||
## Inference
|
||||
|
||||
Once you have trained a model using above command, the inference can be done simply using the `StableDiffusionPipeline`. Make sure to include the `identifier`(e.g. sks in above example) in your prompt.
|
||||
|
||||
@@ -210,85 +176,3 @@ image = pipe(prompt, num_inference_steps=50, guidance_scale=7.5).images[0]
|
||||
|
||||
image.save("dog-bucket.png")
|
||||
```
|
||||
|
||||
|
||||
## Running with Flax/JAX
|
||||
|
||||
For faster training on TPUs and GPUs you can leverage the flax training example. Follow the instructions above to get the model and dataset before running the script.
|
||||
|
||||
____Note: The flax example don't yet support features like gradient checkpoint, gradient accumulation etc, so to use flax for faster training we will need >30GB cards.___
|
||||
|
||||
|
||||
Before running the scripts, make sure to install the library's training dependencies:
|
||||
|
||||
```bash
|
||||
pip install -U -r requirements_flax.txt
|
||||
```
|
||||
|
||||
|
||||
### Training without prior preservation loss
|
||||
|
||||
```bash
|
||||
export MODEL_NAME="duongna/stable-diffusion-v1-4-flax"
|
||||
export INSTANCE_DIR="path-to-instance-images"
|
||||
export OUTPUT_DIR="path-to-save-model"
|
||||
|
||||
python train_dreambooth_flax.py \
|
||||
--pretrained_model_name_or_path=$MODEL_NAME \
|
||||
--instance_data_dir=$INSTANCE_DIR \
|
||||
--output_dir=$OUTPUT_DIR \
|
||||
--instance_prompt="a photo of sks dog" \
|
||||
--resolution=512 \
|
||||
--train_batch_size=1 \
|
||||
--learning_rate=5e-6 \
|
||||
--max_train_steps=400
|
||||
```
|
||||
|
||||
|
||||
### Training with prior preservation loss
|
||||
|
||||
```bash
|
||||
export MODEL_NAME="duongna/stable-diffusion-v1-4-flax"
|
||||
export INSTANCE_DIR="path-to-instance-images"
|
||||
export CLASS_DIR="path-to-class-images"
|
||||
export OUTPUT_DIR="path-to-save-model"
|
||||
|
||||
python train_dreambooth_flax.py \
|
||||
--pretrained_model_name_or_path=$MODEL_NAME \
|
||||
--instance_data_dir=$INSTANCE_DIR \
|
||||
--class_data_dir=$CLASS_DIR \
|
||||
--output_dir=$OUTPUT_DIR \
|
||||
--with_prior_preservation --prior_loss_weight=1.0 \
|
||||
--instance_prompt="a photo of sks dog" \
|
||||
--class_prompt="a photo of dog" \
|
||||
--resolution=512 \
|
||||
--train_batch_size=1 \
|
||||
--learning_rate=5e-6 \
|
||||
--num_class_images=200 \
|
||||
--max_train_steps=800
|
||||
```
|
||||
|
||||
|
||||
### Fine-tune text encoder with the UNet.
|
||||
|
||||
```bash
|
||||
export MODEL_NAME="duongna/stable-diffusion-v1-4-flax"
|
||||
export INSTANCE_DIR="path-to-instance-images"
|
||||
export CLASS_DIR="path-to-class-images"
|
||||
export OUTPUT_DIR="path-to-save-model"
|
||||
|
||||
python train_dreambooth_flax.py \
|
||||
--pretrained_model_name_or_path=$MODEL_NAME \
|
||||
--train_text_encoder \
|
||||
--instance_data_dir=$INSTANCE_DIR \
|
||||
--class_data_dir=$CLASS_DIR \
|
||||
--output_dir=$OUTPUT_DIR \
|
||||
--with_prior_preservation --prior_loss_weight=1.0 \
|
||||
--instance_prompt="a photo of sks dog" \
|
||||
--class_prompt="a photo of dog" \
|
||||
--resolution=512 \
|
||||
--train_batch_size=1 \
|
||||
--learning_rate=2e-6 \
|
||||
--num_class_images=200 \
|
||||
--max_train_steps=800
|
||||
```
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
diffusers>==0.5.0
|
||||
accelerate
|
||||
torchvision
|
||||
transformers>=4.21.0
|
||||
|
||||
@@ -1,9 +0,0 @@
|
||||
diffusers>==0.5.1
|
||||
transformers>=4.21.0
|
||||
flax
|
||||
optax
|
||||
torch
|
||||
torchvision
|
||||
ftfy
|
||||
tensorboard
|
||||
modelcards
|
||||
@@ -1,6 +1,4 @@
|
||||
import argparse
|
||||
import hashlib
|
||||
import itertools
|
||||
import math
|
||||
import os
|
||||
from pathlib import Path
|
||||
@@ -26,7 +24,7 @@ from transformers import CLIPTextModel, CLIPTokenizer
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
def parse_args(input_args=None):
|
||||
def parse_args():
|
||||
parser = argparse.ArgumentParser(description="Simple example of a training script.")
|
||||
parser.add_argument(
|
||||
"--pretrained_model_name_or_path",
|
||||
@@ -35,13 +33,6 @@ def parse_args(input_args=None):
|
||||
required=True,
|
||||
help="Path to pretrained model or model identifier from huggingface.co/models.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--revision",
|
||||
type=str,
|
||||
default=None,
|
||||
required=False,
|
||||
help="Revision of pretrained model identifier from huggingface.co/models.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--tokenizer_name",
|
||||
type=str,
|
||||
@@ -109,7 +100,6 @@ def parse_args(input_args=None):
|
||||
parser.add_argument(
|
||||
"--center_crop", action="store_true", help="Whether to center crop images before resizing to resolution"
|
||||
)
|
||||
parser.add_argument("--train_text_encoder", action="store_true", help="Whether to train the text encoder")
|
||||
parser.add_argument(
|
||||
"--train_batch_size", type=int, default=4, help="Batch size (per device) for the training dataloader."
|
||||
)
|
||||
@@ -196,11 +186,7 @@ def parse_args(input_args=None):
|
||||
)
|
||||
parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
|
||||
|
||||
if input_args is not None:
|
||||
args = parser.parse_args(input_args)
|
||||
else:
|
||||
args = parser.parse_args()
|
||||
|
||||
args = parser.parse_args()
|
||||
env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
|
||||
if env_local_rank != -1 and env_local_rank != args.local_rank:
|
||||
args.local_rank = env_local_rank
|
||||
@@ -323,7 +309,8 @@ def get_full_repo_name(model_id: str, organization: Optional[str] = None, token:
|
||||
return f"{organization}/{model_id}"
|
||||
|
||||
|
||||
def main(args):
|
||||
def main():
|
||||
args = parse_args()
|
||||
logging_dir = Path(args.output_dir, args.logging_dir)
|
||||
|
||||
accelerator = Accelerator(
|
||||
@@ -333,15 +320,6 @@ def main(args):
|
||||
logging_dir=logging_dir,
|
||||
)
|
||||
|
||||
# Currently, it's not possible to do gradient accumulation when training two models with accelerate.accumulate
|
||||
# This will be enabled soon in accelerate. For now, we don't allow gradient accumulation when training two models.
|
||||
# TODO (patil-suraj): Remove this check when gradient accumulation with two models is enabled in accelerate.
|
||||
if args.train_text_encoder and args.gradient_accumulation_steps > 1 and accelerator.num_processes > 1:
|
||||
raise ValueError(
|
||||
"Gradient accumulation is not supported when training the text encoder in distributed training. "
|
||||
"Please set gradient_accumulation_steps to 1. This feature will be supported in the future."
|
||||
)
|
||||
|
||||
if args.seed is not None:
|
||||
set_seed(args.seed)
|
||||
|
||||
@@ -354,10 +332,7 @@ def main(args):
|
||||
if cur_class_images < args.num_class_images:
|
||||
torch_dtype = torch.float16 if accelerator.device.type == "cuda" else torch.float32
|
||||
pipeline = StableDiffusionPipeline.from_pretrained(
|
||||
args.pretrained_model_name_or_path,
|
||||
torch_dtype=torch_dtype,
|
||||
safety_checker=None,
|
||||
revision=args.revision,
|
||||
args.pretrained_model_name_or_path, torch_dtype=torch_dtype
|
||||
)
|
||||
pipeline.set_progress_bar_config(disable=True)
|
||||
|
||||
@@ -376,9 +351,7 @@ def main(args):
|
||||
images = pipeline(example["prompt"]).images
|
||||
|
||||
for i, image in enumerate(images):
|
||||
hash_image = hashlib.sha1(image.tobytes()).hexdigest()
|
||||
image_filename = class_images_dir / f"{example['index'][i] + cur_class_images}-{hash_image}.jpg"
|
||||
image.save(image_filename)
|
||||
image.save(class_images_dir / f"{example['index'][i] + cur_class_images}.jpg")
|
||||
|
||||
del pipeline
|
||||
if torch.cuda.is_available():
|
||||
@@ -403,42 +376,17 @@ def main(args):
|
||||
|
||||
# Load the tokenizer
|
||||
if args.tokenizer_name:
|
||||
tokenizer = CLIPTokenizer.from_pretrained(
|
||||
args.tokenizer_name,
|
||||
revision=args.revision,
|
||||
)
|
||||
tokenizer = CLIPTokenizer.from_pretrained(args.tokenizer_name)
|
||||
elif args.pretrained_model_name_or_path:
|
||||
tokenizer = CLIPTokenizer.from_pretrained(
|
||||
args.pretrained_model_name_or_path,
|
||||
subfolder="tokenizer",
|
||||
revision=args.revision,
|
||||
)
|
||||
tokenizer = CLIPTokenizer.from_pretrained(args.pretrained_model_name_or_path, subfolder="tokenizer")
|
||||
|
||||
# Load models and create wrapper for stable diffusion
|
||||
text_encoder = CLIPTextModel.from_pretrained(
|
||||
args.pretrained_model_name_or_path,
|
||||
subfolder="text_encoder",
|
||||
revision=args.revision,
|
||||
)
|
||||
vae = AutoencoderKL.from_pretrained(
|
||||
args.pretrained_model_name_or_path,
|
||||
subfolder="vae",
|
||||
revision=args.revision,
|
||||
)
|
||||
unet = UNet2DConditionModel.from_pretrained(
|
||||
args.pretrained_model_name_or_path,
|
||||
subfolder="unet",
|
||||
revision=args.revision,
|
||||
)
|
||||
|
||||
vae.requires_grad_(False)
|
||||
if not args.train_text_encoder:
|
||||
text_encoder.requires_grad_(False)
|
||||
text_encoder = CLIPTextModel.from_pretrained(args.pretrained_model_name_or_path, subfolder="text_encoder")
|
||||
vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae")
|
||||
unet = UNet2DConditionModel.from_pretrained(args.pretrained_model_name_or_path, subfolder="unet")
|
||||
|
||||
if args.gradient_checkpointing:
|
||||
unet.enable_gradient_checkpointing()
|
||||
if args.train_text_encoder:
|
||||
text_encoder.gradient_checkpointing_enable()
|
||||
|
||||
if args.scale_lr:
|
||||
args.learning_rate = (
|
||||
@@ -458,18 +406,17 @@ def main(args):
|
||||
else:
|
||||
optimizer_class = torch.optim.AdamW
|
||||
|
||||
params_to_optimize = (
|
||||
itertools.chain(unet.parameters(), text_encoder.parameters()) if args.train_text_encoder else unet.parameters()
|
||||
)
|
||||
optimizer = optimizer_class(
|
||||
params_to_optimize,
|
||||
unet.parameters(), # only optimize unet
|
||||
lr=args.learning_rate,
|
||||
betas=(args.adam_beta1, args.adam_beta2),
|
||||
weight_decay=args.adam_weight_decay,
|
||||
eps=args.adam_epsilon,
|
||||
)
|
||||
|
||||
noise_scheduler = DDPMScheduler.from_config("CompVis/stable-diffusion-v1-4", subfolder="scheduler")
|
||||
noise_scheduler = DDPMScheduler(
|
||||
beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000
|
||||
)
|
||||
|
||||
train_dataset = DreamBoothDataset(
|
||||
instance_data_root=args.instance_data_dir,
|
||||
@@ -494,12 +441,7 @@ def main(args):
|
||||
pixel_values = torch.stack(pixel_values)
|
||||
pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float()
|
||||
|
||||
input_ids = tokenizer.pad(
|
||||
{"input_ids": input_ids},
|
||||
padding="max_length",
|
||||
max_length=tokenizer.model_max_length,
|
||||
return_tensors="pt",
|
||||
).input_ids
|
||||
input_ids = tokenizer.pad({"input_ids": input_ids}, padding=True, return_tensors="pt").input_ids
|
||||
|
||||
batch = {
|
||||
"input_ids": input_ids,
|
||||
@@ -508,7 +450,7 @@ def main(args):
|
||||
return batch
|
||||
|
||||
train_dataloader = torch.utils.data.DataLoader(
|
||||
train_dataset, batch_size=args.train_batch_size, shuffle=True, collate_fn=collate_fn, num_workers=1
|
||||
train_dataset, batch_size=args.train_batch_size, shuffle=True, collate_fn=collate_fn
|
||||
)
|
||||
|
||||
# Scheduler and math around the number of training steps.
|
||||
@@ -525,14 +467,9 @@ def main(args):
|
||||
num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
|
||||
)
|
||||
|
||||
if args.train_text_encoder:
|
||||
unet, text_encoder, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
|
||||
unet, text_encoder, optimizer, train_dataloader, lr_scheduler
|
||||
)
|
||||
else:
|
||||
unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
|
||||
unet, optimizer, train_dataloader, lr_scheduler
|
||||
)
|
||||
unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
|
||||
unet, optimizer, train_dataloader, lr_scheduler
|
||||
)
|
||||
|
||||
weight_dtype = torch.float32
|
||||
if args.mixed_precision == "fp16":
|
||||
@@ -543,9 +480,8 @@ def main(args):
|
||||
# Move text_encode and vae to gpu.
|
||||
# For mixed precision training we cast the text_encoder and vae weights to half-precision
|
||||
# as these models are only used for inference, keeping weights in full precision is not required.
|
||||
text_encoder.to(accelerator.device, dtype=weight_dtype)
|
||||
vae.to(accelerator.device, dtype=weight_dtype)
|
||||
if not args.train_text_encoder:
|
||||
text_encoder.to(accelerator.device, dtype=weight_dtype)
|
||||
|
||||
# We need to recalculate our total training steps as the size of the training dataloader may have changed.
|
||||
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
|
||||
@@ -577,13 +513,12 @@ def main(args):
|
||||
|
||||
for epoch in range(args.num_train_epochs):
|
||||
unet.train()
|
||||
if args.train_text_encoder:
|
||||
text_encoder.train()
|
||||
for step, batch in enumerate(train_dataloader):
|
||||
with accelerator.accumulate(unet):
|
||||
# Convert images to latent space
|
||||
latents = vae.encode(batch["pixel_values"].to(dtype=weight_dtype)).latent_dist.sample()
|
||||
latents = latents * 0.18215
|
||||
with torch.no_grad():
|
||||
latents = vae.encode(batch["pixel_values"].to(dtype=weight_dtype)).latent_dist.sample()
|
||||
latents = latents * 0.18215
|
||||
|
||||
# Sample noise that we'll add to the latents
|
||||
noise = torch.randn_like(latents)
|
||||
@@ -597,7 +532,8 @@ def main(args):
|
||||
noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
|
||||
|
||||
# Get the text embedding for conditioning
|
||||
encoder_hidden_states = text_encoder(batch["input_ids"])[0]
|
||||
with torch.no_grad():
|
||||
encoder_hidden_states = text_encoder(batch["input_ids"])[0]
|
||||
|
||||
# Predict the noise residual
|
||||
noise_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample
|
||||
@@ -620,12 +556,7 @@ def main(args):
|
||||
|
||||
accelerator.backward(loss)
|
||||
if accelerator.sync_gradients:
|
||||
params_to_clip = (
|
||||
itertools.chain(unet.parameters(), text_encoder.parameters())
|
||||
if args.train_text_encoder
|
||||
else unet.parameters()
|
||||
)
|
||||
accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)
|
||||
accelerator.clip_grad_norm_(unet.parameters(), args.max_grad_norm)
|
||||
optimizer.step()
|
||||
lr_scheduler.step()
|
||||
optimizer.zero_grad()
|
||||
@@ -647,10 +578,7 @@ def main(args):
|
||||
# Create the pipeline using using the trained modules and save it.
|
||||
if accelerator.is_main_process:
|
||||
pipeline = StableDiffusionPipeline.from_pretrained(
|
||||
args.pretrained_model_name_or_path,
|
||||
unet=accelerator.unwrap_model(unet),
|
||||
text_encoder=accelerator.unwrap_model(text_encoder),
|
||||
revision=args.revision,
|
||||
args.pretrained_model_name_or_path, unet=accelerator.unwrap_model(unet)
|
||||
)
|
||||
pipeline.save_pretrained(args.output_dir)
|
||||
|
||||
@@ -661,5 +589,4 @@ def main(args):
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = parse_args()
|
||||
main(args)
|
||||
main()
|
||||
|
||||
@@ -1,665 +0,0 @@
|
||||
import argparse
|
||||
import hashlib
|
||||
import logging
|
||||
import math
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.utils.checkpoint
|
||||
from torch.utils.data import Dataset
|
||||
|
||||
import jax
|
||||
import jax.numpy as jnp
|
||||
import optax
|
||||
import transformers
|
||||
from diffusers import (
|
||||
FlaxAutoencoderKL,
|
||||
FlaxDDPMScheduler,
|
||||
FlaxPNDMScheduler,
|
||||
FlaxStableDiffusionPipeline,
|
||||
FlaxUNet2DConditionModel,
|
||||
)
|
||||
from diffusers.pipelines.stable_diffusion import FlaxStableDiffusionSafetyChecker
|
||||
from flax import jax_utils
|
||||
from flax.training import train_state
|
||||
from flax.training.common_utils import shard
|
||||
from huggingface_hub import HfFolder, Repository, whoami
|
||||
from PIL import Image
|
||||
from torchvision import transforms
|
||||
from tqdm.auto import tqdm
|
||||
from transformers import CLIPFeatureExtractor, CLIPTokenizer, FlaxCLIPTextModel, set_seed
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def parse_args():
|
||||
parser = argparse.ArgumentParser(description="Simple example of a training script.")
|
||||
parser.add_argument(
|
||||
"--pretrained_model_name_or_path",
|
||||
type=str,
|
||||
default=None,
|
||||
required=True,
|
||||
help="Path to pretrained model or model identifier from huggingface.co/models.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--tokenizer_name",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Pretrained tokenizer name or path if not the same as model_name",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--instance_data_dir",
|
||||
type=str,
|
||||
default=None,
|
||||
required=True,
|
||||
help="A folder containing the training data of instance images.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--class_data_dir",
|
||||
type=str,
|
||||
default=None,
|
||||
required=False,
|
||||
help="A folder containing the training data of class images.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--instance_prompt",
|
||||
type=str,
|
||||
default=None,
|
||||
help="The prompt with identifier specifying the instance",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--class_prompt",
|
||||
type=str,
|
||||
default=None,
|
||||
help="The prompt to specify images in the same class as provided instance images.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--with_prior_preservation",
|
||||
default=False,
|
||||
action="store_true",
|
||||
help="Flag to add prior preservation loss.",
|
||||
)
|
||||
parser.add_argument("--prior_loss_weight", type=float, default=1.0, help="The weight of prior preservation loss.")
|
||||
parser.add_argument(
|
||||
"--num_class_images",
|
||||
type=int,
|
||||
default=100,
|
||||
help=(
|
||||
"Minimal class images for prior preservation loss. If not have enough images, additional images will be"
|
||||
" sampled with class_prompt."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output_dir",
|
||||
type=str,
|
||||
default="text-inversion-model",
|
||||
help="The output directory where the model predictions and checkpoints will be written.",
|
||||
)
|
||||
parser.add_argument("--seed", type=int, default=0, help="A seed for reproducible training.")
|
||||
parser.add_argument(
|
||||
"--resolution",
|
||||
type=int,
|
||||
default=512,
|
||||
help=(
|
||||
"The resolution for input images, all the images in the train/validation dataset will be resized to this"
|
||||
" resolution"
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--center_crop", action="store_true", help="Whether to center crop images before resizing to resolution"
|
||||
)
|
||||
parser.add_argument("--train_text_encoder", action="store_true", help="Whether to train the text encoder")
|
||||
parser.add_argument(
|
||||
"--train_batch_size", type=int, default=4, help="Batch size (per device) for the training dataloader."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--sample_batch_size", type=int, default=4, help="Batch size (per device) for sampling images."
|
||||
)
|
||||
parser.add_argument("--num_train_epochs", type=int, default=1)
|
||||
parser.add_argument(
|
||||
"--max_train_steps",
|
||||
type=int,
|
||||
default=None,
|
||||
help="Total number of training steps to perform. If provided, overrides num_train_epochs.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--learning_rate",
|
||||
type=float,
|
||||
default=5e-6,
|
||||
help="Initial learning rate (after the potential warmup period) to use.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--scale_lr",
|
||||
action="store_true",
|
||||
default=False,
|
||||
help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--lr_scheduler",
|
||||
type=str,
|
||||
default="constant",
|
||||
help=(
|
||||
'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
|
||||
' "constant", "constant_with_warmup"]'
|
||||
),
|
||||
)
|
||||
parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
|
||||
parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
|
||||
parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
|
||||
parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
|
||||
parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
|
||||
parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
|
||||
parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
|
||||
parser.add_argument(
|
||||
"--hub_model_id",
|
||||
type=str,
|
||||
default=None,
|
||||
help="The name of the repository to keep in sync with the local `output_dir`.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--logging_dir",
|
||||
type=str,
|
||||
default="logs",
|
||||
help=(
|
||||
"[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
|
||||
" *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--mixed_precision",
|
||||
type=str,
|
||||
default="no",
|
||||
choices=["no", "fp16", "bf16"],
|
||||
help=(
|
||||
"Whether to use mixed precision. Choose"
|
||||
"between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10."
|
||||
"and an Nvidia Ampere GPU."
|
||||
),
|
||||
)
|
||||
parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
|
||||
|
||||
args = parser.parse_args()
|
||||
env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
|
||||
if env_local_rank != -1 and env_local_rank != args.local_rank:
|
||||
args.local_rank = env_local_rank
|
||||
|
||||
if args.instance_data_dir is None:
|
||||
raise ValueError("You must specify a train data directory.")
|
||||
|
||||
if args.with_prior_preservation:
|
||||
if args.class_data_dir is None:
|
||||
raise ValueError("You must specify a data directory for class images.")
|
||||
if args.class_prompt is None:
|
||||
raise ValueError("You must specify prompt for class images.")
|
||||
|
||||
return args
|
||||
|
||||
|
||||
class DreamBoothDataset(Dataset):
|
||||
"""
|
||||
A dataset to prepare the instance and class images with the prompts for fine-tuning the model.
|
||||
It pre-processes the images and the tokenizes prompts.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
instance_data_root,
|
||||
instance_prompt,
|
||||
tokenizer,
|
||||
class_data_root=None,
|
||||
class_prompt=None,
|
||||
size=512,
|
||||
center_crop=False,
|
||||
):
|
||||
self.size = size
|
||||
self.center_crop = center_crop
|
||||
self.tokenizer = tokenizer
|
||||
|
||||
self.instance_data_root = Path(instance_data_root)
|
||||
if not self.instance_data_root.exists():
|
||||
raise ValueError("Instance images root doesn't exists.")
|
||||
|
||||
self.instance_images_path = list(Path(instance_data_root).iterdir())
|
||||
self.num_instance_images = len(self.instance_images_path)
|
||||
self.instance_prompt = instance_prompt
|
||||
self._length = self.num_instance_images
|
||||
|
||||
if class_data_root is not None:
|
||||
self.class_data_root = Path(class_data_root)
|
||||
self.class_data_root.mkdir(parents=True, exist_ok=True)
|
||||
self.class_images_path = list(self.class_data_root.iterdir())
|
||||
self.num_class_images = len(self.class_images_path)
|
||||
self._length = max(self.num_class_images, self.num_instance_images)
|
||||
self.class_prompt = class_prompt
|
||||
else:
|
||||
self.class_data_root = None
|
||||
|
||||
self.image_transforms = transforms.Compose(
|
||||
[
|
||||
transforms.Resize(size, interpolation=transforms.InterpolationMode.BILINEAR),
|
||||
transforms.CenterCrop(size) if center_crop else transforms.RandomCrop(size),
|
||||
transforms.ToTensor(),
|
||||
transforms.Normalize([0.5], [0.5]),
|
||||
]
|
||||
)
|
||||
|
||||
def __len__(self):
|
||||
return self._length
|
||||
|
||||
def __getitem__(self, index):
|
||||
example = {}
|
||||
instance_image = Image.open(self.instance_images_path[index % self.num_instance_images])
|
||||
if not instance_image.mode == "RGB":
|
||||
instance_image = instance_image.convert("RGB")
|
||||
example["instance_images"] = self.image_transforms(instance_image)
|
||||
example["instance_prompt_ids"] = self.tokenizer(
|
||||
self.instance_prompt,
|
||||
padding="do_not_pad",
|
||||
truncation=True,
|
||||
max_length=self.tokenizer.model_max_length,
|
||||
).input_ids
|
||||
|
||||
if self.class_data_root:
|
||||
class_image = Image.open(self.class_images_path[index % self.num_class_images])
|
||||
if not class_image.mode == "RGB":
|
||||
class_image = class_image.convert("RGB")
|
||||
example["class_images"] = self.image_transforms(class_image)
|
||||
example["class_prompt_ids"] = self.tokenizer(
|
||||
self.class_prompt,
|
||||
padding="do_not_pad",
|
||||
truncation=True,
|
||||
max_length=self.tokenizer.model_max_length,
|
||||
).input_ids
|
||||
|
||||
return example
|
||||
|
||||
|
||||
class PromptDataset(Dataset):
|
||||
"A simple dataset to prepare the prompts to generate class images on multiple GPUs."
|
||||
|
||||
def __init__(self, prompt, num_samples):
|
||||
self.prompt = prompt
|
||||
self.num_samples = num_samples
|
||||
|
||||
def __len__(self):
|
||||
return self.num_samples
|
||||
|
||||
def __getitem__(self, index):
|
||||
example = {}
|
||||
example["prompt"] = self.prompt
|
||||
example["index"] = index
|
||||
return example
|
||||
|
||||
|
||||
def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: Optional[str] = None):
|
||||
if token is None:
|
||||
token = HfFolder.get_token()
|
||||
if organization is None:
|
||||
username = whoami(token)["name"]
|
||||
return f"{username}/{model_id}"
|
||||
else:
|
||||
return f"{organization}/{model_id}"
|
||||
|
||||
|
||||
def get_params_to_save(params):
|
||||
return jax.device_get(jax.tree_util.tree_map(lambda x: x[0], params))
|
||||
|
||||
|
||||
def main():
|
||||
args = parse_args()
|
||||
|
||||
logging.basicConfig(
|
||||
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
|
||||
datefmt="%m/%d/%Y %H:%M:%S",
|
||||
level=logging.INFO,
|
||||
)
|
||||
# Setup logging, we only want one process per machine to log things on the screen.
|
||||
logger.setLevel(logging.INFO if jax.process_index() == 0 else logging.ERROR)
|
||||
if jax.process_index() == 0:
|
||||
transformers.utils.logging.set_verbosity_info()
|
||||
else:
|
||||
transformers.utils.logging.set_verbosity_error()
|
||||
|
||||
if args.seed is not None:
|
||||
set_seed(args.seed)
|
||||
|
||||
if jax.process_index() == 0:
|
||||
if args.push_to_hub:
|
||||
if args.hub_model_id is None:
|
||||
repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token)
|
||||
else:
|
||||
repo_name = args.hub_model_id
|
||||
repo = Repository(args.output_dir, clone_from=repo_name)
|
||||
|
||||
with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore:
|
||||
if "step_*" not in gitignore:
|
||||
gitignore.write("step_*\n")
|
||||
if "epoch_*" not in gitignore:
|
||||
gitignore.write("epoch_*\n")
|
||||
elif args.output_dir is not None:
|
||||
os.makedirs(args.output_dir, exist_ok=True)
|
||||
|
||||
rng = jax.random.PRNGKey(args.seed)
|
||||
|
||||
if args.with_prior_preservation:
|
||||
class_images_dir = Path(args.class_data_dir)
|
||||
if not class_images_dir.exists():
|
||||
class_images_dir.mkdir(parents=True)
|
||||
cur_class_images = len(list(class_images_dir.iterdir()))
|
||||
|
||||
if cur_class_images < args.num_class_images:
|
||||
pipeline, params = FlaxStableDiffusionPipeline.from_pretrained(
|
||||
args.pretrained_model_name_or_path, safety_checker=None
|
||||
)
|
||||
pipeline.set_progress_bar_config(disable=True)
|
||||
|
||||
num_new_images = args.num_class_images - cur_class_images
|
||||
logger.info(f"Number of class images to sample: {num_new_images}.")
|
||||
|
||||
sample_dataset = PromptDataset(args.class_prompt, num_new_images)
|
||||
sample_dataloader = torch.utils.data.DataLoader(sample_dataset, batch_size=args.sample_batch_size)
|
||||
|
||||
for example in tqdm(
|
||||
sample_dataloader, desc="Generating class images", disable=not jax.process_index() == 0
|
||||
):
|
||||
prompt_ids = pipeline.prepare_inputs(example["prompt"])
|
||||
prompt_ids = shard(prompt_ids)
|
||||
p_params = jax_utils.replicate(params)
|
||||
rng = jax.random.split(rng)[0]
|
||||
sample_rng = jax.random.split(rng, jax.device_count())
|
||||
images = pipeline(prompt_ids, p_params, sample_rng, jit=True).images
|
||||
images = images.reshape((images.shape[0] * images.shape[1],) + images.shape[-3:])
|
||||
images = pipeline.numpy_to_pil(np.array(images))
|
||||
|
||||
for i, image in enumerate(images):
|
||||
hash_image = hashlib.sha1(image.tobytes()).hexdigest()
|
||||
image_filename = class_images_dir / f"{example['index'][i] + cur_class_images}-{hash_image}.jpg"
|
||||
image.save(image_filename)
|
||||
|
||||
del pipeline
|
||||
|
||||
# Handle the repository creation
|
||||
if jax.process_index() == 0:
|
||||
if args.push_to_hub:
|
||||
if args.hub_model_id is None:
|
||||
repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token)
|
||||
else:
|
||||
repo_name = args.hub_model_id
|
||||
repo = Repository(args.output_dir, clone_from=repo_name)
|
||||
|
||||
with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore:
|
||||
if "step_*" not in gitignore:
|
||||
gitignore.write("step_*\n")
|
||||
if "epoch_*" not in gitignore:
|
||||
gitignore.write("epoch_*\n")
|
||||
elif args.output_dir is not None:
|
||||
os.makedirs(args.output_dir, exist_ok=True)
|
||||
|
||||
# Load the tokenizer and add the placeholder token as a additional special token
|
||||
if args.tokenizer_name:
|
||||
tokenizer = CLIPTokenizer.from_pretrained(args.tokenizer_name)
|
||||
elif args.pretrained_model_name_or_path:
|
||||
tokenizer = CLIPTokenizer.from_pretrained(args.pretrained_model_name_or_path, subfolder="tokenizer")
|
||||
|
||||
train_dataset = DreamBoothDataset(
|
||||
instance_data_root=args.instance_data_dir,
|
||||
instance_prompt=args.instance_prompt,
|
||||
class_data_root=args.class_data_dir if args.with_prior_preservation else None,
|
||||
class_prompt=args.class_prompt,
|
||||
tokenizer=tokenizer,
|
||||
size=args.resolution,
|
||||
center_crop=args.center_crop,
|
||||
)
|
||||
|
||||
def collate_fn(examples):
|
||||
input_ids = [example["instance_prompt_ids"] for example in examples]
|
||||
pixel_values = [example["instance_images"] for example in examples]
|
||||
|
||||
# Concat class and instance examples for prior preservation.
|
||||
# We do this to avoid doing two forward passes.
|
||||
if args.with_prior_preservation:
|
||||
input_ids += [example["class_prompt_ids"] for example in examples]
|
||||
pixel_values += [example["class_images"] for example in examples]
|
||||
|
||||
pixel_values = torch.stack(pixel_values)
|
||||
pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float()
|
||||
|
||||
input_ids = tokenizer.pad(
|
||||
{"input_ids": input_ids}, padding="max_length", max_length=tokenizer.model_max_length, return_tensors="pt"
|
||||
).input_ids
|
||||
|
||||
batch = {
|
||||
"input_ids": input_ids,
|
||||
"pixel_values": pixel_values,
|
||||
}
|
||||
batch = {k: v.numpy() for k, v in batch.items()}
|
||||
return batch
|
||||
|
||||
total_train_batch_size = args.train_batch_size * jax.local_device_count()
|
||||
train_dataloader = torch.utils.data.DataLoader(
|
||||
train_dataset, batch_size=total_train_batch_size, shuffle=True, collate_fn=collate_fn, drop_last=True
|
||||
)
|
||||
|
||||
weight_dtype = jnp.float32
|
||||
if args.mixed_precision == "fp16":
|
||||
weight_dtype = jnp.float16
|
||||
elif args.mixed_precision == "bf16":
|
||||
weight_dtype = jnp.bfloat16
|
||||
|
||||
# Load models and create wrapper for stable diffusion
|
||||
text_encoder = FlaxCLIPTextModel.from_pretrained("openai/clip-vit-large-patch14", dtype=weight_dtype)
|
||||
vae, vae_params = FlaxAutoencoderKL.from_pretrained(
|
||||
args.pretrained_model_name_or_path, subfolder="vae", dtype=weight_dtype
|
||||
)
|
||||
unet, unet_params = FlaxUNet2DConditionModel.from_pretrained(
|
||||
args.pretrained_model_name_or_path, subfolder="unet", dtype=weight_dtype
|
||||
)
|
||||
|
||||
# Optimization
|
||||
if args.scale_lr:
|
||||
args.learning_rate = args.learning_rate * total_train_batch_size
|
||||
|
||||
constant_scheduler = optax.constant_schedule(args.learning_rate)
|
||||
|
||||
adamw = optax.adamw(
|
||||
learning_rate=constant_scheduler,
|
||||
b1=args.adam_beta1,
|
||||
b2=args.adam_beta2,
|
||||
eps=args.adam_epsilon,
|
||||
weight_decay=args.adam_weight_decay,
|
||||
)
|
||||
|
||||
optimizer = optax.chain(
|
||||
optax.clip_by_global_norm(args.max_grad_norm),
|
||||
adamw,
|
||||
)
|
||||
|
||||
unet_state = train_state.TrainState.create(apply_fn=unet.__call__, params=unet_params, tx=optimizer)
|
||||
text_encoder_state = train_state.TrainState.create(
|
||||
apply_fn=text_encoder.__call__, params=text_encoder.params, tx=optimizer
|
||||
)
|
||||
|
||||
noise_scheduler = FlaxDDPMScheduler(
|
||||
beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000
|
||||
)
|
||||
|
||||
# Initialize our training
|
||||
train_rngs = jax.random.split(rng, jax.local_device_count())
|
||||
|
||||
def train_step(unet_state, text_encoder_state, vae_params, batch, train_rng):
|
||||
dropout_rng, sample_rng, new_train_rng = jax.random.split(train_rng, 3)
|
||||
|
||||
if args.train_text_encoder:
|
||||
params = {"text_encoder": text_encoder_state.params, "unet": unet_state.params}
|
||||
else:
|
||||
params = {"unet": unet_state.params}
|
||||
|
||||
def compute_loss(params):
|
||||
# Convert images to latent space
|
||||
vae_outputs = vae.apply(
|
||||
{"params": vae_params}, batch["pixel_values"], deterministic=True, method=vae.encode
|
||||
)
|
||||
latents = vae_outputs.latent_dist.sample(sample_rng)
|
||||
# (NHWC) -> (NCHW)
|
||||
latents = jnp.transpose(latents, (0, 3, 1, 2))
|
||||
latents = latents * 0.18215
|
||||
|
||||
# Sample noise that we'll add to the latents
|
||||
noise_rng, timestep_rng = jax.random.split(sample_rng)
|
||||
noise = jax.random.normal(noise_rng, latents.shape)
|
||||
# Sample a random timestep for each image
|
||||
bsz = latents.shape[0]
|
||||
timesteps = jax.random.randint(
|
||||
timestep_rng,
|
||||
(bsz,),
|
||||
0,
|
||||
noise_scheduler.config.num_train_timesteps,
|
||||
)
|
||||
|
||||
# Add noise to the latents according to the noise magnitude at each timestep
|
||||
# (this is the forward diffusion process)
|
||||
noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
|
||||
|
||||
# Get the text embedding for conditioning
|
||||
if args.train_text_encoder:
|
||||
encoder_hidden_states = text_encoder_state.apply_fn(
|
||||
batch["input_ids"], params=params["text_encoder"], dropout_rng=dropout_rng, train=True
|
||||
)[0]
|
||||
else:
|
||||
encoder_hidden_states = text_encoder(
|
||||
batch["input_ids"], params=text_encoder_state.params, train=False
|
||||
)[0]
|
||||
|
||||
# Predict the noise residual
|
||||
unet_outputs = unet.apply(
|
||||
{"params": params["unet"]}, noisy_latents, timesteps, encoder_hidden_states, train=True
|
||||
)
|
||||
noise_pred = unet_outputs.sample
|
||||
|
||||
if args.with_prior_preservation:
|
||||
# Chunk the noise and noise_pred into two parts and compute the loss on each part separately.
|
||||
noise_pred, noise_pred_prior = jnp.split(noise_pred, 2, axis=0)
|
||||
noise, noise_prior = jnp.split(noise, 2, axis=0)
|
||||
|
||||
# Compute instance loss
|
||||
loss = (noise - noise_pred) ** 2
|
||||
loss = loss.mean()
|
||||
|
||||
# Compute prior loss
|
||||
prior_loss = (noise_prior - noise_pred_prior) ** 2
|
||||
prior_loss = prior_loss.mean()
|
||||
|
||||
# Add the prior loss to the instance loss.
|
||||
loss = loss + args.prior_loss_weight * prior_loss
|
||||
else:
|
||||
loss = (noise - noise_pred) ** 2
|
||||
loss = loss.mean()
|
||||
|
||||
return loss
|
||||
|
||||
grad_fn = jax.value_and_grad(compute_loss)
|
||||
loss, grad = grad_fn(params)
|
||||
grad = jax.lax.pmean(grad, "batch")
|
||||
|
||||
new_unet_state = unet_state.apply_gradients(grads=grad["unet"])
|
||||
if args.train_text_encoder:
|
||||
new_text_encoder_state = text_encoder_state.apply_gradients(grads=grad["text_encoder"])
|
||||
else:
|
||||
new_text_encoder_state = text_encoder_state
|
||||
|
||||
metrics = {"loss": loss}
|
||||
metrics = jax.lax.pmean(metrics, axis_name="batch")
|
||||
|
||||
return new_unet_state, new_text_encoder_state, metrics, new_train_rng
|
||||
|
||||
# Create parallel version of the train step
|
||||
p_train_step = jax.pmap(train_step, "batch", donate_argnums=(0, 1))
|
||||
|
||||
# Replicate the train state on each device
|
||||
unet_state = jax_utils.replicate(unet_state)
|
||||
text_encoder_state = jax_utils.replicate(text_encoder_state)
|
||||
vae_params = jax_utils.replicate(vae_params)
|
||||
|
||||
# Train!
|
||||
num_update_steps_per_epoch = math.ceil(len(train_dataloader))
|
||||
|
||||
# Scheduler and math around the number of training steps.
|
||||
if args.max_train_steps is None:
|
||||
args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
|
||||
|
||||
args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
|
||||
|
||||
logger.info("***** Running training *****")
|
||||
logger.info(f" Num examples = {len(train_dataset)}")
|
||||
logger.info(f" Num Epochs = {args.num_train_epochs}")
|
||||
logger.info(f" Instantaneous batch size per device = {args.train_batch_size}")
|
||||
logger.info(f" Total train batch size (w. parallel & distributed) = {total_train_batch_size}")
|
||||
logger.info(f" Total optimization steps = {args.max_train_steps}")
|
||||
|
||||
global_step = 0
|
||||
|
||||
epochs = tqdm(range(args.num_train_epochs), desc="Epoch ... ", position=0)
|
||||
for epoch in epochs:
|
||||
# ======================== Training ================================
|
||||
|
||||
train_metrics = []
|
||||
|
||||
steps_per_epoch = len(train_dataset) // total_train_batch_size
|
||||
train_step_progress_bar = tqdm(total=steps_per_epoch, desc="Training...", position=1, leave=False)
|
||||
# train
|
||||
for batch in train_dataloader:
|
||||
batch = shard(batch)
|
||||
unet_state, text_encoder_state, train_metric, train_rngs = p_train_step(
|
||||
unet_state, text_encoder_state, vae_params, batch, train_rngs
|
||||
)
|
||||
train_metrics.append(train_metric)
|
||||
|
||||
train_step_progress_bar.update(1)
|
||||
|
||||
global_step += 1
|
||||
if global_step >= args.max_train_steps:
|
||||
break
|
||||
|
||||
train_metric = jax_utils.unreplicate(train_metric)
|
||||
|
||||
train_step_progress_bar.close()
|
||||
epochs.write(f"Epoch... ({epoch + 1}/{args.num_train_epochs} | Loss: {train_metric['loss']})")
|
||||
|
||||
# Create the pipeline using using the trained modules and save it.
|
||||
if jax.process_index() == 0:
|
||||
scheduler = FlaxPNDMScheduler(
|
||||
beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", skip_prk_steps=True
|
||||
)
|
||||
safety_checker = FlaxStableDiffusionSafetyChecker.from_pretrained(
|
||||
"CompVis/stable-diffusion-safety-checker", from_pt=True
|
||||
)
|
||||
pipeline = FlaxStableDiffusionPipeline(
|
||||
text_encoder=text_encoder,
|
||||
vae=vae,
|
||||
unet=unet,
|
||||
tokenizer=tokenizer,
|
||||
scheduler=scheduler,
|
||||
safety_checker=safety_checker,
|
||||
feature_extractor=CLIPFeatureExtractor.from_pretrained("openai/clip-vit-base-patch32"),
|
||||
)
|
||||
|
||||
pipeline.save_pretrained(
|
||||
args.output_dir,
|
||||
params={
|
||||
"text_encoder": get_params_to_save(text_encoder_state.params),
|
||||
"vae": get_params_to_save(vae_params),
|
||||
"unet": get_params_to_save(unet_state.params),
|
||||
"safety_checker": safety_checker.params,
|
||||
},
|
||||
)
|
||||
|
||||
if args.push_to_hub:
|
||||
repo.push_to_hub(commit_message="End of training", blocking=False, auto_lfs_prune=True)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -101,7 +101,7 @@ class ExamplesTestsAccelerate(unittest.TestCase):
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
test_args = f"""
|
||||
examples/textual_inversion/textual_inversion.py
|
||||
--pretrained_model_name_or_path runwayml/stable-diffusion-v1-5
|
||||
--pretrained_model_name_or_path CompVis/stable-diffusion-v1-4
|
||||
--train_data_dir docs/source/imgs
|
||||
--learnable_property object
|
||||
--placeholder_token <cat-toy>
|
||||
|
||||
@@ -4,10 +4,10 @@ The `train_text_to_image.py` script shows how to fine-tune stable diffusion mode
|
||||
|
||||
___Note___:
|
||||
|
||||
___This script is experimental. The script fine-tunes the whole model and often times the model overfits and runs into issues like catastrophic forgetting. It's recommended to try different hyperparamters to get the best result on your dataset.___
|
||||
___This script is experimental. The script fine-tunes the whole model and often times the model overifits and runs into issues like catastrophic forgetting. It's recommended to try different hyperparamters to get the best result on your dataset.___
|
||||
|
||||
|
||||
## Running locally with PyTorch
|
||||
## Running locally
|
||||
### Installing the dependencies
|
||||
|
||||
Before running the scripts, make sure to install the library's training dependencies:
|
||||
@@ -86,7 +86,6 @@ accelerate launch train_text_to_image.py \
|
||||
--output_dir="sd-pokemon-model"
|
||||
```
|
||||
|
||||
|
||||
Once the training is finished the model will be saved in the `output_dir` specified in the command. In this example it's `sd-pokemon-model`. To load the fine-tuned model for inference just pass that path to `StableDiffusionPipeline`
|
||||
|
||||
|
||||
@@ -100,54 +99,3 @@ pipe.to("cuda")
|
||||
image = pipe(prompt="yoda").images[0]
|
||||
image.save("yoda-pokemon.png")
|
||||
```
|
||||
|
||||
|
||||
|
||||
## Training with Flax/JAX
|
||||
|
||||
For faster training on TPUs and GPUs you can leverage the flax training example. Follow the instructions above to get the model and dataset before running the script.
|
||||
|
||||
____Note: The flax example don't yet support features like gradient checkpoint, gradient accumulation etc, so to use flax for faster training we will need >30GB cards.___
|
||||
|
||||
|
||||
Before running the scripts, make sure to install the library's training dependencies:
|
||||
|
||||
```bash
|
||||
pip install -U -r requirements_flax.txt
|
||||
```
|
||||
|
||||
```bash
|
||||
export MODEL_NAME="duongna/stable-diffusion-v1-4-flax"
|
||||
export dataset_name="lambdalabs/pokemon-blip-captions"
|
||||
|
||||
python train_text_to_image_flax.py \
|
||||
--pretrained_model_name_or_path=$MODEL_NAME \
|
||||
--dataset_name=$dataset_name \
|
||||
--resolution=512 --center_crop --random_flip \
|
||||
--train_batch_size=1 \
|
||||
--mixed_precision="fp16" \
|
||||
--max_train_steps=15000 \
|
||||
--learning_rate=1e-05 \
|
||||
--max_grad_norm=1 \
|
||||
--output_dir="sd-pokemon-model"
|
||||
```
|
||||
|
||||
|
||||
To run on your own training files prepare the dataset according to the format required by `datasets`, you can find the instructions for how to do that in this [document](https://huggingface.co/docs/datasets/v2.4.0/en/image_load#imagefolder-with-metadata).
|
||||
If you wish to use custom loading logic, you should modify the script, we have left pointers for that in the training script.
|
||||
|
||||
```bash
|
||||
export MODEL_NAME="duongna/stable-diffusion-v1-4-flax"
|
||||
export TRAIN_DIR="path_to_your_dataset"
|
||||
|
||||
python train_text_to_image_flax.py \
|
||||
--pretrained_model_name_or_path=$MODEL_NAME \
|
||||
--train_data_dir=$TRAIN_DIR \
|
||||
--resolution=512 --center_crop --random_flip \
|
||||
--train_batch_size=1 \
|
||||
--mixed_precision="fp16" \
|
||||
--max_train_steps=15000 \
|
||||
--learning_rate=1e-05 \
|
||||
--max_grad_norm=1 \
|
||||
--output_dir="sd-pokemon-model"
|
||||
```
|
||||
|
||||
@@ -1,9 +0,0 @@
|
||||
diffusers>==0.5.1
|
||||
transformers>=4.21.0
|
||||
flax
|
||||
optax
|
||||
torch
|
||||
torchvision
|
||||
ftfy
|
||||
tensorboard
|
||||
modelcards
|
||||
@@ -372,7 +372,11 @@ def main():
|
||||
weight_decay=args.adam_weight_decay,
|
||||
eps=args.adam_epsilon,
|
||||
)
|
||||
noise_scheduler = DDPMScheduler.from_config("CompVis/stable-diffusion-v1-4", subfolder="scheduler")
|
||||
|
||||
# TODO (patil-suraj): load scheduler using args
|
||||
noise_scheduler = DDPMScheduler(
|
||||
beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000, tensor_format="pt"
|
||||
)
|
||||
|
||||
# Get the datasets: you can either provide your own training and evaluation files (see below)
|
||||
# or specify a Dataset from the hub (the dataset will be downloaded automatically from the datasets Hub).
|
||||
@@ -605,7 +609,9 @@ def main():
|
||||
vae=vae,
|
||||
unet=unet,
|
||||
tokenizer=tokenizer,
|
||||
scheduler=PNDMScheduler.from_config("CompVis/stable-diffusion-v1-4", subfolder="scheduler"),
|
||||
scheduler=PNDMScheduler(
|
||||
beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", skip_prk_steps=True
|
||||
),
|
||||
safety_checker=StableDiffusionSafetyChecker.from_pretrained("CompVis/stable-diffusion-safety-checker"),
|
||||
feature_extractor=CLIPFeatureExtractor.from_pretrained("openai/clip-vit-base-patch32"),
|
||||
)
|
||||
|
||||
@@ -1,558 +0,0 @@
|
||||
import argparse
|
||||
import logging
|
||||
import math
|
||||
import os
|
||||
import random
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.utils.checkpoint
|
||||
|
||||
import jax
|
||||
import jax.numpy as jnp
|
||||
import optax
|
||||
import transformers
|
||||
from datasets import load_dataset
|
||||
from diffusers import (
|
||||
FlaxAutoencoderKL,
|
||||
FlaxDDPMScheduler,
|
||||
FlaxPNDMScheduler,
|
||||
FlaxStableDiffusionPipeline,
|
||||
FlaxUNet2DConditionModel,
|
||||
)
|
||||
from diffusers.pipelines.stable_diffusion import FlaxStableDiffusionSafetyChecker
|
||||
from flax import jax_utils
|
||||
from flax.training import train_state
|
||||
from flax.training.common_utils import shard
|
||||
from huggingface_hub import HfFolder, Repository, whoami
|
||||
from torchvision import transforms
|
||||
from tqdm.auto import tqdm
|
||||
from transformers import CLIPFeatureExtractor, CLIPTokenizer, FlaxCLIPTextModel, set_seed
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def parse_args():
|
||||
parser = argparse.ArgumentParser(description="Simple example of a training script.")
|
||||
parser.add_argument(
|
||||
"--pretrained_model_name_or_path",
|
||||
type=str,
|
||||
default=None,
|
||||
required=True,
|
||||
help="Path to pretrained model or model identifier from huggingface.co/models.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--dataset_name",
|
||||
type=str,
|
||||
default=None,
|
||||
help=(
|
||||
"The name of the Dataset (from the HuggingFace hub) to train on (could be your own, possibly private,"
|
||||
" dataset). It can also be a path pointing to a local copy of a dataset in your filesystem,"
|
||||
" or to a folder containing files that 🤗 Datasets can understand."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--dataset_config_name",
|
||||
type=str,
|
||||
default=None,
|
||||
help="The config of the Dataset, leave as None if there's only one config.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--train_data_dir",
|
||||
type=str,
|
||||
default=None,
|
||||
help=(
|
||||
"A folder containing the training data. Folder contents must follow the structure described in"
|
||||
" https://huggingface.co/docs/datasets/image_dataset#imagefolder. In particular, a `metadata.jsonl` file"
|
||||
" must exist to provide the captions for the images. Ignored if `dataset_name` is specified."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--image_column", type=str, default="image", help="The column of the dataset containing an image."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--caption_column",
|
||||
type=str,
|
||||
default="text",
|
||||
help="The column of the dataset containing a caption or a list of captions.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--max_train_samples",
|
||||
type=int,
|
||||
default=None,
|
||||
help=(
|
||||
"For debugging purposes or quicker training, truncate the number of training examples to this "
|
||||
"value if set."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output_dir",
|
||||
type=str,
|
||||
default="sd-model-finetuned",
|
||||
help="The output directory where the model predictions and checkpoints will be written.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--cache_dir",
|
||||
type=str,
|
||||
default=None,
|
||||
help="The directory where the downloaded models and datasets will be stored.",
|
||||
)
|
||||
parser.add_argument("--seed", type=int, default=0, help="A seed for reproducible training.")
|
||||
parser.add_argument(
|
||||
"--resolution",
|
||||
type=int,
|
||||
default=512,
|
||||
help=(
|
||||
"The resolution for input images, all the images in the train/validation dataset will be resized to this"
|
||||
" resolution"
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--center_crop",
|
||||
action="store_true",
|
||||
help="Whether to center crop images before resizing to resolution (if not set, random crop will be used)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--random_flip",
|
||||
action="store_true",
|
||||
help="whether to randomly flip images horizontally",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--train_batch_size", type=int, default=16, help="Batch size (per device) for the training dataloader."
|
||||
)
|
||||
parser.add_argument("--num_train_epochs", type=int, default=100)
|
||||
parser.add_argument(
|
||||
"--max_train_steps",
|
||||
type=int,
|
||||
default=None,
|
||||
help="Total number of training steps to perform. If provided, overrides num_train_epochs.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--learning_rate",
|
||||
type=float,
|
||||
default=1e-4,
|
||||
help="Initial learning rate (after the potential warmup period) to use.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--scale_lr",
|
||||
action="store_true",
|
||||
default=False,
|
||||
help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--lr_scheduler",
|
||||
type=str,
|
||||
default="constant",
|
||||
help=(
|
||||
'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
|
||||
' "constant", "constant_with_warmup"]'
|
||||
),
|
||||
)
|
||||
parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
|
||||
parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
|
||||
parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
|
||||
parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
|
||||
parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
|
||||
parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
|
||||
parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
|
||||
parser.add_argument(
|
||||
"--hub_model_id",
|
||||
type=str,
|
||||
default=None,
|
||||
help="The name of the repository to keep in sync with the local `output_dir`.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--logging_dir",
|
||||
type=str,
|
||||
default="logs",
|
||||
help=(
|
||||
"[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
|
||||
" *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--report_to",
|
||||
type=str,
|
||||
default="tensorboard",
|
||||
help=(
|
||||
'The integration to report the results and logs to. Supported platforms are `"tensorboard"`,'
|
||||
' `"wandb"` and `"comet_ml"`. Use `"all"` (default) to report to all integrations.'
|
||||
"Only applicable when `--with_tracking` is passed."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--mixed_precision",
|
||||
type=str,
|
||||
default="no",
|
||||
choices=["no", "fp16", "bf16"],
|
||||
help=(
|
||||
"Whether to use mixed precision. Choose"
|
||||
"between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10."
|
||||
"and an Nvidia Ampere GPU."
|
||||
),
|
||||
)
|
||||
parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
|
||||
|
||||
args = parser.parse_args()
|
||||
env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
|
||||
if env_local_rank != -1 and env_local_rank != args.local_rank:
|
||||
args.local_rank = env_local_rank
|
||||
|
||||
# Sanity checks
|
||||
if args.dataset_name is None and args.train_data_dir is None:
|
||||
raise ValueError("Need either a dataset name or a training folder.")
|
||||
|
||||
return args
|
||||
|
||||
|
||||
def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: Optional[str] = None):
|
||||
if token is None:
|
||||
token = HfFolder.get_token()
|
||||
if organization is None:
|
||||
username = whoami(token)["name"]
|
||||
return f"{username}/{model_id}"
|
||||
else:
|
||||
return f"{organization}/{model_id}"
|
||||
|
||||
|
||||
dataset_name_mapping = {
|
||||
"lambdalabs/pokemon-blip-captions": ("image", "text"),
|
||||
}
|
||||
|
||||
|
||||
def get_params_to_save(params):
|
||||
return jax.device_get(jax.tree_util.tree_map(lambda x: x[0], params))
|
||||
|
||||
|
||||
def main():
|
||||
args = parse_args()
|
||||
|
||||
logging.basicConfig(
|
||||
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
|
||||
datefmt="%m/%d/%Y %H:%M:%S",
|
||||
level=logging.INFO,
|
||||
)
|
||||
# Setup logging, we only want one process per machine to log things on the screen.
|
||||
logger.setLevel(logging.INFO if jax.process_index() == 0 else logging.ERROR)
|
||||
if jax.process_index() == 0:
|
||||
transformers.utils.logging.set_verbosity_info()
|
||||
else:
|
||||
transformers.utils.logging.set_verbosity_error()
|
||||
|
||||
if args.seed is not None:
|
||||
set_seed(args.seed)
|
||||
|
||||
# Handle the repository creation
|
||||
if jax.process_index() == 0:
|
||||
if args.push_to_hub:
|
||||
if args.hub_model_id is None:
|
||||
repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token)
|
||||
else:
|
||||
repo_name = args.hub_model_id
|
||||
repo = Repository(args.output_dir, clone_from=repo_name)
|
||||
|
||||
with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore:
|
||||
if "step_*" not in gitignore:
|
||||
gitignore.write("step_*\n")
|
||||
if "epoch_*" not in gitignore:
|
||||
gitignore.write("epoch_*\n")
|
||||
elif args.output_dir is not None:
|
||||
os.makedirs(args.output_dir, exist_ok=True)
|
||||
|
||||
# Get the datasets: you can either provide your own training and evaluation files (see below)
|
||||
# or specify a Dataset from the hub (the dataset will be downloaded automatically from the datasets Hub).
|
||||
|
||||
# In distributed training, the load_dataset function guarantees that only one local process can concurrently
|
||||
# download the dataset.
|
||||
if args.dataset_name is not None:
|
||||
# Downloading and loading a dataset from the hub.
|
||||
dataset = load_dataset(
|
||||
args.dataset_name,
|
||||
args.dataset_config_name,
|
||||
cache_dir=args.cache_dir,
|
||||
)
|
||||
else:
|
||||
data_files = {}
|
||||
if args.train_data_dir is not None:
|
||||
data_files["train"] = os.path.join(args.train_data_dir, "**")
|
||||
dataset = load_dataset(
|
||||
"imagefolder",
|
||||
data_files=data_files,
|
||||
cache_dir=args.cache_dir,
|
||||
)
|
||||
# See more about loading custom images at
|
||||
# https://huggingface.co/docs/datasets/v2.4.0/en/image_load#imagefolder
|
||||
|
||||
# Preprocessing the datasets.
|
||||
# We need to tokenize inputs and targets.
|
||||
column_names = dataset["train"].column_names
|
||||
|
||||
# 6. Get the column names for input/target.
|
||||
dataset_columns = dataset_name_mapping.get(args.dataset_name, None)
|
||||
if args.image_column is None:
|
||||
image_column = dataset_columns[0] if dataset_columns is not None else column_names[0]
|
||||
else:
|
||||
image_column = args.image_column
|
||||
if image_column not in column_names:
|
||||
raise ValueError(
|
||||
f"--image_column' value '{args.image_column}' needs to be one of: {', '.join(column_names)}"
|
||||
)
|
||||
if args.caption_column is None:
|
||||
caption_column = dataset_columns[1] if dataset_columns is not None else column_names[1]
|
||||
else:
|
||||
caption_column = args.caption_column
|
||||
if caption_column not in column_names:
|
||||
raise ValueError(
|
||||
f"--caption_column' value '{args.caption_column}' needs to be one of: {', '.join(column_names)}"
|
||||
)
|
||||
|
||||
# Preprocessing the datasets.
|
||||
# We need to tokenize input captions and transform the images.
|
||||
def tokenize_captions(examples, is_train=True):
|
||||
captions = []
|
||||
for caption in examples[caption_column]:
|
||||
if isinstance(caption, str):
|
||||
captions.append(caption)
|
||||
elif isinstance(caption, (list, np.ndarray)):
|
||||
# take a random caption if there are multiple
|
||||
captions.append(random.choice(caption) if is_train else caption[0])
|
||||
else:
|
||||
raise ValueError(
|
||||
f"Caption column `{caption_column}` should contain either strings or lists of strings."
|
||||
)
|
||||
inputs = tokenizer(captions, max_length=tokenizer.model_max_length, padding="do_not_pad", truncation=True)
|
||||
input_ids = inputs.input_ids
|
||||
return input_ids
|
||||
|
||||
train_transforms = transforms.Compose(
|
||||
[
|
||||
transforms.Resize((args.resolution, args.resolution), interpolation=transforms.InterpolationMode.BILINEAR),
|
||||
transforms.CenterCrop(args.resolution) if args.center_crop else transforms.RandomCrop(args.resolution),
|
||||
transforms.RandomHorizontalFlip() if args.random_flip else transforms.Lambda(lambda x: x),
|
||||
transforms.ToTensor(),
|
||||
transforms.Normalize([0.5], [0.5]),
|
||||
]
|
||||
)
|
||||
|
||||
def preprocess_train(examples):
|
||||
images = [image.convert("RGB") for image in examples[image_column]]
|
||||
examples["pixel_values"] = [train_transforms(image) for image in images]
|
||||
examples["input_ids"] = tokenize_captions(examples)
|
||||
|
||||
return examples
|
||||
|
||||
if jax.process_index() == 0:
|
||||
if args.max_train_samples is not None:
|
||||
dataset["train"] = dataset["train"].shuffle(seed=args.seed).select(range(args.max_train_samples))
|
||||
# Set the training transforms
|
||||
train_dataset = dataset["train"].with_transform(preprocess_train)
|
||||
|
||||
def collate_fn(examples):
|
||||
pixel_values = torch.stack([example["pixel_values"] for example in examples])
|
||||
pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float()
|
||||
input_ids = [example["input_ids"] for example in examples]
|
||||
|
||||
padded_tokens = tokenizer.pad(
|
||||
{"input_ids": input_ids}, padding="max_length", max_length=tokenizer.model_max_length, return_tensors="pt"
|
||||
)
|
||||
batch = {
|
||||
"pixel_values": pixel_values,
|
||||
"input_ids": padded_tokens.input_ids,
|
||||
}
|
||||
batch = {k: v.numpy() for k, v in batch.items()}
|
||||
|
||||
return batch
|
||||
|
||||
total_train_batch_size = args.train_batch_size * jax.local_device_count()
|
||||
train_dataloader = torch.utils.data.DataLoader(
|
||||
train_dataset, shuffle=True, collate_fn=collate_fn, batch_size=total_train_batch_size, drop_last=True
|
||||
)
|
||||
|
||||
weight_dtype = jnp.float32
|
||||
if args.mixed_precision == "fp16":
|
||||
weight_dtype = jnp.float16
|
||||
elif args.mixed_precision == "bf16":
|
||||
weight_dtype = jnp.bfloat16
|
||||
|
||||
# Load models and create wrapper for stable diffusion
|
||||
tokenizer = CLIPTokenizer.from_pretrained(args.pretrained_model_name_or_path, subfolder="tokenizer")
|
||||
text_encoder = FlaxCLIPTextModel.from_pretrained("openai/clip-vit-large-patch14", dtype=weight_dtype)
|
||||
vae, vae_params = FlaxAutoencoderKL.from_pretrained(
|
||||
args.pretrained_model_name_or_path, subfolder="vae", dtype=weight_dtype
|
||||
)
|
||||
unet, unet_params = FlaxUNet2DConditionModel.from_pretrained(
|
||||
args.pretrained_model_name_or_path, subfolder="unet", dtype=weight_dtype
|
||||
)
|
||||
|
||||
# Optimization
|
||||
if args.scale_lr:
|
||||
args.learning_rate = args.learning_rate * total_train_batch_size
|
||||
|
||||
constant_scheduler = optax.constant_schedule(args.learning_rate)
|
||||
|
||||
adamw = optax.adamw(
|
||||
learning_rate=constant_scheduler,
|
||||
b1=args.adam_beta1,
|
||||
b2=args.adam_beta2,
|
||||
eps=args.adam_epsilon,
|
||||
weight_decay=args.adam_weight_decay,
|
||||
)
|
||||
|
||||
optimizer = optax.chain(
|
||||
optax.clip_by_global_norm(args.max_grad_norm),
|
||||
adamw,
|
||||
)
|
||||
|
||||
state = train_state.TrainState.create(apply_fn=unet.__call__, params=unet_params, tx=optimizer)
|
||||
|
||||
noise_scheduler = FlaxDDPMScheduler(
|
||||
beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000
|
||||
)
|
||||
|
||||
# Initialize our training
|
||||
rng = jax.random.PRNGKey(args.seed)
|
||||
train_rngs = jax.random.split(rng, jax.local_device_count())
|
||||
|
||||
def train_step(state, text_encoder_params, vae_params, batch, train_rng):
|
||||
dropout_rng, sample_rng, new_train_rng = jax.random.split(train_rng, 3)
|
||||
|
||||
def compute_loss(params):
|
||||
# Convert images to latent space
|
||||
vae_outputs = vae.apply(
|
||||
{"params": vae_params}, batch["pixel_values"], deterministic=True, method=vae.encode
|
||||
)
|
||||
latents = vae_outputs.latent_dist.sample(sample_rng)
|
||||
# (NHWC) -> (NCHW)
|
||||
latents = jnp.transpose(latents, (0, 3, 1, 2))
|
||||
latents = latents * 0.18215
|
||||
|
||||
# Sample noise that we'll add to the latents
|
||||
noise_rng, timestep_rng = jax.random.split(sample_rng)
|
||||
noise = jax.random.normal(noise_rng, latents.shape)
|
||||
# Sample a random timestep for each image
|
||||
bsz = latents.shape[0]
|
||||
timesteps = jax.random.randint(
|
||||
timestep_rng,
|
||||
(bsz,),
|
||||
0,
|
||||
noise_scheduler.config.num_train_timesteps,
|
||||
)
|
||||
|
||||
# Add noise to the latents according to the noise magnitude at each timestep
|
||||
# (this is the forward diffusion process)
|
||||
noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
|
||||
|
||||
# Get the text embedding for conditioning
|
||||
encoder_hidden_states = text_encoder(
|
||||
batch["input_ids"],
|
||||
params=text_encoder_params,
|
||||
train=False,
|
||||
)[0]
|
||||
|
||||
# Predict the noise residual and compute loss
|
||||
unet_outputs = unet.apply({"params": params}, noisy_latents, timesteps, encoder_hidden_states, train=True)
|
||||
noise_pred = unet_outputs.sample
|
||||
loss = (noise - noise_pred) ** 2
|
||||
loss = loss.mean()
|
||||
|
||||
return loss
|
||||
|
||||
grad_fn = jax.value_and_grad(compute_loss)
|
||||
loss, grad = grad_fn(state.params)
|
||||
grad = jax.lax.pmean(grad, "batch")
|
||||
|
||||
new_state = state.apply_gradients(grads=grad)
|
||||
|
||||
metrics = {"loss": loss}
|
||||
metrics = jax.lax.pmean(metrics, axis_name="batch")
|
||||
|
||||
return new_state, metrics, new_train_rng
|
||||
|
||||
# Create parallel version of the train step
|
||||
p_train_step = jax.pmap(train_step, "batch", donate_argnums=(0,))
|
||||
|
||||
# Replicate the train state on each device
|
||||
state = jax_utils.replicate(state)
|
||||
text_encoder_params = jax_utils.replicate(text_encoder.params)
|
||||
vae_params = jax_utils.replicate(vae_params)
|
||||
|
||||
# Train!
|
||||
num_update_steps_per_epoch = math.ceil(len(train_dataloader))
|
||||
|
||||
# Scheduler and math around the number of training steps.
|
||||
if args.max_train_steps is None:
|
||||
args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
|
||||
|
||||
args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
|
||||
|
||||
logger.info("***** Running training *****")
|
||||
logger.info(f" Num examples = {len(train_dataset)}")
|
||||
logger.info(f" Num Epochs = {args.num_train_epochs}")
|
||||
logger.info(f" Instantaneous batch size per device = {args.train_batch_size}")
|
||||
logger.info(f" Total train batch size (w. parallel & distributed) = {total_train_batch_size}")
|
||||
logger.info(f" Total optimization steps = {args.max_train_steps}")
|
||||
|
||||
global_step = 0
|
||||
|
||||
epochs = tqdm(range(args.num_train_epochs), desc="Epoch ... ", position=0)
|
||||
for epoch in epochs:
|
||||
# ======================== Training ================================
|
||||
|
||||
train_metrics = []
|
||||
|
||||
steps_per_epoch = len(train_dataset) // total_train_batch_size
|
||||
train_step_progress_bar = tqdm(total=steps_per_epoch, desc="Training...", position=1, leave=False)
|
||||
# train
|
||||
for batch in train_dataloader:
|
||||
batch = shard(batch)
|
||||
state, train_metric, train_rngs = p_train_step(state, text_encoder_params, vae_params, batch, train_rngs)
|
||||
train_metrics.append(train_metric)
|
||||
|
||||
train_step_progress_bar.update(1)
|
||||
|
||||
global_step += 1
|
||||
if global_step >= args.max_train_steps:
|
||||
break
|
||||
|
||||
train_metric = jax_utils.unreplicate(train_metric)
|
||||
|
||||
train_step_progress_bar.close()
|
||||
epochs.write(f"Epoch... ({epoch + 1}/{args.num_train_epochs} | Loss: {train_metric['loss']})")
|
||||
|
||||
# Create the pipeline using using the trained modules and save it.
|
||||
if jax.process_index() == 0:
|
||||
scheduler = FlaxPNDMScheduler(
|
||||
beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", skip_prk_steps=True
|
||||
)
|
||||
safety_checker = FlaxStableDiffusionSafetyChecker.from_pretrained(
|
||||
"CompVis/stable-diffusion-safety-checker", from_pt=True
|
||||
)
|
||||
pipeline = FlaxStableDiffusionPipeline(
|
||||
text_encoder=text_encoder,
|
||||
vae=vae,
|
||||
unet=unet,
|
||||
tokenizer=tokenizer,
|
||||
scheduler=scheduler,
|
||||
safety_checker=safety_checker,
|
||||
feature_extractor=CLIPFeatureExtractor.from_pretrained("openai/clip-vit-base-patch32"),
|
||||
)
|
||||
|
||||
pipeline.save_pretrained(
|
||||
args.output_dir,
|
||||
params={
|
||||
"text_encoder": get_params_to_save(text_encoder_params),
|
||||
"vae": get_params_to_save(vae_params),
|
||||
"unet": get_params_to_save(state.params),
|
||||
"safety_checker": safety_checker.params,
|
||||
},
|
||||
)
|
||||
|
||||
if args.push_to_hub:
|
||||
repo.push_to_hub(commit_message="End of training", blocking=False, auto_lfs_prune=True)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -11,7 +11,7 @@ Colab for training
|
||||
Colab for inference
|
||||
[](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/stable_conceptualizer_inference.ipynb)
|
||||
|
||||
## Running locally with PyTorch
|
||||
## Running locally
|
||||
### Installing the dependencies
|
||||
|
||||
Before running the scripts, make sure to install the library's training dependencies:
|
||||
@@ -29,7 +29,7 @@ accelerate config
|
||||
|
||||
### Cat toy example
|
||||
|
||||
You need to accept the model license before downloading or using the weights. In this example we'll use model version `v1-5`, so you'll need to visit [its card](https://huggingface.co/runwayml/stable-diffusion-v1-5), read the license and tick the checkbox if you agree.
|
||||
You need to accept the model license before downloading or using the weights. In this example we'll use model version `v1-4`, so you'll need to visit [its card](https://huggingface.co/CompVis/stable-diffusion-v1-4), read the license and tick the checkbox if you agree.
|
||||
|
||||
You have to be a registered user in 🤗 Hugging Face Hub, and you'll also need to use an access token for the code to work. For more information on access tokens, please refer to [this section of the documentation](https://huggingface.co/docs/hub/security-tokens).
|
||||
|
||||
@@ -48,7 +48,7 @@ Now let's get our dataset.Download 3-4 images from [here](https://drive.google.c
|
||||
And launch the training using
|
||||
|
||||
```bash
|
||||
export MODEL_NAME="runwayml/stable-diffusion-v1-5"
|
||||
export MODEL_NAME="CompVis/stable-diffusion-v1-4"
|
||||
export DATA_DIR="path-to-dir-containing-images"
|
||||
|
||||
accelerate launch textual_inversion.py \
|
||||
@@ -68,6 +68,7 @@ accelerate launch textual_inversion.py \
|
||||
|
||||
A full training run takes ~1 hour on one V100 GPU.
|
||||
|
||||
|
||||
### Inference
|
||||
|
||||
Once you have trained a model using above command, the inference can be done simply using the `StableDiffusionPipeline`. Make sure to include the `placeholder_token` in your prompt.
|
||||
@@ -84,31 +85,3 @@ image = pipe(prompt, num_inference_steps=50, guidance_scale=7.5).images[0]
|
||||
|
||||
image.save("cat-backpack.png")
|
||||
```
|
||||
|
||||
|
||||
## Training with Flax/JAX
|
||||
|
||||
For faster training on TPUs and GPUs you can leverage the flax training example. Follow the instructions above to get the model and dataset before running the script.
|
||||
|
||||
Before running the scripts, make sure to install the library's training dependencies:
|
||||
|
||||
```bash
|
||||
pip install -U -r requirements_flax.txt
|
||||
```
|
||||
|
||||
```bash
|
||||
export MODEL_NAME="duongna/stable-diffusion-v1-4-flax"
|
||||
export DATA_DIR="path-to-dir-containing-images"
|
||||
|
||||
python textual_inversion_flax.py \
|
||||
--pretrained_model_name_or_path=$MODEL_NAME \
|
||||
--train_data_dir=$DATA_DIR \
|
||||
--learnable_property="object" \
|
||||
--placeholder_token="<cat-toy>" --initializer_token="toy" \
|
||||
--resolution=512 \
|
||||
--train_batch_size=1 \
|
||||
--max_train_steps=3000 \
|
||||
--learning_rate=5.0e-04 --scale_lr \
|
||||
--output_dir="textual_inversion_cat"
|
||||
```
|
||||
It should be at least 70% faster than the PyTorch script with the same configuration.
|
||||
|
||||
@@ -1,9 +0,0 @@
|
||||
diffusers>==0.5.1
|
||||
transformers>=4.21.0
|
||||
flax
|
||||
optax
|
||||
torch
|
||||
torchvision
|
||||
ftfy
|
||||
tensorboard
|
||||
modelcards
|
||||
@@ -419,7 +419,13 @@ def main():
|
||||
eps=args.adam_epsilon,
|
||||
)
|
||||
|
||||
noise_scheduler = DDPMScheduler.from_config("CompVis/stable-diffusion-v1-4", subfolder="scheduler")
|
||||
# TODO (patil-suraj): load scheduler using args
|
||||
noise_scheduler = DDPMScheduler(
|
||||
beta_start=0.00085,
|
||||
beta_end=0.012,
|
||||
beta_schedule="scaled_linear",
|
||||
num_train_timesteps=1000,
|
||||
)
|
||||
|
||||
train_dataset = TextualInversionDataset(
|
||||
data_root=args.train_data_dir,
|
||||
@@ -552,7 +558,9 @@ def main():
|
||||
vae=vae,
|
||||
unet=unet,
|
||||
tokenizer=tokenizer,
|
||||
scheduler=PNDMScheduler.from_config("CompVis/stable-diffusion-v1-4", subfolder="scheduler"),
|
||||
scheduler=PNDMScheduler(
|
||||
beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", skip_prk_steps=True
|
||||
),
|
||||
safety_checker=StableDiffusionSafetyChecker.from_pretrained("CompVis/stable-diffusion-safety-checker"),
|
||||
feature_extractor=CLIPFeatureExtractor.from_pretrained("openai/clip-vit-base-patch32"),
|
||||
)
|
||||
|
||||
@@ -1,629 +0,0 @@
|
||||
import argparse
|
||||
import logging
|
||||
import math
|
||||
import os
|
||||
import random
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.utils.checkpoint
|
||||
from torch.utils.data import Dataset
|
||||
|
||||
import jax
|
||||
import jax.numpy as jnp
|
||||
import optax
|
||||
import PIL
|
||||
import transformers
|
||||
from diffusers import (
|
||||
FlaxAutoencoderKL,
|
||||
FlaxDDPMScheduler,
|
||||
FlaxPNDMScheduler,
|
||||
FlaxStableDiffusionPipeline,
|
||||
FlaxUNet2DConditionModel,
|
||||
)
|
||||
from diffusers.pipelines.stable_diffusion import FlaxStableDiffusionSafetyChecker
|
||||
from flax import jax_utils
|
||||
from flax.training import train_state
|
||||
from flax.training.common_utils import shard
|
||||
from huggingface_hub import HfFolder, Repository, whoami
|
||||
from PIL import Image
|
||||
from torchvision import transforms
|
||||
from tqdm.auto import tqdm
|
||||
from transformers import CLIPFeatureExtractor, CLIPTokenizer, FlaxCLIPTextModel, set_seed
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def parse_args():
|
||||
parser = argparse.ArgumentParser(description="Simple example of a training script.")
|
||||
parser.add_argument(
|
||||
"--pretrained_model_name_or_path",
|
||||
type=str,
|
||||
default=None,
|
||||
required=True,
|
||||
help="Path to pretrained model or model identifier from huggingface.co/models.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--tokenizer_name",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Pretrained tokenizer name or path if not the same as model_name",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--train_data_dir", type=str, default=None, required=True, help="A folder containing the training data."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--placeholder_token",
|
||||
type=str,
|
||||
default=None,
|
||||
required=True,
|
||||
help="A token to use as a placeholder for the concept.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--initializer_token", type=str, default=None, required=True, help="A token to use as initializer word."
|
||||
)
|
||||
parser.add_argument("--learnable_property", type=str, default="object", help="Choose between 'object' and 'style'")
|
||||
parser.add_argument("--repeats", type=int, default=100, help="How many times to repeat the training data.")
|
||||
parser.add_argument(
|
||||
"--output_dir",
|
||||
type=str,
|
||||
default="text-inversion-model",
|
||||
help="The output directory where the model predictions and checkpoints will be written.",
|
||||
)
|
||||
parser.add_argument("--seed", type=int, default=42, help="A seed for reproducible training.")
|
||||
parser.add_argument(
|
||||
"--resolution",
|
||||
type=int,
|
||||
default=512,
|
||||
help=(
|
||||
"The resolution for input images, all the images in the train/validation dataset will be resized to this"
|
||||
" resolution"
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--center_crop", action="store_true", help="Whether to center crop images before resizing to resolution"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--train_batch_size", type=int, default=16, help="Batch size (per device) for the training dataloader."
|
||||
)
|
||||
parser.add_argument("--num_train_epochs", type=int, default=100)
|
||||
parser.add_argument(
|
||||
"--max_train_steps",
|
||||
type=int,
|
||||
default=5000,
|
||||
help="Total number of training steps to perform. If provided, overrides num_train_epochs.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--learning_rate",
|
||||
type=float,
|
||||
default=1e-4,
|
||||
help="Initial learning rate (after the potential warmup period) to use.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--scale_lr",
|
||||
action="store_true",
|
||||
default=True,
|
||||
help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--lr_scheduler",
|
||||
type=str,
|
||||
default="constant",
|
||||
help=(
|
||||
'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
|
||||
' "constant", "constant_with_warmup"]'
|
||||
),
|
||||
)
|
||||
parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
|
||||
parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
|
||||
parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
|
||||
parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
|
||||
parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
|
||||
parser.add_argument(
|
||||
"--use_auth_token",
|
||||
action="store_true",
|
||||
help=(
|
||||
"Will use the token generated when running `huggingface-cli login` (necessary to use this script with"
|
||||
" private models)."
|
||||
),
|
||||
)
|
||||
parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
|
||||
parser.add_argument(
|
||||
"--hub_model_id",
|
||||
type=str,
|
||||
default=None,
|
||||
help="The name of the repository to keep in sync with the local `output_dir`.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--logging_dir",
|
||||
type=str,
|
||||
default="logs",
|
||||
help=(
|
||||
"[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
|
||||
" *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
|
||||
),
|
||||
)
|
||||
parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
|
||||
|
||||
args = parser.parse_args()
|
||||
env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
|
||||
if env_local_rank != -1 and env_local_rank != args.local_rank:
|
||||
args.local_rank = env_local_rank
|
||||
|
||||
if args.train_data_dir is None:
|
||||
raise ValueError("You must specify a train data directory.")
|
||||
|
||||
return args
|
||||
|
||||
|
||||
imagenet_templates_small = [
|
||||
"a photo of a {}",
|
||||
"a rendering of a {}",
|
||||
"a cropped photo of the {}",
|
||||
"the photo of a {}",
|
||||
"a photo of a clean {}",
|
||||
"a photo of a dirty {}",
|
||||
"a dark photo of the {}",
|
||||
"a photo of my {}",
|
||||
"a photo of the cool {}",
|
||||
"a close-up photo of a {}",
|
||||
"a bright photo of the {}",
|
||||
"a cropped photo of a {}",
|
||||
"a photo of the {}",
|
||||
"a good photo of the {}",
|
||||
"a photo of one {}",
|
||||
"a close-up photo of the {}",
|
||||
"a rendition of the {}",
|
||||
"a photo of the clean {}",
|
||||
"a rendition of a {}",
|
||||
"a photo of a nice {}",
|
||||
"a good photo of a {}",
|
||||
"a photo of the nice {}",
|
||||
"a photo of the small {}",
|
||||
"a photo of the weird {}",
|
||||
"a photo of the large {}",
|
||||
"a photo of a cool {}",
|
||||
"a photo of a small {}",
|
||||
]
|
||||
|
||||
imagenet_style_templates_small = [
|
||||
"a painting in the style of {}",
|
||||
"a rendering in the style of {}",
|
||||
"a cropped painting in the style of {}",
|
||||
"the painting in the style of {}",
|
||||
"a clean painting in the style of {}",
|
||||
"a dirty painting in the style of {}",
|
||||
"a dark painting in the style of {}",
|
||||
"a picture in the style of {}",
|
||||
"a cool painting in the style of {}",
|
||||
"a close-up painting in the style of {}",
|
||||
"a bright painting in the style of {}",
|
||||
"a cropped painting in the style of {}",
|
||||
"a good painting in the style of {}",
|
||||
"a close-up painting in the style of {}",
|
||||
"a rendition in the style of {}",
|
||||
"a nice painting in the style of {}",
|
||||
"a small painting in the style of {}",
|
||||
"a weird painting in the style of {}",
|
||||
"a large painting in the style of {}",
|
||||
]
|
||||
|
||||
|
||||
class TextualInversionDataset(Dataset):
|
||||
def __init__(
|
||||
self,
|
||||
data_root,
|
||||
tokenizer,
|
||||
learnable_property="object", # [object, style]
|
||||
size=512,
|
||||
repeats=100,
|
||||
interpolation="bicubic",
|
||||
flip_p=0.5,
|
||||
set="train",
|
||||
placeholder_token="*",
|
||||
center_crop=False,
|
||||
):
|
||||
self.data_root = data_root
|
||||
self.tokenizer = tokenizer
|
||||
self.learnable_property = learnable_property
|
||||
self.size = size
|
||||
self.placeholder_token = placeholder_token
|
||||
self.center_crop = center_crop
|
||||
self.flip_p = flip_p
|
||||
|
||||
self.image_paths = [os.path.join(self.data_root, file_path) for file_path in os.listdir(self.data_root)]
|
||||
|
||||
self.num_images = len(self.image_paths)
|
||||
self._length = self.num_images
|
||||
|
||||
if set == "train":
|
||||
self._length = self.num_images * repeats
|
||||
|
||||
self.interpolation = {
|
||||
"linear": PIL.Image.LINEAR,
|
||||
"bilinear": PIL.Image.BILINEAR,
|
||||
"bicubic": PIL.Image.BICUBIC,
|
||||
"lanczos": PIL.Image.LANCZOS,
|
||||
}[interpolation]
|
||||
|
||||
self.templates = imagenet_style_templates_small if learnable_property == "style" else imagenet_templates_small
|
||||
self.flip_transform = transforms.RandomHorizontalFlip(p=self.flip_p)
|
||||
|
||||
def __len__(self):
|
||||
return self._length
|
||||
|
||||
def __getitem__(self, i):
|
||||
example = {}
|
||||
image = Image.open(self.image_paths[i % self.num_images])
|
||||
|
||||
if not image.mode == "RGB":
|
||||
image = image.convert("RGB")
|
||||
|
||||
placeholder_string = self.placeholder_token
|
||||
text = random.choice(self.templates).format(placeholder_string)
|
||||
|
||||
example["input_ids"] = self.tokenizer(
|
||||
text,
|
||||
padding="max_length",
|
||||
truncation=True,
|
||||
max_length=self.tokenizer.model_max_length,
|
||||
return_tensors="pt",
|
||||
).input_ids[0]
|
||||
|
||||
# default to score-sde preprocessing
|
||||
img = np.array(image).astype(np.uint8)
|
||||
|
||||
if self.center_crop:
|
||||
crop = min(img.shape[0], img.shape[1])
|
||||
h, w, = (
|
||||
img.shape[0],
|
||||
img.shape[1],
|
||||
)
|
||||
img = img[(h - crop) // 2 : (h + crop) // 2, (w - crop) // 2 : (w + crop) // 2]
|
||||
|
||||
image = Image.fromarray(img)
|
||||
image = image.resize((self.size, self.size), resample=self.interpolation)
|
||||
|
||||
image = self.flip_transform(image)
|
||||
image = np.array(image).astype(np.uint8)
|
||||
image = (image / 127.5 - 1.0).astype(np.float32)
|
||||
|
||||
example["pixel_values"] = torch.from_numpy(image).permute(2, 0, 1)
|
||||
return example
|
||||
|
||||
|
||||
def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: Optional[str] = None):
|
||||
if token is None:
|
||||
token = HfFolder.get_token()
|
||||
if organization is None:
|
||||
username = whoami(token)["name"]
|
||||
return f"{username}/{model_id}"
|
||||
else:
|
||||
return f"{organization}/{model_id}"
|
||||
|
||||
|
||||
def resize_token_embeddings(model, new_num_tokens, initializer_token_id, placeholder_token_id, rng):
|
||||
if model.config.vocab_size == new_num_tokens or new_num_tokens is None:
|
||||
return
|
||||
model.config.vocab_size = new_num_tokens
|
||||
|
||||
params = model.params
|
||||
old_embeddings = params["text_model"]["embeddings"]["token_embedding"]["embedding"]
|
||||
old_num_tokens, emb_dim = old_embeddings.shape
|
||||
|
||||
initializer = jax.nn.initializers.normal()
|
||||
|
||||
new_embeddings = initializer(rng, (new_num_tokens, emb_dim))
|
||||
new_embeddings = new_embeddings.at[:old_num_tokens].set(old_embeddings)
|
||||
new_embeddings = new_embeddings.at[placeholder_token_id].set(new_embeddings[initializer_token_id])
|
||||
params["text_model"]["embeddings"]["token_embedding"]["embedding"] = new_embeddings
|
||||
|
||||
model.params = params
|
||||
return model
|
||||
|
||||
|
||||
def get_params_to_save(params):
|
||||
return jax.device_get(jax.tree_util.tree_map(lambda x: x[0], params))
|
||||
|
||||
|
||||
def main():
|
||||
args = parse_args()
|
||||
|
||||
if args.seed is not None:
|
||||
set_seed(args.seed)
|
||||
|
||||
if jax.process_index() == 0:
|
||||
if args.push_to_hub:
|
||||
if args.hub_model_id is None:
|
||||
repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token)
|
||||
else:
|
||||
repo_name = args.hub_model_id
|
||||
repo = Repository(args.output_dir, clone_from=repo_name)
|
||||
|
||||
with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore:
|
||||
if "step_*" not in gitignore:
|
||||
gitignore.write("step_*\n")
|
||||
if "epoch_*" not in gitignore:
|
||||
gitignore.write("epoch_*\n")
|
||||
elif args.output_dir is not None:
|
||||
os.makedirs(args.output_dir, exist_ok=True)
|
||||
|
||||
# Make one log on every process with the configuration for debugging.
|
||||
logging.basicConfig(
|
||||
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
|
||||
datefmt="%m/%d/%Y %H:%M:%S",
|
||||
level=logging.INFO,
|
||||
)
|
||||
# Setup logging, we only want one process per machine to log things on the screen.
|
||||
logger.setLevel(logging.INFO if jax.process_index() == 0 else logging.ERROR)
|
||||
if jax.process_index() == 0:
|
||||
transformers.utils.logging.set_verbosity_info()
|
||||
else:
|
||||
transformers.utils.logging.set_verbosity_error()
|
||||
|
||||
# Load the tokenizer and add the placeholder token as a additional special token
|
||||
if args.tokenizer_name:
|
||||
tokenizer = CLIPTokenizer.from_pretrained(args.tokenizer_name)
|
||||
elif args.pretrained_model_name_or_path:
|
||||
tokenizer = CLIPTokenizer.from_pretrained(args.pretrained_model_name_or_path, subfolder="tokenizer")
|
||||
|
||||
# Add the placeholder token in tokenizer
|
||||
num_added_tokens = tokenizer.add_tokens(args.placeholder_token)
|
||||
if num_added_tokens == 0:
|
||||
raise ValueError(
|
||||
f"The tokenizer already contains the token {args.placeholder_token}. Please pass a different"
|
||||
" `placeholder_token` that is not already in the tokenizer."
|
||||
)
|
||||
|
||||
# Convert the initializer_token, placeholder_token to ids
|
||||
token_ids = tokenizer.encode(args.initializer_token, add_special_tokens=False)
|
||||
# Check if initializer_token is a single token or a sequence of tokens
|
||||
if len(token_ids) > 1:
|
||||
raise ValueError("The initializer token must be a single token.")
|
||||
|
||||
initializer_token_id = token_ids[0]
|
||||
placeholder_token_id = tokenizer.convert_tokens_to_ids(args.placeholder_token)
|
||||
|
||||
# Load models and create wrapper for stable diffusion
|
||||
text_encoder = FlaxCLIPTextModel.from_pretrained("openai/clip-vit-large-patch14")
|
||||
vae, vae_params = FlaxAutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae")
|
||||
unet, unet_params = FlaxUNet2DConditionModel.from_pretrained(args.pretrained_model_name_or_path, subfolder="unet")
|
||||
|
||||
# Create sampling rng
|
||||
rng = jax.random.PRNGKey(args.seed)
|
||||
rng, _ = jax.random.split(rng)
|
||||
# Resize the token embeddings as we are adding new special tokens to the tokenizer
|
||||
text_encoder = resize_token_embeddings(
|
||||
text_encoder, len(tokenizer), initializer_token_id, placeholder_token_id, rng
|
||||
)
|
||||
original_token_embeds = text_encoder.params["text_model"]["embeddings"]["token_embedding"]["embedding"]
|
||||
|
||||
train_dataset = TextualInversionDataset(
|
||||
data_root=args.train_data_dir,
|
||||
tokenizer=tokenizer,
|
||||
size=args.resolution,
|
||||
placeholder_token=args.placeholder_token,
|
||||
repeats=args.repeats,
|
||||
learnable_property=args.learnable_property,
|
||||
center_crop=args.center_crop,
|
||||
set="train",
|
||||
)
|
||||
|
||||
def collate_fn(examples):
|
||||
pixel_values = torch.stack([example["pixel_values"] for example in examples])
|
||||
input_ids = torch.stack([example["input_ids"] for example in examples])
|
||||
|
||||
batch = {"pixel_values": pixel_values, "input_ids": input_ids}
|
||||
batch = {k: v.numpy() for k, v in batch.items()}
|
||||
|
||||
return batch
|
||||
|
||||
total_train_batch_size = args.train_batch_size * jax.local_device_count()
|
||||
train_dataloader = torch.utils.data.DataLoader(
|
||||
train_dataset, batch_size=total_train_batch_size, shuffle=True, drop_last=True, collate_fn=collate_fn
|
||||
)
|
||||
|
||||
# Optimization
|
||||
if args.scale_lr:
|
||||
args.learning_rate = args.learning_rate * total_train_batch_size
|
||||
|
||||
constant_scheduler = optax.constant_schedule(args.learning_rate)
|
||||
|
||||
optimizer = optax.adamw(
|
||||
learning_rate=constant_scheduler,
|
||||
b1=args.adam_beta1,
|
||||
b2=args.adam_beta2,
|
||||
eps=args.adam_epsilon,
|
||||
weight_decay=args.adam_weight_decay,
|
||||
)
|
||||
|
||||
def create_mask(params, label_fn):
|
||||
def _map(params, mask, label_fn):
|
||||
for k in params:
|
||||
if label_fn(k):
|
||||
mask[k] = "token_embedding"
|
||||
else:
|
||||
if isinstance(params[k], dict):
|
||||
mask[k] = {}
|
||||
_map(params[k], mask[k], label_fn)
|
||||
else:
|
||||
mask[k] = "zero"
|
||||
|
||||
mask = {}
|
||||
_map(params, mask, label_fn)
|
||||
return mask
|
||||
|
||||
def zero_grads():
|
||||
# from https://github.com/deepmind/optax/issues/159#issuecomment-896459491
|
||||
def init_fn(_):
|
||||
return ()
|
||||
|
||||
def update_fn(updates, state, params=None):
|
||||
return jax.tree_util.tree_map(jnp.zeros_like, updates), ()
|
||||
|
||||
return optax.GradientTransformation(init_fn, update_fn)
|
||||
|
||||
# Zero out gradients of layers other than the token embedding layer
|
||||
tx = optax.multi_transform(
|
||||
{"token_embedding": optimizer, "zero": zero_grads()},
|
||||
create_mask(text_encoder.params, lambda s: s == "token_embedding"),
|
||||
)
|
||||
|
||||
state = train_state.TrainState.create(apply_fn=text_encoder.__call__, params=text_encoder.params, tx=tx)
|
||||
|
||||
noise_scheduler = FlaxDDPMScheduler(
|
||||
beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000
|
||||
)
|
||||
|
||||
# Initialize our training
|
||||
train_rngs = jax.random.split(rng, jax.local_device_count())
|
||||
|
||||
# Define gradient train step fn
|
||||
def train_step(state, vae_params, unet_params, batch, train_rng):
|
||||
dropout_rng, sample_rng, new_train_rng = jax.random.split(train_rng, 3)
|
||||
|
||||
def compute_loss(params):
|
||||
vae_outputs = vae.apply(
|
||||
{"params": vae_params}, batch["pixel_values"], deterministic=True, method=vae.encode
|
||||
)
|
||||
latents = vae_outputs.latent_dist.sample(sample_rng)
|
||||
# (NHWC) -> (NCHW)
|
||||
latents = jnp.transpose(latents, (0, 3, 1, 2))
|
||||
latents = latents * 0.18215
|
||||
|
||||
noise_rng, timestep_rng = jax.random.split(sample_rng)
|
||||
noise = jax.random.normal(noise_rng, latents.shape)
|
||||
bsz = latents.shape[0]
|
||||
timesteps = jax.random.randint(
|
||||
timestep_rng,
|
||||
(bsz,),
|
||||
0,
|
||||
noise_scheduler.config.num_train_timesteps,
|
||||
)
|
||||
noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
|
||||
encoder_hidden_states = state.apply_fn(
|
||||
batch["input_ids"], params=params, dropout_rng=dropout_rng, train=True
|
||||
)[0]
|
||||
unet_outputs = unet.apply(
|
||||
{"params": unet_params}, noisy_latents, timesteps, encoder_hidden_states, train=False
|
||||
)
|
||||
noise_pred = unet_outputs.sample
|
||||
loss = (noise - noise_pred) ** 2
|
||||
loss = loss.mean()
|
||||
|
||||
return loss
|
||||
|
||||
grad_fn = jax.value_and_grad(compute_loss)
|
||||
loss, grad = grad_fn(state.params)
|
||||
grad = jax.lax.pmean(grad, "batch")
|
||||
new_state = state.apply_gradients(grads=grad)
|
||||
|
||||
# Keep the token embeddings fixed except the newly added embeddings for the concept,
|
||||
# as we only want to optimize the concept embeddings
|
||||
token_embeds = original_token_embeds.at[placeholder_token_id].set(
|
||||
new_state.params["text_model"]["embeddings"]["token_embedding"]["embedding"][placeholder_token_id]
|
||||
)
|
||||
new_state.params["text_model"]["embeddings"]["token_embedding"]["embedding"] = token_embeds
|
||||
|
||||
metrics = {"loss": loss}
|
||||
metrics = jax.lax.pmean(metrics, axis_name="batch")
|
||||
return new_state, metrics, new_train_rng
|
||||
|
||||
# Create parallel version of the train and eval step
|
||||
p_train_step = jax.pmap(train_step, "batch", donate_argnums=(0,))
|
||||
|
||||
# Replicate the train state on each device
|
||||
state = jax_utils.replicate(state)
|
||||
vae_params = jax_utils.replicate(vae_params)
|
||||
unet_params = jax_utils.replicate(unet_params)
|
||||
|
||||
# Train!
|
||||
num_update_steps_per_epoch = math.ceil(len(train_dataloader))
|
||||
|
||||
# Scheduler and math around the number of training steps.
|
||||
if args.max_train_steps is None:
|
||||
args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
|
||||
|
||||
args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
|
||||
|
||||
logger.info("***** Running training *****")
|
||||
logger.info(f" Num examples = {len(train_dataset)}")
|
||||
logger.info(f" Num Epochs = {args.num_train_epochs}")
|
||||
logger.info(f" Instantaneous batch size per device = {args.train_batch_size}")
|
||||
logger.info(f" Total train batch size (w. parallel & distributed) = {total_train_batch_size}")
|
||||
logger.info(f" Total optimization steps = {args.max_train_steps}")
|
||||
|
||||
global_step = 0
|
||||
|
||||
epochs = tqdm(range(args.num_train_epochs), desc=f"Epoch ... (1/{args.num_train_epochs})", position=0)
|
||||
for epoch in epochs:
|
||||
# ======================== Training ================================
|
||||
|
||||
train_metrics = []
|
||||
|
||||
steps_per_epoch = len(train_dataset) // total_train_batch_size
|
||||
train_step_progress_bar = tqdm(total=steps_per_epoch, desc="Training...", position=1, leave=False)
|
||||
# train
|
||||
for batch in train_dataloader:
|
||||
batch = shard(batch)
|
||||
state, train_metric, train_rngs = p_train_step(state, vae_params, unet_params, batch, train_rngs)
|
||||
train_metrics.append(train_metric)
|
||||
|
||||
train_step_progress_bar.update(1)
|
||||
global_step += 1
|
||||
|
||||
if global_step >= args.max_train_steps:
|
||||
break
|
||||
|
||||
train_metric = jax_utils.unreplicate(train_metric)
|
||||
|
||||
train_step_progress_bar.close()
|
||||
epochs.write(f"Epoch... ({epoch + 1}/{args.num_train_epochs} | Loss: {train_metric['loss']})")
|
||||
|
||||
# Create the pipeline using using the trained modules and save it.
|
||||
if jax.process_index() == 0:
|
||||
scheduler = FlaxPNDMScheduler(
|
||||
beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", skip_prk_steps=True
|
||||
)
|
||||
safety_checker = FlaxStableDiffusionSafetyChecker.from_pretrained(
|
||||
"CompVis/stable-diffusion-safety-checker", from_pt=True
|
||||
)
|
||||
pipeline = FlaxStableDiffusionPipeline(
|
||||
text_encoder=text_encoder,
|
||||
vae=vae,
|
||||
unet=unet,
|
||||
tokenizer=tokenizer,
|
||||
scheduler=scheduler,
|
||||
safety_checker=safety_checker,
|
||||
feature_extractor=CLIPFeatureExtractor.from_pretrained("openai/clip-vit-base-patch32"),
|
||||
)
|
||||
|
||||
pipeline.save_pretrained(
|
||||
args.output_dir,
|
||||
params={
|
||||
"text_encoder": get_params_to_save(state.params),
|
||||
"vae": get_params_to_save(vae_params),
|
||||
"unet": get_params_to_save(unet_params),
|
||||
"safety_checker": safety_checker.params,
|
||||
},
|
||||
)
|
||||
|
||||
# Also save the newly trained embeddings
|
||||
learned_embeds = get_params_to_save(state.params)["text_model"]["embeddings"]["token_embedding"]["embedding"][
|
||||
placeholder_token_id
|
||||
]
|
||||
learned_embeds_dict = {args.placeholder_token: learned_embeds}
|
||||
jnp.save(os.path.join(args.output_dir, "learned_embeds.npy"), learned_embeds_dict)
|
||||
|
||||
if args.push_to_hub:
|
||||
repo.push_to_hub(commit_message="End of training", blocking=False, auto_lfs_prune=True)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -7,7 +7,7 @@ Creating a training image set is [described in a different document](https://hug
|
||||
Before running the scripts, make sure to install the library's training dependencies:
|
||||
|
||||
```bash
|
||||
pip install diffusers[training] accelerate datasets tensorboard
|
||||
pip install diffusers[training] accelerate datasets
|
||||
```
|
||||
|
||||
And initialize an [🤗Accelerate](https://github.com/huggingface/accelerate/) environment with:
|
||||
|
||||
@@ -1,8 +1,6 @@
|
||||
import argparse
|
||||
import math
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
@@ -11,9 +9,9 @@ from accelerate import Accelerator
|
||||
from accelerate.logging import get_logger
|
||||
from datasets import load_dataset
|
||||
from diffusers import DDPMPipeline, DDPMScheduler, UNet2DModel
|
||||
from diffusers.hub_utils import init_git_repo
|
||||
from diffusers.optimization import get_scheduler
|
||||
from diffusers.training_utils import EMAModel
|
||||
from huggingface_hub import HfFolder, Repository, whoami
|
||||
from torchvision.transforms import (
|
||||
CenterCrop,
|
||||
Compose,
|
||||
@@ -29,197 +27,6 @@ from tqdm.auto import tqdm
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
def _extract_into_tensor(arr, timesteps, broadcast_shape):
|
||||
"""
|
||||
Extract values from a 1-D numpy array for a batch of indices.
|
||||
|
||||
:param arr: the 1-D numpy array.
|
||||
:param timesteps: a tensor of indices into the array to extract.
|
||||
:param broadcast_shape: a larger shape of K dimensions with the batch
|
||||
dimension equal to the length of timesteps.
|
||||
:return: a tensor of shape [batch_size, 1, ...] where the shape has K dims.
|
||||
"""
|
||||
if not isinstance(arr, torch.Tensor):
|
||||
arr = torch.from_numpy(arr)
|
||||
res = arr[timesteps].float().to(timesteps.device)
|
||||
while len(res.shape) < len(broadcast_shape):
|
||||
res = res[..., None]
|
||||
return res.expand(broadcast_shape)
|
||||
|
||||
|
||||
def parse_args():
|
||||
parser = argparse.ArgumentParser(description="Simple example of a training script.")
|
||||
parser.add_argument(
|
||||
"--dataset_name",
|
||||
type=str,
|
||||
default=None,
|
||||
help=(
|
||||
"The name of the Dataset (from the HuggingFace hub) to train on (could be your own, possibly private,"
|
||||
" dataset). It can also be a path pointing to a local copy of a dataset in your filesystem,"
|
||||
" or to a folder containing files that HF Datasets can understand."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--dataset_config_name",
|
||||
type=str,
|
||||
default=None,
|
||||
help="The config of the Dataset, leave as None if there's only one config.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--train_data_dir",
|
||||
type=str,
|
||||
default=None,
|
||||
help=(
|
||||
"A folder containing the training data. Folder contents must follow the structure described in"
|
||||
" https://huggingface.co/docs/datasets/image_dataset#imagefolder. In particular, a `metadata.jsonl` file"
|
||||
" must exist to provide the captions for the images. Ignored if `dataset_name` is specified."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output_dir",
|
||||
type=str,
|
||||
default="ddpm-model-64",
|
||||
help="The output directory where the model predictions and checkpoints will be written.",
|
||||
)
|
||||
parser.add_argument("--overwrite_output_dir", action="store_true")
|
||||
parser.add_argument(
|
||||
"--cache_dir",
|
||||
type=str,
|
||||
default=None,
|
||||
help="The directory where the downloaded models and datasets will be stored.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--resolution",
|
||||
type=int,
|
||||
default=64,
|
||||
help=(
|
||||
"The resolution for input images, all the images in the train/validation dataset will be resized to this"
|
||||
" resolution"
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--train_batch_size", type=int, default=16, help="Batch size (per device) for the training dataloader."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--eval_batch_size", type=int, default=16, help="The number of images to generate for evaluation."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--dataloader_num_workers",
|
||||
type=int,
|
||||
default=0,
|
||||
help=(
|
||||
"The number of subprocesses to use for data loading. 0 means that the data will be loaded in the main"
|
||||
" process."
|
||||
),
|
||||
)
|
||||
parser.add_argument("--num_epochs", type=int, default=100)
|
||||
parser.add_argument("--save_images_epochs", type=int, default=10, help="How often to save images during training.")
|
||||
parser.add_argument(
|
||||
"--save_model_epochs", type=int, default=10, help="How often to save the model during training."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--gradient_accumulation_steps",
|
||||
type=int,
|
||||
default=1,
|
||||
help="Number of updates steps to accumulate before performing a backward/update pass.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--learning_rate",
|
||||
type=float,
|
||||
default=1e-4,
|
||||
help="Initial learning rate (after the potential warmup period) to use.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--lr_scheduler",
|
||||
type=str,
|
||||
default="cosine",
|
||||
help=(
|
||||
'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
|
||||
' "constant", "constant_with_warmup"]'
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
|
||||
)
|
||||
parser.add_argument("--adam_beta1", type=float, default=0.95, help="The beta1 parameter for the Adam optimizer.")
|
||||
parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
|
||||
parser.add_argument(
|
||||
"--adam_weight_decay", type=float, default=1e-6, help="Weight decay magnitude for the Adam optimizer."
|
||||
)
|
||||
parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer.")
|
||||
parser.add_argument(
|
||||
"--use_ema",
|
||||
action="store_true",
|
||||
default=True,
|
||||
help="Whether to use Exponential Moving Average for the final model weights.",
|
||||
)
|
||||
parser.add_argument("--ema_inv_gamma", type=float, default=1.0, help="The inverse gamma value for the EMA decay.")
|
||||
parser.add_argument("--ema_power", type=float, default=3 / 4, help="The power value for the EMA decay.")
|
||||
parser.add_argument("--ema_max_decay", type=float, default=0.9999, help="The maximum decay magnitude for EMA.")
|
||||
parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
|
||||
parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
|
||||
parser.add_argument(
|
||||
"--hub_model_id",
|
||||
type=str,
|
||||
default=None,
|
||||
help="The name of the repository to keep in sync with the local `output_dir`.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--hub_private_repo", action="store_true", help="Whether or not to create a private repository."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--logging_dir",
|
||||
type=str,
|
||||
default="logs",
|
||||
help=(
|
||||
"[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
|
||||
" *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
|
||||
),
|
||||
)
|
||||
parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
|
||||
parser.add_argument(
|
||||
"--mixed_precision",
|
||||
type=str,
|
||||
default="no",
|
||||
choices=["no", "fp16", "bf16"],
|
||||
help=(
|
||||
"Whether to use mixed precision. Choose"
|
||||
"between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10."
|
||||
"and an Nvidia Ampere GPU."
|
||||
),
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--predict_mode",
|
||||
type=str,
|
||||
default="eps",
|
||||
help="What the model should predict. 'eps' to predict error, 'x0' to directly predict reconstruction",
|
||||
)
|
||||
|
||||
parser.add_argument("--ddpm_num_steps", type=int, default=1000)
|
||||
parser.add_argument("--ddpm_beta_schedule", type=str, default="linear")
|
||||
|
||||
args = parser.parse_args()
|
||||
env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
|
||||
if env_local_rank != -1 and env_local_rank != args.local_rank:
|
||||
args.local_rank = env_local_rank
|
||||
|
||||
if args.dataset_name is None and args.train_data_dir is None:
|
||||
raise ValueError("You must specify either a dataset name from the hub or a train data directory.")
|
||||
|
||||
return args
|
||||
|
||||
|
||||
def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: Optional[str] = None):
|
||||
if token is None:
|
||||
token = HfFolder.get_token()
|
||||
if organization is None:
|
||||
username = whoami(token)["name"]
|
||||
return f"{username}/{model_id}"
|
||||
else:
|
||||
return f"{organization}/{model_id}"
|
||||
|
||||
|
||||
def main(args):
|
||||
logging_dir = os.path.join(args.output_dir, args.logging_dir)
|
||||
accelerator = Accelerator(
|
||||
@@ -252,7 +59,7 @@ def main(args):
|
||||
"UpBlock2D",
|
||||
),
|
||||
)
|
||||
noise_scheduler = DDPMScheduler(num_train_timesteps=args.ddpm_num_steps, beta_schedule=args.ddpm_beta_schedule)
|
||||
noise_scheduler = DDPMScheduler(num_train_timesteps=1000)
|
||||
optimizer = torch.optim.AdamW(
|
||||
model.parameters(),
|
||||
lr=args.learning_rate,
|
||||
@@ -285,12 +92,8 @@ def main(args):
|
||||
images = [augmentations(image.convert("RGB")) for image in examples["image"]]
|
||||
return {"input": images}
|
||||
|
||||
logger.info(f"Dataset size: {len(dataset)}")
|
||||
|
||||
dataset.set_transform(transforms)
|
||||
train_dataloader = torch.utils.data.DataLoader(
|
||||
dataset, batch_size=args.train_batch_size, shuffle=True, num_workers=args.dataloader_num_workers
|
||||
)
|
||||
train_dataloader = torch.utils.data.DataLoader(dataset, batch_size=args.train_batch_size, shuffle=True)
|
||||
|
||||
lr_scheduler = get_scheduler(
|
||||
args.lr_scheduler,
|
||||
@@ -307,22 +110,8 @@ def main(args):
|
||||
|
||||
ema_model = EMAModel(model, inv_gamma=args.ema_inv_gamma, power=args.ema_power, max_value=args.ema_max_decay)
|
||||
|
||||
# Handle the repository creation
|
||||
if accelerator.is_main_process:
|
||||
if args.push_to_hub:
|
||||
if args.hub_model_id is None:
|
||||
repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token)
|
||||
else:
|
||||
repo_name = args.hub_model_id
|
||||
repo = Repository(args.output_dir, clone_from=repo_name)
|
||||
|
||||
with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore:
|
||||
if "step_*" not in gitignore:
|
||||
gitignore.write("step_*\n")
|
||||
if "epoch_*" not in gitignore:
|
||||
gitignore.write("epoch_*\n")
|
||||
elif args.output_dir is not None:
|
||||
os.makedirs(args.output_dir, exist_ok=True)
|
||||
if args.push_to_hub:
|
||||
repo = init_git_repo(args, at_init=True)
|
||||
|
||||
if accelerator.is_main_process:
|
||||
run = os.path.split(__file__)[-1].split(".")[0]
|
||||
@@ -349,20 +138,8 @@ def main(args):
|
||||
|
||||
with accelerator.accumulate(model):
|
||||
# Predict the noise residual
|
||||
model_output = model(noisy_images, timesteps).sample
|
||||
|
||||
if args.predict_mode == "eps":
|
||||
loss = F.mse_loss(model_output, noise) # this could have different weights!
|
||||
elif args.predict_mode == "x0":
|
||||
alpha_t = _extract_into_tensor(
|
||||
noise_scheduler.alphas_cumprod, timesteps, (clean_images.shape[0], 1, 1, 1)
|
||||
)
|
||||
snr_weights = alpha_t / (1 - alpha_t)
|
||||
loss = snr_weights * F.mse_loss(
|
||||
model_output, clean_images, reduction="none"
|
||||
) # use SNR weighting from distillation paper
|
||||
loss = loss.mean()
|
||||
|
||||
noise_pred = model(noisy_images, timesteps).sample
|
||||
loss = F.mse_loss(noise_pred, noise)
|
||||
accelerator.backward(loss)
|
||||
|
||||
if accelerator.sync_gradients:
|
||||
@@ -397,12 +174,7 @@ def main(args):
|
||||
|
||||
generator = torch.manual_seed(0)
|
||||
# run pipeline in inference (sample random noise and denoise)
|
||||
images = pipeline(
|
||||
generator=generator,
|
||||
batch_size=args.eval_batch_size,
|
||||
output_type="numpy",
|
||||
predict_epsilon=args.predict_mode == "eps",
|
||||
).images
|
||||
images = pipeline(generator=generator, batch_size=args.eval_batch_size, output_type="numpy").images
|
||||
|
||||
# denormalize the images and save to tensorboard
|
||||
images_processed = (images * 255).round().astype("uint8")
|
||||
@@ -412,14 +184,65 @@ def main(args):
|
||||
|
||||
if epoch % args.save_model_epochs == 0 or epoch == args.num_epochs - 1:
|
||||
# save the model
|
||||
pipeline.save_pretrained(args.output_dir)
|
||||
if args.push_to_hub:
|
||||
repo.push_to_hub(commit_message=f"Epoch {epoch}", blocking=False)
|
||||
else:
|
||||
pipeline.save_pretrained(args.output_dir)
|
||||
accelerator.wait_for_everyone()
|
||||
|
||||
accelerator.end_training()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = parse_args()
|
||||
parser = argparse.ArgumentParser(description="Simple example of a training script.")
|
||||
parser.add_argument("--local_rank", type=int, default=-1)
|
||||
parser.add_argument("--dataset_name", type=str, default=None)
|
||||
parser.add_argument("--dataset_config_name", type=str, default=None)
|
||||
parser.add_argument("--train_data_dir", type=str, default=None, help="A folder containing the training data.")
|
||||
parser.add_argument("--output_dir", type=str, default="ddpm-model-64")
|
||||
parser.add_argument("--overwrite_output_dir", action="store_true")
|
||||
parser.add_argument("--cache_dir", type=str, default=None)
|
||||
parser.add_argument("--resolution", type=int, default=64)
|
||||
parser.add_argument("--train_batch_size", type=int, default=16)
|
||||
parser.add_argument("--eval_batch_size", type=int, default=16)
|
||||
parser.add_argument("--num_epochs", type=int, default=100)
|
||||
parser.add_argument("--save_images_epochs", type=int, default=10)
|
||||
parser.add_argument("--save_model_epochs", type=int, default=10)
|
||||
parser.add_argument("--gradient_accumulation_steps", type=int, default=1)
|
||||
parser.add_argument("--learning_rate", type=float, default=1e-4)
|
||||
parser.add_argument("--lr_scheduler", type=str, default="cosine")
|
||||
parser.add_argument("--lr_warmup_steps", type=int, default=500)
|
||||
parser.add_argument("--adam_beta1", type=float, default=0.95)
|
||||
parser.add_argument("--adam_beta2", type=float, default=0.999)
|
||||
parser.add_argument("--adam_weight_decay", type=float, default=1e-6)
|
||||
parser.add_argument("--adam_epsilon", type=float, default=1e-08)
|
||||
parser.add_argument("--use_ema", action="store_true", default=True)
|
||||
parser.add_argument("--ema_inv_gamma", type=float, default=1.0)
|
||||
parser.add_argument("--ema_power", type=float, default=3 / 4)
|
||||
parser.add_argument("--ema_max_decay", type=float, default=0.9999)
|
||||
parser.add_argument("--push_to_hub", action="store_true")
|
||||
parser.add_argument("--hub_token", type=str, default=None)
|
||||
parser.add_argument("--hub_model_id", type=str, default=None)
|
||||
parser.add_argument("--hub_private_repo", action="store_true")
|
||||
parser.add_argument("--logging_dir", type=str, default="logs")
|
||||
parser.add_argument(
|
||||
"--mixed_precision",
|
||||
type=str,
|
||||
default="no",
|
||||
choices=["no", "fp16", "bf16"],
|
||||
help=(
|
||||
"Whether to use mixed precision. Choose"
|
||||
"between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10."
|
||||
"and an Nvidia Ampere GPU."
|
||||
),
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
|
||||
if env_local_rank != -1 and env_local_rank != args.local_rank:
|
||||
args.local_rank = env_local_rank
|
||||
|
||||
if args.dataset_name is None and args.train_data_dir is None:
|
||||
raise ValueError("You must specify either a dataset name from the hub or a train data directory.")
|
||||
|
||||
main(args)
|
||||
|
||||
@@ -1,339 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
import argparse
|
||||
import math
|
||||
import os
|
||||
from copy import deepcopy
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
|
||||
from audio_diffusion.models import DiffusionAttnUnet1D
|
||||
from diffusers import DanceDiffusionPipeline, IPNDMScheduler, UNet1DModel
|
||||
from diffusion import sampling
|
||||
|
||||
|
||||
MODELS_MAP = {
|
||||
"gwf-440k": {
|
||||
"url": "https://model-server.zqevans2.workers.dev/gwf-440k.ckpt",
|
||||
"sample_rate": 48000,
|
||||
"sample_size": 65536,
|
||||
},
|
||||
"jmann-small-190k": {
|
||||
"url": "https://model-server.zqevans2.workers.dev/jmann-small-190k.ckpt",
|
||||
"sample_rate": 48000,
|
||||
"sample_size": 65536,
|
||||
},
|
||||
"jmann-large-580k": {
|
||||
"url": "https://model-server.zqevans2.workers.dev/jmann-large-580k.ckpt",
|
||||
"sample_rate": 48000,
|
||||
"sample_size": 131072,
|
||||
},
|
||||
"maestro-uncond-150k": {
|
||||
"url": "https://model-server.zqevans2.workers.dev/maestro-uncond-150k.ckpt",
|
||||
"sample_rate": 16000,
|
||||
"sample_size": 65536,
|
||||
},
|
||||
"unlocked-uncond-250k": {
|
||||
"url": "https://model-server.zqevans2.workers.dev/unlocked-uncond-250k.ckpt",
|
||||
"sample_rate": 16000,
|
||||
"sample_size": 65536,
|
||||
},
|
||||
"honk-140k": {
|
||||
"url": "https://model-server.zqevans2.workers.dev/honk-140k.ckpt",
|
||||
"sample_rate": 16000,
|
||||
"sample_size": 65536,
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
def alpha_sigma_to_t(alpha, sigma):
|
||||
"""Returns a timestep, given the scaling factors for the clean image and for
|
||||
the noise."""
|
||||
return torch.atan2(sigma, alpha) / math.pi * 2
|
||||
|
||||
|
||||
def get_crash_schedule(t):
|
||||
sigma = torch.sin(t * math.pi / 2) ** 2
|
||||
alpha = (1 - sigma**2) ** 0.5
|
||||
return alpha_sigma_to_t(alpha, sigma)
|
||||
|
||||
|
||||
class Object(object):
|
||||
pass
|
||||
|
||||
|
||||
class DiffusionUncond(nn.Module):
|
||||
def __init__(self, global_args):
|
||||
super().__init__()
|
||||
|
||||
self.diffusion = DiffusionAttnUnet1D(global_args, n_attn_layers=4)
|
||||
self.diffusion_ema = deepcopy(self.diffusion)
|
||||
self.rng = torch.quasirandom.SobolEngine(1, scramble=True)
|
||||
|
||||
|
||||
def download(model_name):
|
||||
url = MODELS_MAP[model_name]["url"]
|
||||
os.system(f"wget {url} ./")
|
||||
|
||||
return f"./{model_name}.ckpt"
|
||||
|
||||
|
||||
DOWN_NUM_TO_LAYER = {
|
||||
"1": "resnets.0",
|
||||
"2": "attentions.0",
|
||||
"3": "resnets.1",
|
||||
"4": "attentions.1",
|
||||
"5": "resnets.2",
|
||||
"6": "attentions.2",
|
||||
}
|
||||
UP_NUM_TO_LAYER = {
|
||||
"8": "resnets.0",
|
||||
"9": "attentions.0",
|
||||
"10": "resnets.1",
|
||||
"11": "attentions.1",
|
||||
"12": "resnets.2",
|
||||
"13": "attentions.2",
|
||||
}
|
||||
MID_NUM_TO_LAYER = {
|
||||
"1": "resnets.0",
|
||||
"2": "attentions.0",
|
||||
"3": "resnets.1",
|
||||
"4": "attentions.1",
|
||||
"5": "resnets.2",
|
||||
"6": "attentions.2",
|
||||
"8": "resnets.3",
|
||||
"9": "attentions.3",
|
||||
"10": "resnets.4",
|
||||
"11": "attentions.4",
|
||||
"12": "resnets.5",
|
||||
"13": "attentions.5",
|
||||
}
|
||||
DEPTH_0_TO_LAYER = {
|
||||
"0": "resnets.0",
|
||||
"1": "resnets.1",
|
||||
"2": "resnets.2",
|
||||
"4": "resnets.0",
|
||||
"5": "resnets.1",
|
||||
"6": "resnets.2",
|
||||
}
|
||||
|
||||
RES_CONV_MAP = {
|
||||
"skip": "conv_skip",
|
||||
"main.0": "conv_1",
|
||||
"main.1": "group_norm_1",
|
||||
"main.3": "conv_2",
|
||||
"main.4": "group_norm_2",
|
||||
}
|
||||
|
||||
ATTN_MAP = {
|
||||
"norm": "group_norm",
|
||||
"qkv_proj": ["query", "key", "value"],
|
||||
"out_proj": ["proj_attn"],
|
||||
}
|
||||
|
||||
|
||||
def convert_resconv_naming(name):
|
||||
if name.startswith("skip"):
|
||||
return name.replace("skip", RES_CONV_MAP["skip"])
|
||||
|
||||
# name has to be of format main.{digit}
|
||||
if not name.startswith("main."):
|
||||
raise ValueError(f"ResConvBlock error with {name}")
|
||||
|
||||
return name.replace(name[:6], RES_CONV_MAP[name[:6]])
|
||||
|
||||
|
||||
def convert_attn_naming(name):
|
||||
for key, value in ATTN_MAP.items():
|
||||
if name.startswith(key) and not isinstance(value, list):
|
||||
return name.replace(key, value)
|
||||
elif name.startswith(key):
|
||||
return [name.replace(key, v) for v in value]
|
||||
raise ValueError(f"Attn error with {name}")
|
||||
|
||||
|
||||
def rename(input_string, max_depth=13):
|
||||
string = input_string
|
||||
|
||||
if string.split(".")[0] == "timestep_embed":
|
||||
return string.replace("timestep_embed", "time_proj")
|
||||
|
||||
depth = 0
|
||||
if string.startswith("net.3."):
|
||||
depth += 1
|
||||
string = string[6:]
|
||||
elif string.startswith("net."):
|
||||
string = string[4:]
|
||||
|
||||
while string.startswith("main.7."):
|
||||
depth += 1
|
||||
string = string[7:]
|
||||
|
||||
if string.startswith("main."):
|
||||
string = string[5:]
|
||||
|
||||
# mid block
|
||||
if string[:2].isdigit():
|
||||
layer_num = string[:2]
|
||||
string_left = string[2:]
|
||||
else:
|
||||
layer_num = string[0]
|
||||
string_left = string[1:]
|
||||
|
||||
if depth == max_depth:
|
||||
new_layer = MID_NUM_TO_LAYER[layer_num]
|
||||
prefix = "mid_block"
|
||||
elif depth > 0 and int(layer_num) < 7:
|
||||
new_layer = DOWN_NUM_TO_LAYER[layer_num]
|
||||
prefix = f"down_blocks.{depth}"
|
||||
elif depth > 0 and int(layer_num) > 7:
|
||||
new_layer = UP_NUM_TO_LAYER[layer_num]
|
||||
prefix = f"up_blocks.{max_depth - depth - 1}"
|
||||
elif depth == 0:
|
||||
new_layer = DEPTH_0_TO_LAYER[layer_num]
|
||||
prefix = f"up_blocks.{max_depth - 1}" if int(layer_num) > 3 else "down_blocks.0"
|
||||
|
||||
if not string_left.startswith("."):
|
||||
raise ValueError(f"Naming error with {input_string} and string_left: {string_left}.")
|
||||
|
||||
string_left = string_left[1:]
|
||||
|
||||
if "resnets" in new_layer:
|
||||
string_left = convert_resconv_naming(string_left)
|
||||
elif "attentions" in new_layer:
|
||||
new_string_left = convert_attn_naming(string_left)
|
||||
string_left = new_string_left
|
||||
|
||||
if not isinstance(string_left, list):
|
||||
new_string = prefix + "." + new_layer + "." + string_left
|
||||
else:
|
||||
new_string = [prefix + "." + new_layer + "." + s for s in string_left]
|
||||
return new_string
|
||||
|
||||
|
||||
def rename_orig_weights(state_dict):
|
||||
new_state_dict = {}
|
||||
for k, v in state_dict.items():
|
||||
if k.endswith("kernel"):
|
||||
# up- and downsample layers, don't have trainable weights
|
||||
continue
|
||||
|
||||
new_k = rename(k)
|
||||
|
||||
# check if we need to transform from Conv => Linear for attention
|
||||
if isinstance(new_k, list):
|
||||
new_state_dict = transform_conv_attns(new_state_dict, new_k, v)
|
||||
else:
|
||||
new_state_dict[new_k] = v
|
||||
|
||||
return new_state_dict
|
||||
|
||||
|
||||
def transform_conv_attns(new_state_dict, new_k, v):
|
||||
if len(new_k) == 1:
|
||||
if len(v.shape) == 3:
|
||||
# weight
|
||||
new_state_dict[new_k[0]] = v[:, :, 0]
|
||||
else:
|
||||
# bias
|
||||
new_state_dict[new_k[0]] = v
|
||||
else:
|
||||
# qkv matrices
|
||||
trippled_shape = v.shape[0]
|
||||
single_shape = trippled_shape // 3
|
||||
for i in range(3):
|
||||
if len(v.shape) == 3:
|
||||
new_state_dict[new_k[i]] = v[i * single_shape : (i + 1) * single_shape, :, 0]
|
||||
else:
|
||||
new_state_dict[new_k[i]] = v[i * single_shape : (i + 1) * single_shape]
|
||||
return new_state_dict
|
||||
|
||||
|
||||
def main(args):
|
||||
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
||||
|
||||
model_name = args.model_path.split("/")[-1].split(".")[0]
|
||||
if not os.path.isfile(args.model_path):
|
||||
assert (
|
||||
model_name == args.model_path
|
||||
), f"Make sure to provide one of the official model names {MODELS_MAP.keys()}"
|
||||
args.model_path = download(model_name)
|
||||
|
||||
sample_rate = MODELS_MAP[model_name]["sample_rate"]
|
||||
sample_size = MODELS_MAP[model_name]["sample_size"]
|
||||
|
||||
config = Object()
|
||||
config.sample_size = sample_size
|
||||
config.sample_rate = sample_rate
|
||||
config.latent_dim = 0
|
||||
|
||||
diffusers_model = UNet1DModel(sample_size=sample_size, sample_rate=sample_rate)
|
||||
diffusers_state_dict = diffusers_model.state_dict()
|
||||
|
||||
orig_model = DiffusionUncond(config)
|
||||
orig_model.load_state_dict(torch.load(args.model_path, map_location=device)["state_dict"])
|
||||
orig_model = orig_model.diffusion_ema.eval()
|
||||
orig_model_state_dict = orig_model.state_dict()
|
||||
renamed_state_dict = rename_orig_weights(orig_model_state_dict)
|
||||
|
||||
renamed_minus_diffusers = set(renamed_state_dict.keys()) - set(diffusers_state_dict.keys())
|
||||
diffusers_minus_renamed = set(diffusers_state_dict.keys()) - set(renamed_state_dict.keys())
|
||||
|
||||
assert len(renamed_minus_diffusers) == 0, f"Problem with {renamed_minus_diffusers}"
|
||||
assert all(k.endswith("kernel") for k in list(diffusers_minus_renamed)), f"Problem with {diffusers_minus_renamed}"
|
||||
|
||||
for key, value in renamed_state_dict.items():
|
||||
assert (
|
||||
diffusers_state_dict[key].squeeze().shape == value.squeeze().shape
|
||||
), f"Shape for {key} doesn't match. Diffusers: {diffusers_state_dict[key].shape} vs. {value.shape}"
|
||||
if key == "time_proj.weight":
|
||||
value = value.squeeze()
|
||||
|
||||
diffusers_state_dict[key] = value
|
||||
|
||||
diffusers_model.load_state_dict(diffusers_state_dict)
|
||||
|
||||
steps = 100
|
||||
seed = 33
|
||||
|
||||
diffusers_scheduler = IPNDMScheduler(num_train_timesteps=steps)
|
||||
|
||||
generator = torch.manual_seed(seed)
|
||||
noise = torch.randn([1, 2, config.sample_size], generator=generator).to(device)
|
||||
|
||||
t = torch.linspace(1, 0, steps + 1, device=device)[:-1]
|
||||
step_list = get_crash_schedule(t)
|
||||
|
||||
pipe = DanceDiffusionPipeline(unet=diffusers_model, scheduler=diffusers_scheduler)
|
||||
|
||||
generator = torch.manual_seed(33)
|
||||
audio = pipe(num_inference_steps=steps, generator=generator).audios
|
||||
|
||||
generated = sampling.iplms_sample(orig_model, noise, step_list, {})
|
||||
generated = generated.clamp(-1, 1)
|
||||
|
||||
diff_sum = (generated - audio).abs().sum()
|
||||
diff_max = (generated - audio).abs().max()
|
||||
|
||||
if args.save:
|
||||
pipe.save_pretrained(args.checkpoint_path)
|
||||
|
||||
print("Diff sum", diff_sum)
|
||||
print("Diff max", diff_max)
|
||||
|
||||
assert diff_max < 1e-3, f"Diff max: {diff_max} is too much :-/"
|
||||
|
||||
print(f"Conversion for {model_name} successful!")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser()
|
||||
|
||||
parser.add_argument("--model_path", default=None, type=str, required=True, help="Path to the model to convert.")
|
||||
parser.add_argument(
|
||||
"--save", default=True, type=bool, required=False, help="Whether to save the converted model or not."
|
||||
)
|
||||
parser.add_argument("--checkpoint_path", default=None, type=str, required=True, help="Path to the output model.")
|
||||
args = parser.parse_args()
|
||||
|
||||
main(args)
|
||||
@@ -285,34 +285,15 @@ def create_ldm_bert_config(original_config):
|
||||
return config
|
||||
|
||||
|
||||
def convert_ldm_unet_checkpoint(checkpoint, config, path=None, extract_ema=False):
|
||||
def convert_ldm_unet_checkpoint(checkpoint, config):
|
||||
"""
|
||||
Takes a state dict and a config, and returns a converted checkpoint.
|
||||
"""
|
||||
|
||||
# extract state_dict for UNet
|
||||
unet_state_dict = {}
|
||||
keys = list(checkpoint.keys())
|
||||
|
||||
unet_key = "model.diffusion_model."
|
||||
# at least a 100 parameters have to start with `model_ema` in order for the checkpoint to be EMA
|
||||
if sum(k.startswith("model_ema") for k in keys) > 100:
|
||||
print(f"Checkpoint {path} has both EMA and non-EMA weights.")
|
||||
if extract_ema:
|
||||
print(
|
||||
"In this conversion only the EMA weights are extracted. If you want to instead extract the non-EMA"
|
||||
" weights (useful to continue fine-tuning), please make sure to remove the `--extract_ema` flag."
|
||||
)
|
||||
for key in keys:
|
||||
if key.startswith("model.diffusion_model"):
|
||||
flat_ema_key = "model_ema." + "".join(key.split(".")[1:])
|
||||
unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(flat_ema_key)
|
||||
else:
|
||||
print(
|
||||
"In this conversion only the non-EMA weights are extracted. If you want to instead extract the EMA"
|
||||
" weights (usually better for inference), please make sure to add the `--extract_ema` flag."
|
||||
)
|
||||
|
||||
keys = list(checkpoint.keys())
|
||||
for key in keys:
|
||||
if key.startswith(unet_key):
|
||||
unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(key)
|
||||
@@ -649,15 +630,6 @@ if __name__ == "__main__":
|
||||
type=str,
|
||||
help="Type of scheduler to use. Should be one of ['pndm', 'lms', 'ddim']",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--extract_ema",
|
||||
action="store_true",
|
||||
help=(
|
||||
"Only relevant for checkpoints that have both EMA and non-EMA weights. Whether to extract the EMA weights"
|
||||
" or not. Defaults to `False`. Add `--extract_ema` to extract the EMA weights. EMA weights usually yield"
|
||||
" higher quality images for inference. Non-EMA weights are usually better to continue fine-tuning."
|
||||
),
|
||||
)
|
||||
parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output model.")
|
||||
|
||||
args = parser.parse_args()
|
||||
@@ -669,9 +641,7 @@ if __name__ == "__main__":
|
||||
args.original_config_file = "./v1-inference.yaml"
|
||||
|
||||
original_config = OmegaConf.load(args.original_config_file)
|
||||
|
||||
checkpoint = torch.load(args.checkpoint_path)
|
||||
checkpoint = checkpoint["state_dict"]
|
||||
checkpoint = torch.load(args.checkpoint_path)["state_dict"]
|
||||
|
||||
num_train_timesteps = original_config.model.params.timesteps
|
||||
beta_start = original_config.model.params.linear_start
|
||||
@@ -699,9 +669,7 @@ if __name__ == "__main__":
|
||||
|
||||
# Convert the UNet2DConditionModel model.
|
||||
unet_config = create_unet_diffusers_config(original_config)
|
||||
converted_unet_checkpoint = convert_ldm_unet_checkpoint(
|
||||
checkpoint, unet_config, path=args.checkpoint_path, extract_ema=args.extract_ema
|
||||
)
|
||||
converted_unet_checkpoint = convert_ldm_unet_checkpoint(checkpoint, unet_config)
|
||||
|
||||
unet = UNet2DConditionModel(**unet_config)
|
||||
unet.load_state_dict(converted_unet_checkpoint)
|
||||
|
||||
@@ -21,7 +21,7 @@ import torch
|
||||
from torch.onnx import export
|
||||
|
||||
import onnx
|
||||
from diffusers import OnnxStableDiffusionPipeline, StableDiffusionPipeline
|
||||
from diffusers import StableDiffusionOnnxPipeline, StableDiffusionPipeline
|
||||
from diffusers.onnx_utils import OnnxRuntimeModel
|
||||
from packaging import version
|
||||
|
||||
@@ -69,15 +69,8 @@ def onnx_export(
|
||||
|
||||
|
||||
@torch.no_grad()
|
||||
def convert_models(model_path: str, output_path: str, opset: int, fp16: bool = False):
|
||||
dtype = torch.float16 if fp16 else torch.float32
|
||||
if fp16 and torch.cuda.is_available():
|
||||
device = "cuda"
|
||||
elif fp16 and not torch.cuda.is_available():
|
||||
raise ValueError("`float16` model export is only supported on GPUs with CUDA")
|
||||
else:
|
||||
device = "cpu"
|
||||
pipeline = StableDiffusionPipeline.from_pretrained(model_path, torch_dtype=dtype).to(device)
|
||||
def convert_models(model_path: str, output_path: str, opset: int):
|
||||
pipeline = StableDiffusionPipeline.from_pretrained(model_path)
|
||||
output_path = Path(output_path)
|
||||
|
||||
# TEXT ENCODER
|
||||
@@ -91,7 +84,7 @@ def convert_models(model_path: str, output_path: str, opset: int, fp16: bool = F
|
||||
onnx_export(
|
||||
pipeline.text_encoder,
|
||||
# casting to torch.int32 until the CLIP fix is released: https://github.com/huggingface/transformers/pull/18515/files
|
||||
model_args=(text_input.input_ids.to(device=device, dtype=torch.int32)),
|
||||
model_args=(text_input.input_ids.to(torch.int32)),
|
||||
output_path=output_path / "text_encoder" / "model.onnx",
|
||||
ordered_input_names=["input_ids"],
|
||||
output_names=["last_hidden_state", "pooler_output"],
|
||||
@@ -100,18 +93,12 @@ def convert_models(model_path: str, output_path: str, opset: int, fp16: bool = F
|
||||
},
|
||||
opset=opset,
|
||||
)
|
||||
del pipeline.text_encoder
|
||||
|
||||
# UNET
|
||||
unet_path = output_path / "unet" / "model.onnx"
|
||||
onnx_export(
|
||||
pipeline.unet,
|
||||
model_args=(
|
||||
torch.randn(2, pipeline.unet.in_channels, 64, 64).to(device=device, dtype=dtype),
|
||||
torch.LongTensor([0, 1]).to(device=device),
|
||||
torch.randn(2, 77, 768).to(device=device, dtype=dtype),
|
||||
False,
|
||||
),
|
||||
model_args=(torch.randn(2, 4, 64, 64), torch.LongTensor([0, 1]), torch.randn(2, 77, 768), False),
|
||||
output_path=unet_path,
|
||||
ordered_input_names=["sample", "timestep", "encoder_hidden_states", "return_dict"],
|
||||
output_names=["out_sample"], # has to be different from "sample" for correct tracing
|
||||
@@ -138,7 +125,6 @@ def convert_models(model_path: str, output_path: str, opset: int, fp16: bool = F
|
||||
location="weights.pb",
|
||||
convert_attribute=False,
|
||||
)
|
||||
del pipeline.unet
|
||||
|
||||
# VAE ENCODER
|
||||
vae_encoder = pipeline.vae
|
||||
@@ -146,7 +132,7 @@ def convert_models(model_path: str, output_path: str, opset: int, fp16: bool = F
|
||||
vae_encoder.forward = lambda sample, return_dict: vae_encoder.encode(sample, return_dict)[0].sample()
|
||||
onnx_export(
|
||||
vae_encoder,
|
||||
model_args=(torch.randn(1, 3, 512, 512).to(device=device, dtype=dtype), False),
|
||||
model_args=(torch.randn(1, 3, 512, 512), False),
|
||||
output_path=output_path / "vae_encoder" / "model.onnx",
|
||||
ordered_input_names=["sample", "return_dict"],
|
||||
output_names=["latent_sample"],
|
||||
@@ -162,7 +148,7 @@ def convert_models(model_path: str, output_path: str, opset: int, fp16: bool = F
|
||||
vae_decoder.forward = vae_encoder.decode
|
||||
onnx_export(
|
||||
vae_decoder,
|
||||
model_args=(torch.randn(1, 4, 64, 64).to(device=device, dtype=dtype), False),
|
||||
model_args=(torch.randn(1, 4, 64, 64), False),
|
||||
output_path=output_path / "vae_decoder" / "model.onnx",
|
||||
ordered_input_names=["latent_sample", "return_dict"],
|
||||
output_names=["sample"],
|
||||
@@ -171,30 +157,24 @@ def convert_models(model_path: str, output_path: str, opset: int, fp16: bool = F
|
||||
},
|
||||
opset=opset,
|
||||
)
|
||||
del pipeline.vae
|
||||
|
||||
# SAFETY CHECKER
|
||||
safety_checker = pipeline.safety_checker
|
||||
safety_checker.forward = safety_checker.forward_onnx
|
||||
onnx_export(
|
||||
pipeline.safety_checker,
|
||||
model_args=(
|
||||
torch.randn(1, 3, 224, 224).to(device=device, dtype=dtype),
|
||||
torch.randn(1, 512, 512, 3).to(device=device, dtype=dtype),
|
||||
),
|
||||
model_args=(torch.randn(1, 3, 224, 224), torch.randn(1, 512, 512, 3)),
|
||||
output_path=output_path / "safety_checker" / "model.onnx",
|
||||
ordered_input_names=["clip_input", "images"],
|
||||
output_names=["out_images", "has_nsfw_concepts"],
|
||||
dynamic_axes={
|
||||
"clip_input": {0: "batch", 1: "channels", 2: "height", 3: "width"},
|
||||
"images": {0: "batch", 1: "height", 2: "width", 3: "channels"},
|
||||
"images": {0: "batch", 1: "channels", 2: "height", 3: "width"},
|
||||
},
|
||||
opset=opset,
|
||||
)
|
||||
del pipeline.safety_checker
|
||||
|
||||
onnx_pipeline = OnnxStableDiffusionPipeline(
|
||||
vae_encoder=OnnxRuntimeModel.from_pretrained(output_path / "vae_encoder"),
|
||||
onnx_pipeline = StableDiffusionOnnxPipeline(
|
||||
vae_decoder=OnnxRuntimeModel.from_pretrained(output_path / "vae_decoder"),
|
||||
text_encoder=OnnxRuntimeModel.from_pretrained(output_path / "text_encoder"),
|
||||
tokenizer=pipeline.tokenizer,
|
||||
@@ -207,9 +187,7 @@ def convert_models(model_path: str, output_path: str, opset: int, fp16: bool = F
|
||||
onnx_pipeline.save_pretrained(output_path)
|
||||
print("ONNX pipeline saved to", output_path)
|
||||
|
||||
del pipeline
|
||||
del onnx_pipeline
|
||||
_ = OnnxStableDiffusionPipeline.from_pretrained(output_path, provider="CPUExecutionProvider")
|
||||
_ = StableDiffusionOnnxPipeline.from_pretrained(output_path, provider="CPUExecutionProvider")
|
||||
print("ONNX pipeline is loadable")
|
||||
|
||||
|
||||
@@ -231,8 +209,7 @@ if __name__ == "__main__":
|
||||
type=int,
|
||||
help="The version of the ONNX operator set to use.",
|
||||
)
|
||||
parser.add_argument("--fp16", action="store_true", default=False, help="Export the models in `float16` mode")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
convert_models(args.model_path, args.output_path, args.opset, args.fp16)
|
||||
convert_models(args.model_path, args.output_path, args.opset)
|
||||
|
||||
@@ -1,885 +0,0 @@
|
||||
"""
|
||||
This script ports models from VQ-diffusion (https://github.com/microsoft/VQ-Diffusion) to diffusers.
|
||||
|
||||
It currently only supports porting the ITHQ dataset.
|
||||
|
||||
ITHQ dataset:
|
||||
```sh
|
||||
# From the root directory of diffusers.
|
||||
|
||||
# Download the VQVAE checkpoint
|
||||
$ wget https://facevcstandard.blob.core.windows.net/v-zhictang/Improved-VQ-Diffusion_model_release/ithq_vqvae.pth?sv=2020-10-02&st=2022-05-30T15%3A17%3A18Z&se=2030-05-31T15%3A17%3A00Z&sr=b&sp=r&sig=1jVavHFPpUjDs%2FTO1V3PTezaNbPp2Nx8MxiWI7y6fEY%3D -O ithq_vqvae.pth
|
||||
|
||||
# Download the VQVAE config
|
||||
# NOTE that in VQ-diffusion the documented file is `configs/ithq.yaml` but the target class
|
||||
# `image_synthesis.modeling.codecs.image_codec.ema_vqvae.PatchVQVAE`
|
||||
# loads `OUTPUT/pretrained_model/taming_dvae/config.yaml`
|
||||
$ wget https://raw.githubusercontent.com/microsoft/VQ-Diffusion/main/OUTPUT/pretrained_model/taming_dvae/config.yaml -O ithq_vqvae.yaml
|
||||
|
||||
# Download the main model checkpoint
|
||||
$ wget https://facevcstandard.blob.core.windows.net/v-zhictang/Improved-VQ-Diffusion_model_release/ithq_learnable.pth?sv=2020-10-02&st=2022-05-30T10%3A22%3A06Z&se=2030-05-31T10%3A22%3A00Z&sr=b&sp=r&sig=GOE%2Bza02%2FPnGxYVOOPtwrTR4RA3%2F5NVgMxdW4kjaEZ8%3D -O ithq_learnable.pth
|
||||
|
||||
# Download the main model config
|
||||
$ wget https://raw.githubusercontent.com/microsoft/VQ-Diffusion/main/configs/ithq.yaml -O ithq.yaml
|
||||
|
||||
# run the convert script
|
||||
$ python ./scripts/convert_vq_diffusion_to_diffusers.py \
|
||||
--checkpoint_path ./ithq_learnable.pth \
|
||||
--original_config_file ./ithq.yaml \
|
||||
--vqvae_checkpoint_path ./ithq_vqvae.pth \
|
||||
--vqvae_original_config_file ./ithq_vqvae.yaml \
|
||||
--dump_path <path to save pre-trained `VQDiffusionPipeline`>
|
||||
```
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import tempfile
|
||||
|
||||
import torch
|
||||
|
||||
import yaml
|
||||
from accelerate import init_empty_weights, load_checkpoint_and_dispatch
|
||||
from diffusers import VQDiffusionPipeline, VQDiffusionScheduler, VQModel
|
||||
from diffusers.models.attention import Transformer2DModel
|
||||
from transformers import CLIPTextModel, CLIPTokenizer
|
||||
from yaml.loader import FullLoader
|
||||
|
||||
|
||||
try:
|
||||
from omegaconf import OmegaConf
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"OmegaConf is required to convert the VQ Diffusion checkpoints. Please install it with `pip install"
|
||||
" OmegaConf`."
|
||||
)
|
||||
|
||||
# vqvae model
|
||||
|
||||
PORTED_VQVAES = ["image_synthesis.modeling.codecs.image_codec.patch_vqgan.PatchVQGAN"]
|
||||
|
||||
|
||||
def vqvae_model_from_original_config(original_config):
|
||||
assert original_config.target in PORTED_VQVAES, f"{original_config.target} has not yet been ported to diffusers."
|
||||
|
||||
original_config = original_config.params
|
||||
|
||||
original_encoder_config = original_config.encoder_config.params
|
||||
original_decoder_config = original_config.decoder_config.params
|
||||
|
||||
in_channels = original_encoder_config.in_channels
|
||||
out_channels = original_decoder_config.out_ch
|
||||
|
||||
down_block_types = get_down_block_types(original_encoder_config)
|
||||
up_block_types = get_up_block_types(original_decoder_config)
|
||||
|
||||
assert original_encoder_config.ch == original_decoder_config.ch
|
||||
assert original_encoder_config.ch_mult == original_decoder_config.ch_mult
|
||||
block_out_channels = tuple(
|
||||
[original_encoder_config.ch * a_ch_mult for a_ch_mult in original_encoder_config.ch_mult]
|
||||
)
|
||||
|
||||
assert original_encoder_config.num_res_blocks == original_decoder_config.num_res_blocks
|
||||
layers_per_block = original_encoder_config.num_res_blocks
|
||||
|
||||
assert original_encoder_config.z_channels == original_decoder_config.z_channels
|
||||
latent_channels = original_encoder_config.z_channels
|
||||
|
||||
num_vq_embeddings = original_config.n_embed
|
||||
|
||||
# Hard coded value for ResnetBlock.GoupNorm(num_groups) in VQ-diffusion
|
||||
norm_num_groups = 32
|
||||
|
||||
e_dim = original_config.embed_dim
|
||||
|
||||
model = VQModel(
|
||||
in_channels=in_channels,
|
||||
out_channels=out_channels,
|
||||
down_block_types=down_block_types,
|
||||
up_block_types=up_block_types,
|
||||
block_out_channels=block_out_channels,
|
||||
layers_per_block=layers_per_block,
|
||||
latent_channels=latent_channels,
|
||||
num_vq_embeddings=num_vq_embeddings,
|
||||
norm_num_groups=norm_num_groups,
|
||||
vq_embed_dim=e_dim,
|
||||
)
|
||||
|
||||
return model
|
||||
|
||||
|
||||
def get_down_block_types(original_encoder_config):
|
||||
attn_resolutions = coerce_attn_resolutions(original_encoder_config.attn_resolutions)
|
||||
num_resolutions = len(original_encoder_config.ch_mult)
|
||||
resolution = coerce_resolution(original_encoder_config.resolution)
|
||||
|
||||
curr_res = resolution
|
||||
down_block_types = []
|
||||
|
||||
for _ in range(num_resolutions):
|
||||
if curr_res in attn_resolutions:
|
||||
down_block_type = "AttnDownEncoderBlock2D"
|
||||
else:
|
||||
down_block_type = "DownEncoderBlock2D"
|
||||
|
||||
down_block_types.append(down_block_type)
|
||||
|
||||
curr_res = [r // 2 for r in curr_res]
|
||||
|
||||
return down_block_types
|
||||
|
||||
|
||||
def get_up_block_types(original_decoder_config):
|
||||
attn_resolutions = coerce_attn_resolutions(original_decoder_config.attn_resolutions)
|
||||
num_resolutions = len(original_decoder_config.ch_mult)
|
||||
resolution = coerce_resolution(original_decoder_config.resolution)
|
||||
|
||||
curr_res = [r // 2 ** (num_resolutions - 1) for r in resolution]
|
||||
up_block_types = []
|
||||
|
||||
for _ in reversed(range(num_resolutions)):
|
||||
if curr_res in attn_resolutions:
|
||||
up_block_type = "AttnUpDecoderBlock2D"
|
||||
else:
|
||||
up_block_type = "UpDecoderBlock2D"
|
||||
|
||||
up_block_types.append(up_block_type)
|
||||
|
||||
curr_res = [r * 2 for r in curr_res]
|
||||
|
||||
return up_block_types
|
||||
|
||||
|
||||
def coerce_attn_resolutions(attn_resolutions):
|
||||
attn_resolutions = OmegaConf.to_object(attn_resolutions)
|
||||
attn_resolutions_ = []
|
||||
for ar in attn_resolutions:
|
||||
if isinstance(ar, (list, tuple)):
|
||||
attn_resolutions_.append(list(ar))
|
||||
else:
|
||||
attn_resolutions_.append([ar, ar])
|
||||
return attn_resolutions_
|
||||
|
||||
|
||||
def coerce_resolution(resolution):
|
||||
resolution = OmegaConf.to_object(resolution)
|
||||
if isinstance(resolution, int):
|
||||
resolution = [resolution, resolution] # H, W
|
||||
elif isinstance(resolution, (tuple, list)):
|
||||
resolution = list(resolution)
|
||||
else:
|
||||
raise ValueError("Unknown type of resolution:", resolution)
|
||||
return resolution
|
||||
|
||||
|
||||
# done vqvae model
|
||||
|
||||
# vqvae checkpoint
|
||||
|
||||
|
||||
def vqvae_original_checkpoint_to_diffusers_checkpoint(model, checkpoint):
|
||||
diffusers_checkpoint = {}
|
||||
|
||||
diffusers_checkpoint.update(vqvae_encoder_to_diffusers_checkpoint(model, checkpoint))
|
||||
|
||||
# quant_conv
|
||||
|
||||
diffusers_checkpoint.update(
|
||||
{
|
||||
"quant_conv.weight": checkpoint["quant_conv.weight"],
|
||||
"quant_conv.bias": checkpoint["quant_conv.bias"],
|
||||
}
|
||||
)
|
||||
|
||||
# quantize
|
||||
diffusers_checkpoint.update({"quantize.embedding.weight": checkpoint["quantize.embedding"]})
|
||||
|
||||
# post_quant_conv
|
||||
diffusers_checkpoint.update(
|
||||
{
|
||||
"post_quant_conv.weight": checkpoint["post_quant_conv.weight"],
|
||||
"post_quant_conv.bias": checkpoint["post_quant_conv.bias"],
|
||||
}
|
||||
)
|
||||
|
||||
# decoder
|
||||
diffusers_checkpoint.update(vqvae_decoder_to_diffusers_checkpoint(model, checkpoint))
|
||||
|
||||
return diffusers_checkpoint
|
||||
|
||||
|
||||
def vqvae_encoder_to_diffusers_checkpoint(model, checkpoint):
|
||||
diffusers_checkpoint = {}
|
||||
|
||||
# conv_in
|
||||
diffusers_checkpoint.update(
|
||||
{
|
||||
"encoder.conv_in.weight": checkpoint["encoder.conv_in.weight"],
|
||||
"encoder.conv_in.bias": checkpoint["encoder.conv_in.bias"],
|
||||
}
|
||||
)
|
||||
|
||||
# down_blocks
|
||||
for down_block_idx, down_block in enumerate(model.encoder.down_blocks):
|
||||
diffusers_down_block_prefix = f"encoder.down_blocks.{down_block_idx}"
|
||||
down_block_prefix = f"encoder.down.{down_block_idx}"
|
||||
|
||||
# resnets
|
||||
for resnet_idx, resnet in enumerate(down_block.resnets):
|
||||
diffusers_resnet_prefix = f"{diffusers_down_block_prefix}.resnets.{resnet_idx}"
|
||||
resnet_prefix = f"{down_block_prefix}.block.{resnet_idx}"
|
||||
|
||||
diffusers_checkpoint.update(
|
||||
vqvae_resnet_to_diffusers_checkpoint(
|
||||
resnet, checkpoint, diffusers_resnet_prefix=diffusers_resnet_prefix, resnet_prefix=resnet_prefix
|
||||
)
|
||||
)
|
||||
|
||||
# downsample
|
||||
|
||||
# do not include the downsample when on the last down block
|
||||
# There is no downsample on the last down block
|
||||
if down_block_idx != len(model.encoder.down_blocks) - 1:
|
||||
# There's a single downsample in the original checkpoint but a list of downsamples
|
||||
# in the diffusers model.
|
||||
diffusers_downsample_prefix = f"{diffusers_down_block_prefix}.downsamplers.0.conv"
|
||||
downsample_prefix = f"{down_block_prefix}.downsample.conv"
|
||||
diffusers_checkpoint.update(
|
||||
{
|
||||
f"{diffusers_downsample_prefix}.weight": checkpoint[f"{downsample_prefix}.weight"],
|
||||
f"{diffusers_downsample_prefix}.bias": checkpoint[f"{downsample_prefix}.bias"],
|
||||
}
|
||||
)
|
||||
|
||||
# attentions
|
||||
|
||||
if hasattr(down_block, "attentions"):
|
||||
for attention_idx, _ in enumerate(down_block.attentions):
|
||||
diffusers_attention_prefix = f"{diffusers_down_block_prefix}.attentions.{attention_idx}"
|
||||
attention_prefix = f"{down_block_prefix}.attn.{attention_idx}"
|
||||
diffusers_checkpoint.update(
|
||||
vqvae_attention_to_diffusers_checkpoint(
|
||||
checkpoint,
|
||||
diffusers_attention_prefix=diffusers_attention_prefix,
|
||||
attention_prefix=attention_prefix,
|
||||
)
|
||||
)
|
||||
|
||||
# mid block
|
||||
|
||||
# mid block attentions
|
||||
|
||||
# There is a single hardcoded attention block in the middle of the VQ-diffusion encoder
|
||||
diffusers_attention_prefix = "encoder.mid_block.attentions.0"
|
||||
attention_prefix = "encoder.mid.attn_1"
|
||||
diffusers_checkpoint.update(
|
||||
vqvae_attention_to_diffusers_checkpoint(
|
||||
checkpoint, diffusers_attention_prefix=diffusers_attention_prefix, attention_prefix=attention_prefix
|
||||
)
|
||||
)
|
||||
|
||||
# mid block resnets
|
||||
|
||||
for diffusers_resnet_idx, resnet in enumerate(model.encoder.mid_block.resnets):
|
||||
diffusers_resnet_prefix = f"encoder.mid_block.resnets.{diffusers_resnet_idx}"
|
||||
|
||||
# the hardcoded prefixes to `block_` are 1 and 2
|
||||
orig_resnet_idx = diffusers_resnet_idx + 1
|
||||
# There are two hardcoded resnets in the middle of the VQ-diffusion encoder
|
||||
resnet_prefix = f"encoder.mid.block_{orig_resnet_idx}"
|
||||
|
||||
diffusers_checkpoint.update(
|
||||
vqvae_resnet_to_diffusers_checkpoint(
|
||||
resnet, checkpoint, diffusers_resnet_prefix=diffusers_resnet_prefix, resnet_prefix=resnet_prefix
|
||||
)
|
||||
)
|
||||
|
||||
diffusers_checkpoint.update(
|
||||
{
|
||||
# conv_norm_out
|
||||
"encoder.conv_norm_out.weight": checkpoint["encoder.norm_out.weight"],
|
||||
"encoder.conv_norm_out.bias": checkpoint["encoder.norm_out.bias"],
|
||||
# conv_out
|
||||
"encoder.conv_out.weight": checkpoint["encoder.conv_out.weight"],
|
||||
"encoder.conv_out.bias": checkpoint["encoder.conv_out.bias"],
|
||||
}
|
||||
)
|
||||
|
||||
return diffusers_checkpoint
|
||||
|
||||
|
||||
def vqvae_decoder_to_diffusers_checkpoint(model, checkpoint):
|
||||
diffusers_checkpoint = {}
|
||||
|
||||
# conv in
|
||||
diffusers_checkpoint.update(
|
||||
{
|
||||
"decoder.conv_in.weight": checkpoint["decoder.conv_in.weight"],
|
||||
"decoder.conv_in.bias": checkpoint["decoder.conv_in.bias"],
|
||||
}
|
||||
)
|
||||
|
||||
# up_blocks
|
||||
|
||||
for diffusers_up_block_idx, up_block in enumerate(model.decoder.up_blocks):
|
||||
# up_blocks are stored in reverse order in the VQ-diffusion checkpoint
|
||||
orig_up_block_idx = len(model.decoder.up_blocks) - 1 - diffusers_up_block_idx
|
||||
|
||||
diffusers_up_block_prefix = f"decoder.up_blocks.{diffusers_up_block_idx}"
|
||||
up_block_prefix = f"decoder.up.{orig_up_block_idx}"
|
||||
|
||||
# resnets
|
||||
for resnet_idx, resnet in enumerate(up_block.resnets):
|
||||
diffusers_resnet_prefix = f"{diffusers_up_block_prefix}.resnets.{resnet_idx}"
|
||||
resnet_prefix = f"{up_block_prefix}.block.{resnet_idx}"
|
||||
|
||||
diffusers_checkpoint.update(
|
||||
vqvae_resnet_to_diffusers_checkpoint(
|
||||
resnet, checkpoint, diffusers_resnet_prefix=diffusers_resnet_prefix, resnet_prefix=resnet_prefix
|
||||
)
|
||||
)
|
||||
|
||||
# upsample
|
||||
|
||||
# there is no up sample on the last up block
|
||||
if diffusers_up_block_idx != len(model.decoder.up_blocks) - 1:
|
||||
# There's a single upsample in the VQ-diffusion checkpoint but a list of downsamples
|
||||
# in the diffusers model.
|
||||
diffusers_downsample_prefix = f"{diffusers_up_block_prefix}.upsamplers.0.conv"
|
||||
downsample_prefix = f"{up_block_prefix}.upsample.conv"
|
||||
diffusers_checkpoint.update(
|
||||
{
|
||||
f"{diffusers_downsample_prefix}.weight": checkpoint[f"{downsample_prefix}.weight"],
|
||||
f"{diffusers_downsample_prefix}.bias": checkpoint[f"{downsample_prefix}.bias"],
|
||||
}
|
||||
)
|
||||
|
||||
# attentions
|
||||
|
||||
if hasattr(up_block, "attentions"):
|
||||
for attention_idx, _ in enumerate(up_block.attentions):
|
||||
diffusers_attention_prefix = f"{diffusers_up_block_prefix}.attentions.{attention_idx}"
|
||||
attention_prefix = f"{up_block_prefix}.attn.{attention_idx}"
|
||||
diffusers_checkpoint.update(
|
||||
vqvae_attention_to_diffusers_checkpoint(
|
||||
checkpoint,
|
||||
diffusers_attention_prefix=diffusers_attention_prefix,
|
||||
attention_prefix=attention_prefix,
|
||||
)
|
||||
)
|
||||
|
||||
# mid block
|
||||
|
||||
# mid block attentions
|
||||
|
||||
# There is a single hardcoded attention block in the middle of the VQ-diffusion decoder
|
||||
diffusers_attention_prefix = "decoder.mid_block.attentions.0"
|
||||
attention_prefix = "decoder.mid.attn_1"
|
||||
diffusers_checkpoint.update(
|
||||
vqvae_attention_to_diffusers_checkpoint(
|
||||
checkpoint, diffusers_attention_prefix=diffusers_attention_prefix, attention_prefix=attention_prefix
|
||||
)
|
||||
)
|
||||
|
||||
# mid block resnets
|
||||
|
||||
for diffusers_resnet_idx, resnet in enumerate(model.encoder.mid_block.resnets):
|
||||
diffusers_resnet_prefix = f"decoder.mid_block.resnets.{diffusers_resnet_idx}"
|
||||
|
||||
# the hardcoded prefixes to `block_` are 1 and 2
|
||||
orig_resnet_idx = diffusers_resnet_idx + 1
|
||||
# There are two hardcoded resnets in the middle of the VQ-diffusion decoder
|
||||
resnet_prefix = f"decoder.mid.block_{orig_resnet_idx}"
|
||||
|
||||
diffusers_checkpoint.update(
|
||||
vqvae_resnet_to_diffusers_checkpoint(
|
||||
resnet, checkpoint, diffusers_resnet_prefix=diffusers_resnet_prefix, resnet_prefix=resnet_prefix
|
||||
)
|
||||
)
|
||||
|
||||
diffusers_checkpoint.update(
|
||||
{
|
||||
# conv_norm_out
|
||||
"decoder.conv_norm_out.weight": checkpoint["decoder.norm_out.weight"],
|
||||
"decoder.conv_norm_out.bias": checkpoint["decoder.norm_out.bias"],
|
||||
# conv_out
|
||||
"decoder.conv_out.weight": checkpoint["decoder.conv_out.weight"],
|
||||
"decoder.conv_out.bias": checkpoint["decoder.conv_out.bias"],
|
||||
}
|
||||
)
|
||||
|
||||
return diffusers_checkpoint
|
||||
|
||||
|
||||
def vqvae_resnet_to_diffusers_checkpoint(resnet, checkpoint, *, diffusers_resnet_prefix, resnet_prefix):
|
||||
rv = {
|
||||
# norm1
|
||||
f"{diffusers_resnet_prefix}.norm1.weight": checkpoint[f"{resnet_prefix}.norm1.weight"],
|
||||
f"{diffusers_resnet_prefix}.norm1.bias": checkpoint[f"{resnet_prefix}.norm1.bias"],
|
||||
# conv1
|
||||
f"{diffusers_resnet_prefix}.conv1.weight": checkpoint[f"{resnet_prefix}.conv1.weight"],
|
||||
f"{diffusers_resnet_prefix}.conv1.bias": checkpoint[f"{resnet_prefix}.conv1.bias"],
|
||||
# norm2
|
||||
f"{diffusers_resnet_prefix}.norm2.weight": checkpoint[f"{resnet_prefix}.norm2.weight"],
|
||||
f"{diffusers_resnet_prefix}.norm2.bias": checkpoint[f"{resnet_prefix}.norm2.bias"],
|
||||
# conv2
|
||||
f"{diffusers_resnet_prefix}.conv2.weight": checkpoint[f"{resnet_prefix}.conv2.weight"],
|
||||
f"{diffusers_resnet_prefix}.conv2.bias": checkpoint[f"{resnet_prefix}.conv2.bias"],
|
||||
}
|
||||
|
||||
if resnet.conv_shortcut is not None:
|
||||
rv.update(
|
||||
{
|
||||
f"{diffusers_resnet_prefix}.conv_shortcut.weight": checkpoint[f"{resnet_prefix}.nin_shortcut.weight"],
|
||||
f"{diffusers_resnet_prefix}.conv_shortcut.bias": checkpoint[f"{resnet_prefix}.nin_shortcut.bias"],
|
||||
}
|
||||
)
|
||||
|
||||
return rv
|
||||
|
||||
|
||||
def vqvae_attention_to_diffusers_checkpoint(checkpoint, *, diffusers_attention_prefix, attention_prefix):
|
||||
return {
|
||||
# group_norm
|
||||
f"{diffusers_attention_prefix}.group_norm.weight": checkpoint[f"{attention_prefix}.norm.weight"],
|
||||
f"{diffusers_attention_prefix}.group_norm.bias": checkpoint[f"{attention_prefix}.norm.bias"],
|
||||
# query
|
||||
f"{diffusers_attention_prefix}.query.weight": checkpoint[f"{attention_prefix}.q.weight"][:, :, 0, 0],
|
||||
f"{diffusers_attention_prefix}.query.bias": checkpoint[f"{attention_prefix}.q.bias"],
|
||||
# key
|
||||
f"{diffusers_attention_prefix}.key.weight": checkpoint[f"{attention_prefix}.k.weight"][:, :, 0, 0],
|
||||
f"{diffusers_attention_prefix}.key.bias": checkpoint[f"{attention_prefix}.k.bias"],
|
||||
# value
|
||||
f"{diffusers_attention_prefix}.value.weight": checkpoint[f"{attention_prefix}.v.weight"][:, :, 0, 0],
|
||||
f"{diffusers_attention_prefix}.value.bias": checkpoint[f"{attention_prefix}.v.bias"],
|
||||
# proj_attn
|
||||
f"{diffusers_attention_prefix}.proj_attn.weight": checkpoint[f"{attention_prefix}.proj_out.weight"][
|
||||
:, :, 0, 0
|
||||
],
|
||||
f"{diffusers_attention_prefix}.proj_attn.bias": checkpoint[f"{attention_prefix}.proj_out.bias"],
|
||||
}
|
||||
|
||||
|
||||
# done vqvae checkpoint
|
||||
|
||||
# transformer model
|
||||
|
||||
PORTED_DIFFUSIONS = ["image_synthesis.modeling.transformers.diffusion_transformer.DiffusionTransformer"]
|
||||
PORTED_TRANSFORMERS = ["image_synthesis.modeling.transformers.transformer_utils.Text2ImageTransformer"]
|
||||
PORTED_CONTENT_EMBEDDINGS = ["image_synthesis.modeling.embeddings.dalle_mask_image_embedding.DalleMaskImageEmbedding"]
|
||||
|
||||
|
||||
def transformer_model_from_original_config(
|
||||
original_diffusion_config, original_transformer_config, original_content_embedding_config
|
||||
):
|
||||
assert (
|
||||
original_diffusion_config.target in PORTED_DIFFUSIONS
|
||||
), f"{original_diffusion_config.target} has not yet been ported to diffusers."
|
||||
assert (
|
||||
original_transformer_config.target in PORTED_TRANSFORMERS
|
||||
), f"{original_transformer_config.target} has not yet been ported to diffusers."
|
||||
assert (
|
||||
original_content_embedding_config.target in PORTED_CONTENT_EMBEDDINGS
|
||||
), f"{original_content_embedding_config.target} has not yet been ported to diffusers."
|
||||
|
||||
original_diffusion_config = original_diffusion_config.params
|
||||
original_transformer_config = original_transformer_config.params
|
||||
original_content_embedding_config = original_content_embedding_config.params
|
||||
|
||||
inner_dim = original_transformer_config["n_embd"]
|
||||
|
||||
n_heads = original_transformer_config["n_head"]
|
||||
|
||||
# VQ-Diffusion gives dimension of the multi-headed attention layers as the
|
||||
# number of attention heads times the sequence length (the dimension) of a
|
||||
# single head. We want to specify our attention blocks with those values
|
||||
# specified separately
|
||||
assert inner_dim % n_heads == 0
|
||||
d_head = inner_dim // n_heads
|
||||
|
||||
depth = original_transformer_config["n_layer"]
|
||||
context_dim = original_transformer_config["condition_dim"]
|
||||
|
||||
num_embed = original_content_embedding_config["num_embed"]
|
||||
# the number of embeddings in the transformer includes the mask embedding.
|
||||
# the content embedding (the vqvae) does not include the mask embedding.
|
||||
num_embed = num_embed + 1
|
||||
|
||||
height = original_transformer_config["content_spatial_size"][0]
|
||||
width = original_transformer_config["content_spatial_size"][1]
|
||||
|
||||
assert width == height, "width has to be equal to height"
|
||||
dropout = original_transformer_config["resid_pdrop"]
|
||||
num_embeds_ada_norm = original_diffusion_config["diffusion_step"]
|
||||
|
||||
model_kwargs = {
|
||||
"attention_bias": True,
|
||||
"cross_attention_dim": context_dim,
|
||||
"attention_head_dim": d_head,
|
||||
"num_layers": depth,
|
||||
"dropout": dropout,
|
||||
"num_attention_heads": n_heads,
|
||||
"num_vector_embeds": num_embed,
|
||||
"num_embeds_ada_norm": num_embeds_ada_norm,
|
||||
"norm_num_groups": 32,
|
||||
"sample_size": width,
|
||||
"activation_fn": "geglu-approximate",
|
||||
}
|
||||
|
||||
model = Transformer2DModel(**model_kwargs)
|
||||
return model
|
||||
|
||||
|
||||
# done transformer model
|
||||
|
||||
# transformer checkpoint
|
||||
|
||||
|
||||
def transformer_original_checkpoint_to_diffusers_checkpoint(model, checkpoint):
|
||||
diffusers_checkpoint = {}
|
||||
|
||||
transformer_prefix = "transformer.transformer"
|
||||
|
||||
diffusers_latent_image_embedding_prefix = "latent_image_embedding"
|
||||
latent_image_embedding_prefix = f"{transformer_prefix}.content_emb"
|
||||
|
||||
# DalleMaskImageEmbedding
|
||||
diffusers_checkpoint.update(
|
||||
{
|
||||
f"{diffusers_latent_image_embedding_prefix}.emb.weight": checkpoint[
|
||||
f"{latent_image_embedding_prefix}.emb.weight"
|
||||
],
|
||||
f"{diffusers_latent_image_embedding_prefix}.height_emb.weight": checkpoint[
|
||||
f"{latent_image_embedding_prefix}.height_emb.weight"
|
||||
],
|
||||
f"{diffusers_latent_image_embedding_prefix}.width_emb.weight": checkpoint[
|
||||
f"{latent_image_embedding_prefix}.width_emb.weight"
|
||||
],
|
||||
}
|
||||
)
|
||||
|
||||
# transformer blocks
|
||||
for transformer_block_idx, transformer_block in enumerate(model.transformer_blocks):
|
||||
diffusers_transformer_block_prefix = f"transformer_blocks.{transformer_block_idx}"
|
||||
transformer_block_prefix = f"{transformer_prefix}.blocks.{transformer_block_idx}"
|
||||
|
||||
# ada norm block
|
||||
diffusers_ada_norm_prefix = f"{diffusers_transformer_block_prefix}.norm1"
|
||||
ada_norm_prefix = f"{transformer_block_prefix}.ln1"
|
||||
|
||||
diffusers_checkpoint.update(
|
||||
transformer_ada_norm_to_diffusers_checkpoint(
|
||||
checkpoint, diffusers_ada_norm_prefix=diffusers_ada_norm_prefix, ada_norm_prefix=ada_norm_prefix
|
||||
)
|
||||
)
|
||||
|
||||
# attention block
|
||||
diffusers_attention_prefix = f"{diffusers_transformer_block_prefix}.attn1"
|
||||
attention_prefix = f"{transformer_block_prefix}.attn1"
|
||||
|
||||
diffusers_checkpoint.update(
|
||||
transformer_attention_to_diffusers_checkpoint(
|
||||
checkpoint, diffusers_attention_prefix=diffusers_attention_prefix, attention_prefix=attention_prefix
|
||||
)
|
||||
)
|
||||
|
||||
# ada norm block
|
||||
diffusers_ada_norm_prefix = f"{diffusers_transformer_block_prefix}.norm2"
|
||||
ada_norm_prefix = f"{transformer_block_prefix}.ln1_1"
|
||||
|
||||
diffusers_checkpoint.update(
|
||||
transformer_ada_norm_to_diffusers_checkpoint(
|
||||
checkpoint, diffusers_ada_norm_prefix=diffusers_ada_norm_prefix, ada_norm_prefix=ada_norm_prefix
|
||||
)
|
||||
)
|
||||
|
||||
# attention block
|
||||
diffusers_attention_prefix = f"{diffusers_transformer_block_prefix}.attn2"
|
||||
attention_prefix = f"{transformer_block_prefix}.attn2"
|
||||
|
||||
diffusers_checkpoint.update(
|
||||
transformer_attention_to_diffusers_checkpoint(
|
||||
checkpoint, diffusers_attention_prefix=diffusers_attention_prefix, attention_prefix=attention_prefix
|
||||
)
|
||||
)
|
||||
|
||||
# norm block
|
||||
diffusers_norm_block_prefix = f"{diffusers_transformer_block_prefix}.norm3"
|
||||
norm_block_prefix = f"{transformer_block_prefix}.ln2"
|
||||
|
||||
diffusers_checkpoint.update(
|
||||
{
|
||||
f"{diffusers_norm_block_prefix}.weight": checkpoint[f"{norm_block_prefix}.weight"],
|
||||
f"{diffusers_norm_block_prefix}.bias": checkpoint[f"{norm_block_prefix}.bias"],
|
||||
}
|
||||
)
|
||||
|
||||
# feedforward block
|
||||
diffusers_feedforward_prefix = f"{diffusers_transformer_block_prefix}.ff"
|
||||
feedforward_prefix = f"{transformer_block_prefix}.mlp"
|
||||
|
||||
diffusers_checkpoint.update(
|
||||
transformer_feedforward_to_diffusers_checkpoint(
|
||||
checkpoint,
|
||||
diffusers_feedforward_prefix=diffusers_feedforward_prefix,
|
||||
feedforward_prefix=feedforward_prefix,
|
||||
)
|
||||
)
|
||||
|
||||
# to logits
|
||||
|
||||
diffusers_norm_out_prefix = "norm_out"
|
||||
norm_out_prefix = f"{transformer_prefix}.to_logits.0"
|
||||
|
||||
diffusers_checkpoint.update(
|
||||
{
|
||||
f"{diffusers_norm_out_prefix}.weight": checkpoint[f"{norm_out_prefix}.weight"],
|
||||
f"{diffusers_norm_out_prefix}.bias": checkpoint[f"{norm_out_prefix}.bias"],
|
||||
}
|
||||
)
|
||||
|
||||
diffusers_out_prefix = "out"
|
||||
out_prefix = f"{transformer_prefix}.to_logits.1"
|
||||
|
||||
diffusers_checkpoint.update(
|
||||
{
|
||||
f"{diffusers_out_prefix}.weight": checkpoint[f"{out_prefix}.weight"],
|
||||
f"{diffusers_out_prefix}.bias": checkpoint[f"{out_prefix}.bias"],
|
||||
}
|
||||
)
|
||||
|
||||
return diffusers_checkpoint
|
||||
|
||||
|
||||
def transformer_ada_norm_to_diffusers_checkpoint(checkpoint, *, diffusers_ada_norm_prefix, ada_norm_prefix):
|
||||
return {
|
||||
f"{diffusers_ada_norm_prefix}.emb.weight": checkpoint[f"{ada_norm_prefix}.emb.weight"],
|
||||
f"{diffusers_ada_norm_prefix}.linear.weight": checkpoint[f"{ada_norm_prefix}.linear.weight"],
|
||||
f"{diffusers_ada_norm_prefix}.linear.bias": checkpoint[f"{ada_norm_prefix}.linear.bias"],
|
||||
}
|
||||
|
||||
|
||||
def transformer_attention_to_diffusers_checkpoint(checkpoint, *, diffusers_attention_prefix, attention_prefix):
|
||||
return {
|
||||
# key
|
||||
f"{diffusers_attention_prefix}.to_k.weight": checkpoint[f"{attention_prefix}.key.weight"],
|
||||
f"{diffusers_attention_prefix}.to_k.bias": checkpoint[f"{attention_prefix}.key.bias"],
|
||||
# query
|
||||
f"{diffusers_attention_prefix}.to_q.weight": checkpoint[f"{attention_prefix}.query.weight"],
|
||||
f"{diffusers_attention_prefix}.to_q.bias": checkpoint[f"{attention_prefix}.query.bias"],
|
||||
# value
|
||||
f"{diffusers_attention_prefix}.to_v.weight": checkpoint[f"{attention_prefix}.value.weight"],
|
||||
f"{diffusers_attention_prefix}.to_v.bias": checkpoint[f"{attention_prefix}.value.bias"],
|
||||
# linear out
|
||||
f"{diffusers_attention_prefix}.to_out.0.weight": checkpoint[f"{attention_prefix}.proj.weight"],
|
||||
f"{diffusers_attention_prefix}.to_out.0.bias": checkpoint[f"{attention_prefix}.proj.bias"],
|
||||
}
|
||||
|
||||
|
||||
def transformer_feedforward_to_diffusers_checkpoint(checkpoint, *, diffusers_feedforward_prefix, feedforward_prefix):
|
||||
return {
|
||||
f"{diffusers_feedforward_prefix}.net.0.proj.weight": checkpoint[f"{feedforward_prefix}.0.weight"],
|
||||
f"{diffusers_feedforward_prefix}.net.0.proj.bias": checkpoint[f"{feedforward_prefix}.0.bias"],
|
||||
f"{diffusers_feedforward_prefix}.net.2.weight": checkpoint[f"{feedforward_prefix}.2.weight"],
|
||||
f"{diffusers_feedforward_prefix}.net.2.bias": checkpoint[f"{feedforward_prefix}.2.bias"],
|
||||
}
|
||||
|
||||
|
||||
# done transformer checkpoint
|
||||
|
||||
|
||||
def read_config_file(filename):
|
||||
# The yaml file contains annotations that certain values should
|
||||
# loaded as tuples. By default, OmegaConf will panic when reading
|
||||
# these. Instead, we can manually read the yaml with the FullLoader and then
|
||||
# construct the OmegaConf object.
|
||||
with open(filename) as f:
|
||||
original_config = yaml.load(f, FullLoader)
|
||||
|
||||
return OmegaConf.create(original_config)
|
||||
|
||||
|
||||
# We take separate arguments for the vqvae because the ITHQ vqvae config file
|
||||
# is separate from the config file for the rest of the model.
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser()
|
||||
|
||||
parser.add_argument(
|
||||
"--vqvae_checkpoint_path",
|
||||
default=None,
|
||||
type=str,
|
||||
required=True,
|
||||
help="Path to the vqvae checkpoint to convert.",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--vqvae_original_config_file",
|
||||
default=None,
|
||||
type=str,
|
||||
required=True,
|
||||
help="The YAML config file corresponding to the original architecture for the vqvae.",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--checkpoint_path", default=None, type=str, required=True, help="Path to the checkpoint to convert."
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--original_config_file",
|
||||
default=None,
|
||||
type=str,
|
||||
required=True,
|
||||
help="The YAML config file corresponding to the original architecture.",
|
||||
)
|
||||
|
||||
parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output model.")
|
||||
|
||||
parser.add_argument(
|
||||
"--checkpoint_load_device",
|
||||
default="cpu",
|
||||
type=str,
|
||||
required=False,
|
||||
help="The device passed to `map_location` when loading checkpoints.",
|
||||
)
|
||||
|
||||
# See link for how ema weights are always selected
|
||||
# https://github.com/microsoft/VQ-Diffusion/blob/3c98e77f721db7c787b76304fa2c96a36c7b00af/inference_VQ_Diffusion.py#L65
|
||||
parser.add_argument(
|
||||
"--no_use_ema",
|
||||
action="store_true",
|
||||
required=False,
|
||||
help=(
|
||||
"Set to not use the ema weights from the original VQ-Diffusion checkpoint. You probably do not want to set"
|
||||
" it as the original VQ-Diffusion always uses the ema weights when loading models."
|
||||
),
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
use_ema = not args.no_use_ema
|
||||
|
||||
print(f"loading checkpoints to {args.checkpoint_load_device}")
|
||||
|
||||
checkpoint_map_location = torch.device(args.checkpoint_load_device)
|
||||
|
||||
# vqvae_model
|
||||
|
||||
print(f"loading vqvae, config: {args.vqvae_original_config_file}, checkpoint: {args.vqvae_checkpoint_path}")
|
||||
|
||||
vqvae_original_config = read_config_file(args.vqvae_original_config_file).model
|
||||
vqvae_checkpoint = torch.load(args.vqvae_checkpoint_path, map_location=checkpoint_map_location)["model"]
|
||||
|
||||
with init_empty_weights():
|
||||
vqvae_model = vqvae_model_from_original_config(vqvae_original_config)
|
||||
|
||||
vqvae_diffusers_checkpoint = vqvae_original_checkpoint_to_diffusers_checkpoint(vqvae_model, vqvae_checkpoint)
|
||||
|
||||
with tempfile.NamedTemporaryFile() as vqvae_diffusers_checkpoint_file:
|
||||
torch.save(vqvae_diffusers_checkpoint, vqvae_diffusers_checkpoint_file.name)
|
||||
del vqvae_diffusers_checkpoint
|
||||
del vqvae_checkpoint
|
||||
load_checkpoint_and_dispatch(vqvae_model, vqvae_diffusers_checkpoint_file.name, device_map="auto")
|
||||
|
||||
print("done loading vqvae")
|
||||
|
||||
# done vqvae_model
|
||||
|
||||
# transformer_model
|
||||
|
||||
print(
|
||||
f"loading transformer, config: {args.original_config_file}, checkpoint: {args.checkpoint_path}, use ema:"
|
||||
f" {use_ema}"
|
||||
)
|
||||
|
||||
original_config = read_config_file(args.original_config_file).model
|
||||
|
||||
diffusion_config = original_config.params.diffusion_config
|
||||
transformer_config = original_config.params.diffusion_config.params.transformer_config
|
||||
content_embedding_config = original_config.params.diffusion_config.params.content_emb_config
|
||||
|
||||
pre_checkpoint = torch.load(args.checkpoint_path, map_location=checkpoint_map_location)
|
||||
|
||||
if use_ema:
|
||||
if "ema" in pre_checkpoint:
|
||||
checkpoint = {}
|
||||
for k, v in pre_checkpoint["model"].items():
|
||||
checkpoint[k] = v
|
||||
|
||||
for k, v in pre_checkpoint["ema"].items():
|
||||
# The ema weights are only used on the transformer. To mimic their key as if they came
|
||||
# from the state_dict for the top level model, we prefix with an additional "transformer."
|
||||
# See the source linked in the args.use_ema config for more information.
|
||||
checkpoint[f"transformer.{k}"] = v
|
||||
else:
|
||||
print("attempted to load ema weights but no ema weights are specified in the loaded checkpoint.")
|
||||
checkpoint = pre_checkpoint["model"]
|
||||
else:
|
||||
checkpoint = pre_checkpoint["model"]
|
||||
|
||||
del pre_checkpoint
|
||||
|
||||
with init_empty_weights():
|
||||
transformer_model = transformer_model_from_original_config(
|
||||
diffusion_config, transformer_config, content_embedding_config
|
||||
)
|
||||
|
||||
diffusers_transformer_checkpoint = transformer_original_checkpoint_to_diffusers_checkpoint(
|
||||
transformer_model, checkpoint
|
||||
)
|
||||
|
||||
with tempfile.NamedTemporaryFile() as diffusers_transformer_checkpoint_file:
|
||||
torch.save(diffusers_transformer_checkpoint, diffusers_transformer_checkpoint_file.name)
|
||||
del diffusers_transformer_checkpoint
|
||||
del checkpoint
|
||||
load_checkpoint_and_dispatch(transformer_model, diffusers_transformer_checkpoint_file.name, device_map="auto")
|
||||
|
||||
print("done loading transformer")
|
||||
|
||||
# done transformer_model
|
||||
|
||||
# text encoder
|
||||
|
||||
print("loading CLIP text encoder")
|
||||
|
||||
clip_name = "openai/clip-vit-base-patch32"
|
||||
|
||||
# The original VQ-Diffusion specifies the pad value by the int used in the
|
||||
# returned tokens. Each model uses `0` as the pad value. The transformers clip api
|
||||
# specifies the pad value via the token before it has been tokenized. The `!` pad
|
||||
# token is the same as padding with the `0` pad value.
|
||||
pad_token = "!"
|
||||
|
||||
tokenizer_model = CLIPTokenizer.from_pretrained(clip_name, pad_token=pad_token, device_map="auto")
|
||||
|
||||
assert tokenizer_model.convert_tokens_to_ids(pad_token) == 0
|
||||
|
||||
text_encoder_model = CLIPTextModel.from_pretrained(
|
||||
clip_name,
|
||||
# `CLIPTextModel` does not support device_map="auto"
|
||||
# device_map="auto"
|
||||
)
|
||||
|
||||
print("done loading CLIP text encoder")
|
||||
|
||||
# done text encoder
|
||||
|
||||
# scheduler
|
||||
|
||||
scheduler_model = VQDiffusionScheduler(
|
||||
# the scheduler has the same number of embeddings as the transformer
|
||||
num_vec_classes=transformer_model.num_vector_embeds
|
||||
)
|
||||
|
||||
# done scheduler
|
||||
|
||||
print(f"saving VQ diffusion model, path: {args.dump_path}")
|
||||
|
||||
pipe = VQDiffusionPipeline(
|
||||
vqvae=vqvae_model,
|
||||
transformer=transformer_model,
|
||||
tokenizer=tokenizer_model,
|
||||
text_encoder=text_encoder_model,
|
||||
scheduler=scheduler_model,
|
||||
)
|
||||
pipe.save_pretrained(args.dump_path)
|
||||
|
||||
print("done writing VQ diffusion model")
|
||||
13
setup.py
13
setup.py
@@ -89,11 +89,11 @@ _deps = [
|
||||
"huggingface-hub>=0.10.0",
|
||||
"importlib_metadata",
|
||||
"isort>=5.5.4",
|
||||
"jax>=0.2.8,!=0.3.2",
|
||||
"jaxlib>=0.1.65",
|
||||
"jax>=0.2.8,!=0.3.2,<=0.3.6",
|
||||
"jaxlib>=0.1.65,<=0.3.6",
|
||||
"modelcards>=0.1.4",
|
||||
"numpy",
|
||||
"parameterized",
|
||||
"onnxruntime",
|
||||
"pytest",
|
||||
"pytest-timeout",
|
||||
"pytest-xdist",
|
||||
@@ -178,8 +178,9 @@ extras["quality"] = deps_list("black", "isort", "flake8", "hf-doc-builder")
|
||||
extras["docs"] = deps_list("hf-doc-builder")
|
||||
extras["training"] = deps_list("accelerate", "datasets", "tensorboard", "modelcards")
|
||||
extras["test"] = deps_list(
|
||||
"accelerate",
|
||||
"datasets",
|
||||
"parameterized",
|
||||
"onnxruntime",
|
||||
"pytest",
|
||||
"pytest-timeout",
|
||||
"pytest-xdist",
|
||||
@@ -187,7 +188,7 @@ extras["test"] = deps_list(
|
||||
"torchvision",
|
||||
"transformers"
|
||||
)
|
||||
extras["torch"] = deps_list("torch", "accelerate")
|
||||
extras["torch"] = deps_list("torch")
|
||||
|
||||
if os.name == "nt": # windows
|
||||
extras["flax"] = [] # jax is not supported on windows
|
||||
@@ -210,7 +211,7 @@ install_requires = [
|
||||
|
||||
setup(
|
||||
name="diffusers",
|
||||
version="0.7.0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
|
||||
version="0.5.1", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
|
||||
description="Diffusers",
|
||||
long_description=open("README.md", "r", encoding="utf-8").read(),
|
||||
long_description_content_type="text/markdown",
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
from .utils import (
|
||||
is_accelerate_available,
|
||||
is_flax_available,
|
||||
is_inflect_available,
|
||||
is_onnx_available,
|
||||
@@ -10,23 +9,16 @@ from .utils import (
|
||||
)
|
||||
|
||||
|
||||
__version__ = "0.7.0"
|
||||
__version__ = "0.5.1"
|
||||
|
||||
from .configuration_utils import ConfigMixin
|
||||
from .onnx_utils import OnnxRuntimeModel
|
||||
from .utils import logging
|
||||
|
||||
|
||||
# This will create an extra dummy file "dummy_torch_and_accelerate_objects.py"
|
||||
# TODO: (patil-suraj, anton-l) maybe import everything under is_torch_and_accelerate_available
|
||||
if is_torch_available() and not is_accelerate_available():
|
||||
error_msg = "Please install the `accelerate` library to use Diffusers with PyTorch. You can do so by running `pip install diffusers[torch]`. Or if torch is already installed, you can run `pip install accelerate`." # noqa: E501
|
||||
raise ImportError(error_msg)
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
from .modeling_utils import ModelMixin
|
||||
from .models import AutoencoderKL, Transformer2DModel, UNet1DModel, UNet2DConditionModel, UNet2DModel, VQModel
|
||||
from .models import AutoencoderKL, UNet2DConditionModel, UNet2DModel, VQModel
|
||||
from .optimization import (
|
||||
get_constant_schedule,
|
||||
get_constant_schedule_with_warmup,
|
||||
@@ -37,28 +29,14 @@ if is_torch_available():
|
||||
get_scheduler,
|
||||
)
|
||||
from .pipeline_utils import DiffusionPipeline
|
||||
from .pipelines import (
|
||||
DanceDiffusionPipeline,
|
||||
DDIMPipeline,
|
||||
DDPMPipeline,
|
||||
KarrasVePipeline,
|
||||
LDMPipeline,
|
||||
PNDMPipeline,
|
||||
RePaintPipeline,
|
||||
ScoreSdeVePipeline,
|
||||
)
|
||||
from .pipelines import DDIMPipeline, DDPMPipeline, KarrasVePipeline, LDMPipeline, PNDMPipeline, ScoreSdeVePipeline
|
||||
from .schedulers import (
|
||||
DDIMScheduler,
|
||||
DDPMScheduler,
|
||||
EulerAncestralDiscreteScheduler,
|
||||
EulerDiscreteScheduler,
|
||||
IPNDMScheduler,
|
||||
KarrasVeScheduler,
|
||||
PNDMScheduler,
|
||||
RePaintScheduler,
|
||||
SchedulerMixin,
|
||||
ScoreSdeVeScheduler,
|
||||
VQDiffusionScheduler,
|
||||
)
|
||||
from .training_utils import EMAModel
|
||||
else:
|
||||
@@ -74,20 +52,13 @@ if is_torch_available() and is_transformers_available():
|
||||
LDMTextToImagePipeline,
|
||||
StableDiffusionImg2ImgPipeline,
|
||||
StableDiffusionInpaintPipeline,
|
||||
StableDiffusionInpaintPipelineLegacy,
|
||||
StableDiffusionPipeline,
|
||||
VQDiffusionPipeline,
|
||||
)
|
||||
else:
|
||||
from .utils.dummy_torch_and_transformers_objects import * # noqa F403
|
||||
|
||||
if is_torch_available() and is_transformers_available() and is_onnx_available():
|
||||
from .pipelines import (
|
||||
OnnxStableDiffusionImg2ImgPipeline,
|
||||
OnnxStableDiffusionInpaintPipeline,
|
||||
OnnxStableDiffusionPipeline,
|
||||
StableDiffusionOnnxPipeline,
|
||||
)
|
||||
from .pipelines import StableDiffusionOnnxPipeline
|
||||
else:
|
||||
from .utils.dummy_torch_and_transformers_and_onnx_objects import * # noqa F403
|
||||
|
||||
|
||||
@@ -13,10 +13,9 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
""" ConfigMixin base class and utilities."""
|
||||
""" ConfigMixinuration base class and utilities."""
|
||||
import dataclasses
|
||||
import functools
|
||||
import importlib
|
||||
import inspect
|
||||
import json
|
||||
import os
|
||||
@@ -49,13 +48,9 @@ class ConfigMixin:
|
||||
[`~ConfigMixin.save_config`] (should be overridden by parent class).
|
||||
- **ignore_for_config** (`List[str]`) -- A list of attributes that should not be saved in the config (should be
|
||||
overridden by parent class).
|
||||
- **_compatible_classes** (`List[str]`) -- A list of classes that are compatible with the parent class, so that
|
||||
`from_config` can be used from a class different than the one used to save the config (should be overridden
|
||||
by parent class).
|
||||
"""
|
||||
config_name = None
|
||||
ignore_for_config = []
|
||||
_compatible_classes = []
|
||||
|
||||
def register_to_config(self, **kwargs):
|
||||
if self.config_name is None:
|
||||
@@ -285,14 +280,9 @@ class ConfigMixin:
|
||||
|
||||
return config_dict
|
||||
|
||||
@staticmethod
|
||||
def _get_init_keys(cls):
|
||||
return set(dict(inspect.signature(cls.__init__).parameters).keys())
|
||||
|
||||
@classmethod
|
||||
def extract_init_dict(cls, config_dict, **kwargs):
|
||||
# 1. Retrieve expected config attributes from __init__ signature
|
||||
expected_keys = cls._get_init_keys(cls)
|
||||
expected_keys = set(dict(inspect.signature(cls.__init__).parameters).keys())
|
||||
expected_keys.remove("self")
|
||||
# remove general kwargs if present in dict
|
||||
if "kwargs" in expected_keys:
|
||||
@@ -302,36 +292,9 @@ class ConfigMixin:
|
||||
for arg in cls._flax_internal_args:
|
||||
expected_keys.remove(arg)
|
||||
|
||||
# 2. Remove attributes that cannot be expected from expected config attributes
|
||||
# remove keys to be ignored
|
||||
if len(cls.ignore_for_config) > 0:
|
||||
expected_keys = expected_keys - set(cls.ignore_for_config)
|
||||
|
||||
# load diffusers library to import compatible and original scheduler
|
||||
diffusers_library = importlib.import_module(__name__.split(".")[0])
|
||||
|
||||
# remove attributes from compatible classes that orig cannot expect
|
||||
compatible_classes = [getattr(diffusers_library, c, None) for c in cls._compatible_classes]
|
||||
# filter out None potentially undefined dummy classes
|
||||
compatible_classes = [c for c in compatible_classes if c is not None]
|
||||
expected_keys_comp_cls = set()
|
||||
for c in compatible_classes:
|
||||
expected_keys_c = cls._get_init_keys(c)
|
||||
expected_keys_comp_cls = expected_keys_comp_cls.union(expected_keys_c)
|
||||
expected_keys_comp_cls = expected_keys_comp_cls - cls._get_init_keys(cls)
|
||||
config_dict = {k: v for k, v in config_dict.items() if k not in expected_keys_comp_cls}
|
||||
|
||||
# remove attributes from orig class that cannot be expected
|
||||
orig_cls_name = config_dict.pop("_class_name", cls.__name__)
|
||||
if orig_cls_name != cls.__name__ and hasattr(diffusers_library, orig_cls_name):
|
||||
orig_cls = getattr(diffusers_library, orig_cls_name)
|
||||
unexpected_keys_from_orig = cls._get_init_keys(orig_cls) - expected_keys
|
||||
config_dict = {k: v for k, v in config_dict.items() if k not in unexpected_keys_from_orig}
|
||||
|
||||
# remove private attributes
|
||||
config_dict = {k: v for k, v in config_dict.items() if not k.startswith("_")}
|
||||
|
||||
# 3. Create keyword arguments that will be passed to __init__ from expected keyword arguments
|
||||
init_dict = {}
|
||||
for key in expected_keys:
|
||||
if key in kwargs:
|
||||
@@ -341,7 +304,8 @@ class ConfigMixin:
|
||||
# use value from config dict
|
||||
init_dict[key] = config_dict.pop(key)
|
||||
|
||||
# 4. Give nice warning if unexpected values have been passed
|
||||
config_dict = {k: v for k, v in config_dict.items() if not k.startswith("_")}
|
||||
|
||||
if len(config_dict) > 0:
|
||||
logger.warning(
|
||||
f"The config attributes {config_dict} were passed to {cls.__name__}, "
|
||||
@@ -349,16 +313,14 @@ class ConfigMixin:
|
||||
f"{cls.config_name} configuration file."
|
||||
)
|
||||
|
||||
# 5. Give nice info if config attributes are initiliazed to default because they have not been passed
|
||||
unused_kwargs = {**config_dict, **kwargs}
|
||||
|
||||
passed_keys = set(init_dict.keys())
|
||||
if len(expected_keys - passed_keys) > 0:
|
||||
logger.info(
|
||||
f"{expected_keys - passed_keys} was not found in config. Values will be initialized to default values."
|
||||
)
|
||||
|
||||
# 6. Define unused keyword arguments
|
||||
unused_kwargs = {**config_dict, **kwargs}
|
||||
|
||||
return init_dict, unused_kwargs
|
||||
|
||||
@classmethod
|
||||
|
||||
@@ -13,11 +13,11 @@ deps = {
|
||||
"huggingface-hub": "huggingface-hub>=0.10.0",
|
||||
"importlib_metadata": "importlib_metadata",
|
||||
"isort": "isort>=5.5.4",
|
||||
"jax": "jax>=0.2.8,!=0.3.2",
|
||||
"jaxlib": "jaxlib>=0.1.65",
|
||||
"jax": "jax>=0.2.8,!=0.3.2,<=0.3.6",
|
||||
"jaxlib": "jaxlib>=0.1.65,<=0.3.6",
|
||||
"modelcards": "modelcards>=0.1.4",
|
||||
"numpy": "numpy",
|
||||
"parameterized": "parameterized",
|
||||
"onnxruntime": "onnxruntime",
|
||||
"pytest": "pytest",
|
||||
"pytest-timeout": "pytest-timeout",
|
||||
"pytest-xdist": "pytest-xdist",
|
||||
|
||||
@@ -168,11 +168,7 @@ def find_pipeline_class(loaded_module):
|
||||
|
||||
pipeline_class = None
|
||||
for cls_name, cls in cls_members.items():
|
||||
if (
|
||||
cls_name != DiffusionPipeline.__name__
|
||||
and issubclass(cls, DiffusionPipeline)
|
||||
and cls.__module__.split(".")[0] != "diffusers"
|
||||
):
|
||||
if cls_name != DiffusionPipeline.__name__ and issubclass(cls, DiffusionPipeline):
|
||||
if pipeline_class is not None:
|
||||
raise ValueError(
|
||||
f"Multiple classes that inherit from {DiffusionPipeline.__name__} have been found:"
|
||||
|
||||
@@ -16,25 +16,13 @@
|
||||
|
||||
import os
|
||||
import shutil
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Dict, Optional, Union
|
||||
from uuid import uuid4
|
||||
from typing import Optional
|
||||
|
||||
from huggingface_hub import HfFolder, Repository, whoami
|
||||
|
||||
from . import __version__
|
||||
from .utils import ENV_VARS_TRUE_VALUES, deprecate, logging
|
||||
from .utils.import_utils import (
|
||||
_flax_version,
|
||||
_jax_version,
|
||||
_onnxruntime_version,
|
||||
_torch_version,
|
||||
is_flax_available,
|
||||
is_modelcards_available,
|
||||
is_onnx_available,
|
||||
is_torch_available,
|
||||
)
|
||||
from .pipeline_utils import DiffusionPipeline
|
||||
from .utils import is_modelcards_available, logging
|
||||
|
||||
|
||||
if is_modelcards_available():
|
||||
@@ -45,32 +33,6 @@ logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
MODEL_CARD_TEMPLATE_PATH = Path(__file__).parent / "utils" / "model_card_template.md"
|
||||
SESSION_ID = uuid4().hex
|
||||
DISABLE_TELEMETRY = os.getenv("DISABLE_TELEMETRY", "").upper() in ENV_VARS_TRUE_VALUES
|
||||
|
||||
|
||||
def http_user_agent(user_agent: Union[Dict, str, None] = None) -> str:
|
||||
"""
|
||||
Formats a user-agent string with basic info about a request.
|
||||
"""
|
||||
ua = f"diffusers/{__version__}; python/{sys.version.split()[0]}; session_id/{SESSION_ID}"
|
||||
if DISABLE_TELEMETRY:
|
||||
return ua + "; telemetry/off"
|
||||
if is_torch_available():
|
||||
ua += f"; torch/{_torch_version}"
|
||||
if is_flax_available():
|
||||
ua += f"; jax/{_jax_version}"
|
||||
ua += f"; flax/{_flax_version}"
|
||||
if is_onnx_available():
|
||||
ua += f"; onnxruntime/{_onnxruntime_version}"
|
||||
# CI will set this value to True
|
||||
if os.environ.get("DIFFUSERS_IS_CI", "").upper() in ENV_VARS_TRUE_VALUES:
|
||||
ua += "; is_ci/true"
|
||||
if isinstance(user_agent, dict):
|
||||
ua += "; " + "; ".join(f"{k}/{v}" for k, v in user_agent.items())
|
||||
elif isinstance(user_agent, str):
|
||||
ua += "; " + user_agent
|
||||
return ua
|
||||
|
||||
|
||||
def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: Optional[str] = None):
|
||||
@@ -91,12 +53,6 @@ def init_git_repo(args, at_init: bool = False):
|
||||
Whether this function is called before any training or not. If `self.args.overwrite_output_dir` is `True`
|
||||
and `at_init` is `True`, the path to the repo (which is `self.args.output_dir`) might be wiped out.
|
||||
"""
|
||||
deprecation_message = (
|
||||
"Please use `huggingface_hub.Repository`. "
|
||||
"See `examples/unconditional_image_generation/train_unconditional.py` for an example."
|
||||
)
|
||||
deprecate("init_git_repo()", "0.10.0", deprecation_message)
|
||||
|
||||
if hasattr(args, "local_rank") and args.local_rank not in [-1, 0]:
|
||||
return
|
||||
hub_token = args.hub_token if hasattr(args, "hub_token") else None
|
||||
@@ -139,7 +95,7 @@ def init_git_repo(args, at_init: bool = False):
|
||||
|
||||
def push_to_hub(
|
||||
args,
|
||||
pipeline,
|
||||
pipeline: DiffusionPipeline,
|
||||
repo: Repository,
|
||||
commit_message: Optional[str] = "End of training",
|
||||
blocking: bool = True,
|
||||
@@ -158,11 +114,6 @@ def push_to_hub(
|
||||
The url of the commit of your model in the given repository if `blocking=False`, a tuple with the url of the
|
||||
commit and an object to track the progress of the commit if `blocking=True`
|
||||
"""
|
||||
deprecation_message = (
|
||||
"Please use `huggingface_hub.Repository` and `Repository.push_to_hub()`. "
|
||||
"See `examples/unconditional_image_generation/train_unconditional.py` for an example."
|
||||
)
|
||||
deprecate("push_to_hub()", "0.10.0", deprecation_message)
|
||||
|
||||
if not hasattr(args, "hub_model_id") or args.hub_model_id is None:
|
||||
model_name = Path(args.output_dir).name
|
||||
|
||||
@@ -105,14 +105,14 @@ class FlaxModelMixin:
|
||||
>>> from diffusers import FlaxUNet2DConditionModel
|
||||
|
||||
>>> # load model
|
||||
>>> model, params = FlaxUNet2DConditionModel.from_pretrained("runwayml/stable-diffusion-v1-5")
|
||||
>>> model, params = FlaxUNet2DConditionModel.from_pretrained("CompVis/stable-diffusion-v1-4")
|
||||
>>> # By default, the model parameters will be in fp32 precision, to cast these to bfloat16 precision
|
||||
>>> params = model.to_bf16(params)
|
||||
>>> # If you don't want to cast certain parameters (for example layer norm bias and scale)
|
||||
>>> # then pass the mask as follows
|
||||
>>> from flax import traverse_util
|
||||
|
||||
>>> model, params = FlaxUNet2DConditionModel.from_pretrained("runwayml/stable-diffusion-v1-5")
|
||||
>>> model, params = FlaxUNet2DConditionModel.from_pretrained("CompVis/stable-diffusion-v1-4")
|
||||
>>> flat_params = traverse_util.flatten_dict(params)
|
||||
>>> mask = {
|
||||
... path: (path[-2] != ("LayerNorm", "bias") and path[-2:] != ("LayerNorm", "scale"))
|
||||
@@ -141,7 +141,7 @@ class FlaxModelMixin:
|
||||
>>> from diffusers import FlaxUNet2DConditionModel
|
||||
|
||||
>>> # Download model and configuration from huggingface.co
|
||||
>>> model, params = FlaxUNet2DConditionModel.from_pretrained("runwayml/stable-diffusion-v1-5")
|
||||
>>> model, params = FlaxUNet2DConditionModel.from_pretrained("CompVis/stable-diffusion-v1-4")
|
||||
>>> # By default, the model params will be in fp32, to illustrate the use of this method,
|
||||
>>> # we'll first cast to fp16 and back to fp32
|
||||
>>> params = model.to_f16(params)
|
||||
@@ -171,14 +171,14 @@ class FlaxModelMixin:
|
||||
>>> from diffusers import FlaxUNet2DConditionModel
|
||||
|
||||
>>> # load model
|
||||
>>> model, params = FlaxUNet2DConditionModel.from_pretrained("runwayml/stable-diffusion-v1-5")
|
||||
>>> model, params = FlaxUNet2DConditionModel.from_pretrained("CompVis/stable-diffusion-v1-4")
|
||||
>>> # By default, the model params will be in fp32, to cast these to float16
|
||||
>>> params = model.to_fp16(params)
|
||||
>>> # If you want don't want to cast certain parameters (for example layer norm bias and scale)
|
||||
>>> # then pass the mask as follows
|
||||
>>> from flax import traverse_util
|
||||
|
||||
>>> model, params = FlaxUNet2DConditionModel.from_pretrained("runwayml/stable-diffusion-v1-5")
|
||||
>>> model, params = FlaxUNet2DConditionModel.from_pretrained("CompVis/stable-diffusion-v1-4")
|
||||
>>> flat_params = traverse_util.flatten_dict(params)
|
||||
>>> mask = {
|
||||
... path: (path[-2] != ("LayerNorm", "bias") and path[-2:] != ("LayerNorm", "scale"))
|
||||
@@ -216,7 +216,7 @@ class FlaxModelMixin:
|
||||
|
||||
- A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
|
||||
Valid model ids are namespaced under a user or organization name, like
|
||||
`runwayml/stable-diffusion-v1-5`.
|
||||
`CompVis/stable-diffusion-v1-4`.
|
||||
- A path to a *directory* containing model weights saved using [`~ModelMixin.save_pretrained`],
|
||||
e.g., `./my_model_directory/`.
|
||||
dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`):
|
||||
@@ -273,7 +273,7 @@ class FlaxModelMixin:
|
||||
>>> from diffusers import FlaxUNet2DConditionModel
|
||||
|
||||
>>> # Download model and configuration from huggingface.co and cache.
|
||||
>>> model, params = FlaxUNet2DConditionModel.from_pretrained("runwayml/stable-diffusion-v1-5")
|
||||
>>> model, params = FlaxUNet2DConditionModel.from_pretrained("CompVis/stable-diffusion-v1-4")
|
||||
>>> # Model was saved using *save_pretrained('./test/saved_model/')* (for example purposes, not runnable).
|
||||
>>> model, params = FlaxUNet2DConditionModel.from_pretrained("./test/saved_model/")
|
||||
```"""
|
||||
@@ -482,6 +482,29 @@ class FlaxModelMixin:
|
||||
" training."
|
||||
)
|
||||
|
||||
# dictionary of key: dtypes for the model params
|
||||
param_dtypes = jax.tree_map(lambda x: x.dtype, state)
|
||||
# extract keys of parameters not in jnp.float32
|
||||
fp16_params = [k for k in param_dtypes if param_dtypes[k] == jnp.float16]
|
||||
bf16_params = [k for k in param_dtypes if param_dtypes[k] == jnp.bfloat16]
|
||||
|
||||
# raise a warning if any of the parameters are not in jnp.float32
|
||||
if len(fp16_params) > 0:
|
||||
logger.warning(
|
||||
f"Some of the weights of {model.__class__.__name__} were initialized in float16 precision from "
|
||||
f"the model checkpoint at {pretrained_model_name_or_path}:\n{fp16_params}\n"
|
||||
"You should probably UPCAST the model weights to float32 if this was not intended. "
|
||||
"See [`~ModelMixin.to_fp32`] for further information on how to do this."
|
||||
)
|
||||
|
||||
if len(bf16_params) > 0:
|
||||
logger.warning(
|
||||
f"Some of the weights of {model.__class__.__name__} were initialized in bfloat16 precision from "
|
||||
f"the model checkpoint at {pretrained_model_name_or_path}:\n{bf16_params}\n"
|
||||
"You should probably UPCAST the model weights to float32 if this was not intended. "
|
||||
"See [`~ModelMixin.to_fp32`] for further information on how to do this."
|
||||
)
|
||||
|
||||
return model, unflatten_dict(state)
|
||||
|
||||
def save_pretrained(
|
||||
|
||||
@@ -21,9 +21,7 @@ from typing import Callable, List, Optional, Tuple, Union
|
||||
import torch
|
||||
from torch import Tensor, device
|
||||
|
||||
import accelerate
|
||||
from accelerate.utils import set_module_tensor_to_device
|
||||
from accelerate.utils.versions import is_torch_version
|
||||
from diffusers.utils import is_accelerate_available
|
||||
from huggingface_hub import hf_hub_download
|
||||
from huggingface_hub.utils import EntryNotFoundError, RepositoryNotFoundError, RevisionNotFoundError
|
||||
from requests import HTTPError
|
||||
@@ -35,12 +33,6 @@ from .utils import CONFIG_NAME, DIFFUSERS_CACHE, HUGGINGFACE_CO_RESOLVE_ENDPOINT
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
if is_torch_version(">=", "1.9.0"):
|
||||
_LOW_CPU_MEM_USAGE_DEFAULT = True
|
||||
else:
|
||||
_LOW_CPU_MEM_USAGE_DEFAULT = False
|
||||
|
||||
|
||||
def get_parameter_device(parameter: torch.nn.Module):
|
||||
try:
|
||||
return next(parameter.parameters()).device
|
||||
@@ -276,19 +268,6 @@ class ModelMixin(torch.nn.Module):
|
||||
Mirror source to accelerate downloads in China. If you are from China and have an accessibility
|
||||
problem, you can set this option to resolve it. Note that we do not guarantee the timeliness or safety.
|
||||
Please refer to the mirror site for more information.
|
||||
device_map (`str` or `Dict[str, Union[int, str, torch.device]]`, *optional*):
|
||||
A map that specifies where each submodule should go. It doesn't need to be refined to each
|
||||
parameter/buffer name, once a given module name is inside, every submodule of it will be sent to the
|
||||
same device.
|
||||
|
||||
To have Accelerate compute the most optimized `device_map` automatically, set `device_map="auto"`. For
|
||||
more information about each option see [designing a device
|
||||
map](https://hf.co/docs/accelerate/main/en/usage_guides/big_modeling#designing-a-device-map).
|
||||
low_cpu_mem_usage (`bool`, *optional*, defaults to `True` if torch version >= 1.9.0 else `False`):
|
||||
Speed up model loading by not initializing the weights and only loading the pre-trained weights. This
|
||||
also tries to not use more than 1x model size in CPU memory (including peak memory) while loading the
|
||||
model. This is only supported when torch version >= 1.9.0. If you are using an older version of torch,
|
||||
setting this argument to `True` will raise an error.
|
||||
|
||||
<Tip>
|
||||
|
||||
@@ -317,26 +296,6 @@ class ModelMixin(torch.nn.Module):
|
||||
torch_dtype = kwargs.pop("torch_dtype", None)
|
||||
subfolder = kwargs.pop("subfolder", None)
|
||||
device_map = kwargs.pop("device_map", None)
|
||||
low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", _LOW_CPU_MEM_USAGE_DEFAULT)
|
||||
|
||||
# Check if we can handle device_map and dispatching the weights
|
||||
if device_map is not None and not is_torch_version(">=", "1.9.0"):
|
||||
raise NotImplementedError(
|
||||
"Loading and dispatching requires torch >= 1.9.0. Please either update your PyTorch version or set"
|
||||
" `device_map=None`."
|
||||
)
|
||||
|
||||
if low_cpu_mem_usage is True and not is_torch_version(">=", "1.9.0"):
|
||||
raise NotImplementedError(
|
||||
"Low memory initialization requires torch >= 1.9.0. Please either update your PyTorch version or set"
|
||||
" `low_cpu_mem_usage=False`."
|
||||
)
|
||||
|
||||
if low_cpu_mem_usage is False and device_map is not None:
|
||||
raise ValueError(
|
||||
f"You cannot set `low_cpu_mem_usage` to `False` while using device_map={device_map} for loading and"
|
||||
" dispatching. Please make sure to set `low_cpu_mem_usage=True`."
|
||||
)
|
||||
|
||||
user_agent = {
|
||||
"diffusers": __version__,
|
||||
@@ -419,8 +378,12 @@ class ModelMixin(torch.nn.Module):
|
||||
|
||||
# restore default dtype
|
||||
|
||||
if low_cpu_mem_usage:
|
||||
# Instantiate model with empty weights
|
||||
if device_map == "auto":
|
||||
if is_accelerate_available():
|
||||
import accelerate
|
||||
else:
|
||||
raise ImportError("Please install accelerate via `pip install accelerate`")
|
||||
|
||||
with accelerate.init_empty_weights():
|
||||
model, unused_kwargs = cls.from_config(
|
||||
config_path,
|
||||
@@ -437,17 +400,7 @@ class ModelMixin(torch.nn.Module):
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
# if device_map is Non,e load the state dict on move the params from meta device to the cpu
|
||||
if device_map is None:
|
||||
param_device = "cpu"
|
||||
state_dict = load_state_dict(model_file)
|
||||
# move the parms from meta device to cpu
|
||||
for param_name, param in state_dict.items():
|
||||
set_module_tensor_to_device(model, param_name, param_device, value=param)
|
||||
else: # else let accelerate handle loading and dispatching.
|
||||
# Load weights and dispatch according to the device_map
|
||||
# by deafult the device_map is None and the weights are loaded on the CPU
|
||||
accelerate.load_checkpoint_and_dispatch(model, model_file, device_map)
|
||||
accelerate.load_checkpoint_and_dispatch(model, model_file, device_map)
|
||||
|
||||
loading_info = {
|
||||
"missing_keys": [],
|
||||
|
||||
@@ -1,3 +1,11 @@
|
||||
# Models
|
||||
|
||||
For more detail on the models, please refer to the [docs](https://huggingface.co/docs/diffusers/api/models).
|
||||
- Models: Neural network that models $p_\theta(\mathbf{x}_{t-1}|\mathbf{x}_t)$ (see image below) and is trained end-to-end to denoise a noisy input to an image. Examples: UNet, Conditioned UNet, 3D UNet, Transformer UNet
|
||||
|
||||
## API
|
||||
|
||||
TODO(Suraj, Patrick)
|
||||
|
||||
## Examples
|
||||
|
||||
TODO(Suraj, Patrick)
|
||||
|
||||
@@ -16,8 +16,6 @@ from ..utils import is_flax_available, is_torch_available
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
from .attention import Transformer2DModel
|
||||
from .unet_1d import UNet1DModel
|
||||
from .unet_2d import UNet2DModel
|
||||
from .unet_2d_condition import UNet2DConditionModel
|
||||
from .vae import AutoencoderKL, VQModel
|
||||
|
||||
@@ -12,218 +12,12 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import math
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional
|
||||
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
from torch import nn
|
||||
|
||||
from ..configuration_utils import ConfigMixin, register_to_config
|
||||
from ..modeling_utils import ModelMixin
|
||||
from ..models.embeddings import ImagePositionalEmbeddings
|
||||
from ..utils import BaseOutput
|
||||
from ..utils.import_utils import is_xformers_available
|
||||
|
||||
|
||||
@dataclass
|
||||
class Transformer2DModelOutput(BaseOutput):
|
||||
"""
|
||||
Args:
|
||||
sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` or `(batch size, num_vector_embeds - 1, num_latent_pixels)` if [`Transformer2DModel`] is discrete):
|
||||
Hidden states conditioned on `encoder_hidden_states` input. If discrete, returns probability distributions
|
||||
for the unnoised latent pixels.
|
||||
"""
|
||||
|
||||
sample: torch.FloatTensor
|
||||
|
||||
|
||||
if is_xformers_available():
|
||||
import xformers
|
||||
import xformers.ops
|
||||
else:
|
||||
xformers = None
|
||||
|
||||
|
||||
class Transformer2DModel(ModelMixin, ConfigMixin):
|
||||
"""
|
||||
Transformer model for image-like data. Takes either discrete (classes of vector embeddings) or continuous (actual
|
||||
embeddings) inputs.
|
||||
|
||||
When input is continuous: First, project the input (aka embedding) and reshape to b, t, d. Then apply standard
|
||||
transformer action. Finally, reshape to image.
|
||||
|
||||
When input is discrete: First, input (classes of latent pixels) is converted to embeddings and has positional
|
||||
embeddings applied, see `ImagePositionalEmbeddings`. Then apply standard transformer action. Finally, predict
|
||||
classes of unnoised image.
|
||||
|
||||
Note that it is assumed one of the input classes is the masked latent pixel. The predicted classes of the unnoised
|
||||
image do not contain a prediction for the masked pixel as the unnoised image cannot be masked.
|
||||
|
||||
Parameters:
|
||||
num_attention_heads (`int`, *optional*, defaults to 16): The number of heads to use for multi-head attention.
|
||||
attention_head_dim (`int`, *optional*, defaults to 88): The number of channels in each head.
|
||||
in_channels (`int`, *optional*):
|
||||
Pass if the input is continuous. The number of channels in the input and output.
|
||||
num_layers (`int`, *optional*, defaults to 1): The number of layers of Transformer blocks to use.
|
||||
dropout (`float`, *optional*, defaults to 0.1): The dropout probability to use.
|
||||
cross_attention_dim (`int`, *optional*): The number of context dimensions to use.
|
||||
sample_size (`int`, *optional*): Pass if the input is discrete. The width of the latent images.
|
||||
Note that this is fixed at training time as it is used for learning a number of position embeddings. See
|
||||
`ImagePositionalEmbeddings`.
|
||||
num_vector_embeds (`int`, *optional*):
|
||||
Pass if the input is discrete. The number of classes of the vector embeddings of the latent pixels.
|
||||
Includes the class for the masked latent pixel.
|
||||
activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
|
||||
num_embeds_ada_norm ( `int`, *optional*): Pass if at least one of the norm_layers is `AdaLayerNorm`.
|
||||
The number of diffusion steps used during training. Note that this is fixed at training time as it is used
|
||||
to learn a number of embeddings that are added to the hidden states. During inference, you can denoise for
|
||||
up to but not more than steps than `num_embeds_ada_norm`.
|
||||
attention_bias (`bool`, *optional*):
|
||||
Configure if the TransformerBlocks' attention should contain a bias parameter.
|
||||
"""
|
||||
|
||||
@register_to_config
|
||||
def __init__(
|
||||
self,
|
||||
num_attention_heads: int = 16,
|
||||
attention_head_dim: int = 88,
|
||||
in_channels: Optional[int] = None,
|
||||
num_layers: int = 1,
|
||||
dropout: float = 0.0,
|
||||
norm_num_groups: int = 32,
|
||||
cross_attention_dim: Optional[int] = None,
|
||||
attention_bias: bool = False,
|
||||
sample_size: Optional[int] = None,
|
||||
num_vector_embeds: Optional[int] = None,
|
||||
activation_fn: str = "geglu",
|
||||
num_embeds_ada_norm: Optional[int] = None,
|
||||
):
|
||||
super().__init__()
|
||||
self.num_attention_heads = num_attention_heads
|
||||
self.attention_head_dim = attention_head_dim
|
||||
inner_dim = num_attention_heads * attention_head_dim
|
||||
|
||||
# 1. Transformer2DModel can process both standard continous images of shape `(batch_size, num_channels, width, height)` as well as quantized image embeddings of shape `(batch_size, num_image_vectors)`
|
||||
# Define whether input is continuous or discrete depending on configuration
|
||||
self.is_input_continuous = in_channels is not None
|
||||
self.is_input_vectorized = num_vector_embeds is not None
|
||||
|
||||
if self.is_input_continuous and self.is_input_vectorized:
|
||||
raise ValueError(
|
||||
f"Cannot define both `in_channels`: {in_channels} and `num_vector_embeds`: {num_vector_embeds}. Make"
|
||||
" sure that either `in_channels` or `num_vector_embeds` is None."
|
||||
)
|
||||
elif not self.is_input_continuous and not self.is_input_vectorized:
|
||||
raise ValueError(
|
||||
f"Has to define either `in_channels`: {in_channels} or `num_vector_embeds`: {num_vector_embeds}. Make"
|
||||
" sure that either `in_channels` or `num_vector_embeds` is not None."
|
||||
)
|
||||
|
||||
# 2. Define input layers
|
||||
if self.is_input_continuous:
|
||||
self.in_channels = in_channels
|
||||
|
||||
self.norm = torch.nn.GroupNorm(num_groups=norm_num_groups, num_channels=in_channels, eps=1e-6, affine=True)
|
||||
self.proj_in = nn.Conv2d(in_channels, inner_dim, kernel_size=1, stride=1, padding=0)
|
||||
elif self.is_input_vectorized:
|
||||
assert sample_size is not None, "Transformer2DModel over discrete input must provide sample_size"
|
||||
assert num_vector_embeds is not None, "Transformer2DModel over discrete input must provide num_embed"
|
||||
|
||||
self.height = sample_size
|
||||
self.width = sample_size
|
||||
self.num_vector_embeds = num_vector_embeds
|
||||
self.num_latent_pixels = self.height * self.width
|
||||
|
||||
self.latent_image_embedding = ImagePositionalEmbeddings(
|
||||
num_embed=num_vector_embeds, embed_dim=inner_dim, height=self.height, width=self.width
|
||||
)
|
||||
|
||||
# 3. Define transformers blocks
|
||||
self.transformer_blocks = nn.ModuleList(
|
||||
[
|
||||
BasicTransformerBlock(
|
||||
inner_dim,
|
||||
num_attention_heads,
|
||||
attention_head_dim,
|
||||
dropout=dropout,
|
||||
cross_attention_dim=cross_attention_dim,
|
||||
activation_fn=activation_fn,
|
||||
num_embeds_ada_norm=num_embeds_ada_norm,
|
||||
attention_bias=attention_bias,
|
||||
)
|
||||
for d in range(num_layers)
|
||||
]
|
||||
)
|
||||
|
||||
# 4. Define output layers
|
||||
if self.is_input_continuous:
|
||||
self.proj_out = nn.Conv2d(inner_dim, in_channels, kernel_size=1, stride=1, padding=0)
|
||||
elif self.is_input_vectorized:
|
||||
self.norm_out = nn.LayerNorm(inner_dim)
|
||||
self.out = nn.Linear(inner_dim, self.num_vector_embeds - 1)
|
||||
|
||||
def _set_attention_slice(self, slice_size):
|
||||
for block in self.transformer_blocks:
|
||||
block._set_attention_slice(slice_size)
|
||||
|
||||
def forward(self, hidden_states, encoder_hidden_states=None, timestep=None, return_dict: bool = True):
|
||||
"""
|
||||
Args:
|
||||
hidden_states ( When discrete, `torch.LongTensor` of shape `(batch size, num latent pixels)`.
|
||||
When continous, `torch.FloatTensor` of shape `(batch size, channel, height, width)`): Input
|
||||
hidden_states
|
||||
encoder_hidden_states ( `torch.LongTensor` of shape `(batch size, context dim)`, *optional*):
|
||||
Conditional embeddings for cross attention layer. If not given, cross-attention defaults to
|
||||
self-attention.
|
||||
timestep ( `torch.long`, *optional*):
|
||||
Optional timestep to be applied as an embedding in AdaLayerNorm's. Used to indicate denoising step.
|
||||
return_dict (`bool`, *optional*, defaults to `True`):
|
||||
Whether or not to return a [`models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain tuple.
|
||||
|
||||
Returns:
|
||||
[`~models.attention.Transformer2DModelOutput`] or `tuple`: [`~models.attention.Transformer2DModelOutput`]
|
||||
if `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is the sample
|
||||
tensor.
|
||||
"""
|
||||
# 1. Input
|
||||
if self.is_input_continuous:
|
||||
batch, channel, height, weight = hidden_states.shape
|
||||
residual = hidden_states
|
||||
hidden_states = self.norm(hidden_states)
|
||||
hidden_states = self.proj_in(hidden_states)
|
||||
inner_dim = hidden_states.shape[1]
|
||||
hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * weight, inner_dim)
|
||||
elif self.is_input_vectorized:
|
||||
hidden_states = self.latent_image_embedding(hidden_states)
|
||||
|
||||
# 2. Blocks
|
||||
for block in self.transformer_blocks:
|
||||
hidden_states = block(hidden_states, context=encoder_hidden_states, timestep=timestep)
|
||||
|
||||
# 3. Output
|
||||
if self.is_input_continuous:
|
||||
hidden_states = hidden_states.reshape(batch, height, weight, inner_dim).permute(0, 3, 1, 2)
|
||||
hidden_states = self.proj_out(hidden_states)
|
||||
output = hidden_states + residual
|
||||
elif self.is_input_vectorized:
|
||||
hidden_states = self.norm_out(hidden_states)
|
||||
logits = self.out(hidden_states)
|
||||
# (batch, self.num_vector_embeds - 1, self.num_latent_pixels)
|
||||
logits = logits.permute(0, 2, 1)
|
||||
|
||||
# log(p(x_0))
|
||||
output = F.log_softmax(logits.double(), dim=1).float()
|
||||
|
||||
if not return_dict:
|
||||
return (output,)
|
||||
|
||||
return Transformer2DModelOutput(sample=output)
|
||||
|
||||
def _set_use_memory_efficient_attention_xformers(self, use_memory_efficient_attention_xformers: bool):
|
||||
for block in self.transformer_blocks:
|
||||
block._set_use_memory_efficient_attention_xformers(use_memory_efficient_attention_xformers)
|
||||
|
||||
|
||||
class AttentionBlock(nn.Module):
|
||||
"""
|
||||
@@ -233,19 +27,19 @@ class AttentionBlock(nn.Module):
|
||||
Uses three q, k, v linear layers to compute attention.
|
||||
|
||||
Parameters:
|
||||
channels (`int`): The number of channels in the input and output.
|
||||
num_head_channels (`int`, *optional*):
|
||||
channels (:obj:`int`): The number of channels in the input and output.
|
||||
num_head_channels (:obj:`int`, *optional*):
|
||||
The number of channels in each head. If None, then `num_heads` = 1.
|
||||
norm_num_groups (`int`, *optional*, defaults to 32): The number of groups to use for group norm.
|
||||
rescale_output_factor (`float`, *optional*, defaults to 1.0): The factor to rescale the output by.
|
||||
eps (`float`, *optional*, defaults to 1e-5): The epsilon value to use for group norm.
|
||||
num_groups (:obj:`int`, *optional*, defaults to 32): The number of groups to use for group norm.
|
||||
rescale_output_factor (:obj:`float`, *optional*, defaults to 1.0): The factor to rescale the output by.
|
||||
eps (:obj:`float`, *optional*, defaults to 1e-5): The epsilon value to use for group norm.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
channels: int,
|
||||
num_head_channels: Optional[int] = None,
|
||||
norm_num_groups: int = 32,
|
||||
num_groups: int = 32,
|
||||
rescale_output_factor: float = 1.0,
|
||||
eps: float = 1e-5,
|
||||
):
|
||||
@@ -254,7 +48,7 @@ class AttentionBlock(nn.Module):
|
||||
|
||||
self.num_heads = channels // num_head_channels if num_head_channels is not None else 1
|
||||
self.num_head_size = num_head_channels
|
||||
self.group_norm = nn.GroupNorm(num_channels=channels, num_groups=norm_num_groups, eps=eps, affine=True)
|
||||
self.group_norm = nn.GroupNorm(num_channels=channels, num_groups=num_groups, eps=eps, affine=True)
|
||||
|
||||
# define q,k,v as linear layers
|
||||
self.query = nn.Linear(channels, channels)
|
||||
@@ -310,108 +104,113 @@ class AttentionBlock(nn.Module):
|
||||
return hidden_states
|
||||
|
||||
|
||||
class SpatialTransformer(nn.Module):
|
||||
"""
|
||||
Transformer block for image-like data. First, project the input (aka embedding) and reshape to b, t, d. Then apply
|
||||
standard transformer action. Finally, reshape to image.
|
||||
|
||||
Parameters:
|
||||
in_channels (:obj:`int`): The number of channels in the input and output.
|
||||
n_heads (:obj:`int`): The number of heads to use for multi-head attention.
|
||||
d_head (:obj:`int`): The number of channels in each head.
|
||||
depth (:obj:`int`, *optional*, defaults to 1): The number of layers of Transformer blocks to use.
|
||||
dropout (:obj:`float`, *optional*, defaults to 0.1): The dropout probability to use.
|
||||
context_dim (:obj:`int`, *optional*): The number of context dimensions to use.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
in_channels: int,
|
||||
n_heads: int,
|
||||
d_head: int,
|
||||
depth: int = 1,
|
||||
dropout: float = 0.0,
|
||||
num_groups: int = 32,
|
||||
context_dim: Optional[int] = None,
|
||||
):
|
||||
super().__init__()
|
||||
self.n_heads = n_heads
|
||||
self.d_head = d_head
|
||||
self.in_channels = in_channels
|
||||
inner_dim = n_heads * d_head
|
||||
self.norm = torch.nn.GroupNorm(num_groups=num_groups, num_channels=in_channels, eps=1e-6, affine=True)
|
||||
|
||||
self.proj_in = nn.Conv2d(in_channels, inner_dim, kernel_size=1, stride=1, padding=0)
|
||||
|
||||
self.transformer_blocks = nn.ModuleList(
|
||||
[
|
||||
BasicTransformerBlock(inner_dim, n_heads, d_head, dropout=dropout, context_dim=context_dim)
|
||||
for d in range(depth)
|
||||
]
|
||||
)
|
||||
|
||||
self.proj_out = nn.Conv2d(inner_dim, in_channels, kernel_size=1, stride=1, padding=0)
|
||||
|
||||
def _set_attention_slice(self, slice_size):
|
||||
for block in self.transformer_blocks:
|
||||
block._set_attention_slice(slice_size)
|
||||
|
||||
def forward(self, hidden_states, context=None):
|
||||
# note: if no context is given, cross-attention defaults to self-attention
|
||||
batch, channel, height, weight = hidden_states.shape
|
||||
residual = hidden_states
|
||||
hidden_states = self.norm(hidden_states)
|
||||
hidden_states = self.proj_in(hidden_states)
|
||||
inner_dim = hidden_states.shape[1]
|
||||
hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * weight, inner_dim)
|
||||
for block in self.transformer_blocks:
|
||||
hidden_states = block(hidden_states, context=context)
|
||||
hidden_states = hidden_states.reshape(batch, height, weight, inner_dim).permute(0, 3, 1, 2)
|
||||
hidden_states = self.proj_out(hidden_states)
|
||||
return hidden_states + residual
|
||||
|
||||
|
||||
class BasicTransformerBlock(nn.Module):
|
||||
r"""
|
||||
A basic Transformer block.
|
||||
|
||||
Parameters:
|
||||
dim (`int`): The number of channels in the input and output.
|
||||
num_attention_heads (`int`): The number of heads to use for multi-head attention.
|
||||
attention_head_dim (`int`): The number of channels in each head.
|
||||
dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
|
||||
cross_attention_dim (`int`, *optional*): The size of the context vector for cross attention.
|
||||
activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
|
||||
num_embeds_ada_norm (:
|
||||
obj: `int`, *optional*): The number of diffusion steps used during training. See `Transformer2DModel`.
|
||||
attention_bias (:
|
||||
obj: `bool`, *optional*, defaults to `False`): Configure if the attentions should contain a bias parameter.
|
||||
dim (:obj:`int`): The number of channels in the input and output.
|
||||
n_heads (:obj:`int`): The number of heads to use for multi-head attention.
|
||||
d_head (:obj:`int`): The number of channels in each head.
|
||||
dropout (:obj:`float`, *optional*, defaults to 0.0): The dropout probability to use.
|
||||
context_dim (:obj:`int`, *optional*): The size of the context vector for cross attention.
|
||||
gated_ff (:obj:`bool`, *optional*, defaults to :obj:`False`): Whether to use a gated feed-forward network.
|
||||
checkpoint (:obj:`bool`, *optional*, defaults to :obj:`False`): Whether to use checkpointing.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
dim: int,
|
||||
num_attention_heads: int,
|
||||
attention_head_dim: int,
|
||||
n_heads: int,
|
||||
d_head: int,
|
||||
dropout=0.0,
|
||||
cross_attention_dim: Optional[int] = None,
|
||||
activation_fn: str = "geglu",
|
||||
num_embeds_ada_norm: Optional[int] = None,
|
||||
attention_bias: bool = False,
|
||||
context_dim: Optional[int] = None,
|
||||
gated_ff: bool = True,
|
||||
checkpoint: bool = True,
|
||||
):
|
||||
super().__init__()
|
||||
self.attn1 = CrossAttention(
|
||||
query_dim=dim,
|
||||
heads=num_attention_heads,
|
||||
dim_head=attention_head_dim,
|
||||
dropout=dropout,
|
||||
bias=attention_bias,
|
||||
query_dim=dim, heads=n_heads, dim_head=d_head, dropout=dropout
|
||||
) # is a self-attention
|
||||
self.ff = FeedForward(dim, dropout=dropout, activation_fn=activation_fn)
|
||||
self.ff = FeedForward(dim, dropout=dropout, glu=gated_ff)
|
||||
self.attn2 = CrossAttention(
|
||||
query_dim=dim,
|
||||
cross_attention_dim=cross_attention_dim,
|
||||
heads=num_attention_heads,
|
||||
dim_head=attention_head_dim,
|
||||
dropout=dropout,
|
||||
bias=attention_bias,
|
||||
query_dim=dim, context_dim=context_dim, heads=n_heads, dim_head=d_head, dropout=dropout
|
||||
) # is self-attn if context is none
|
||||
|
||||
# layer norms
|
||||
self.use_ada_layer_norm = num_embeds_ada_norm is not None
|
||||
if self.use_ada_layer_norm:
|
||||
self.norm1 = AdaLayerNorm(dim, num_embeds_ada_norm)
|
||||
self.norm2 = AdaLayerNorm(dim, num_embeds_ada_norm)
|
||||
else:
|
||||
self.norm1 = nn.LayerNorm(dim)
|
||||
self.norm2 = nn.LayerNorm(dim)
|
||||
self.norm1 = nn.LayerNorm(dim)
|
||||
self.norm2 = nn.LayerNorm(dim)
|
||||
self.norm3 = nn.LayerNorm(dim)
|
||||
self.checkpoint = checkpoint
|
||||
|
||||
def _set_attention_slice(self, slice_size):
|
||||
self.attn1._slice_size = slice_size
|
||||
self.attn2._slice_size = slice_size
|
||||
|
||||
def _set_use_memory_efficient_attention_xformers(self, use_memory_efficient_attention_xformers: bool):
|
||||
if not is_xformers_available():
|
||||
print("Here is how to install it")
|
||||
raise ModuleNotFoundError(
|
||||
"Refer to https://github.com/facebookresearch/xformers for more information on how to install"
|
||||
" xformers",
|
||||
name="xformers",
|
||||
)
|
||||
elif not torch.cuda.is_available():
|
||||
raise ValueError(
|
||||
"torch.cuda.is_available() should be True but is False. xformers' memory efficient attention is only"
|
||||
" available for GPU "
|
||||
)
|
||||
else:
|
||||
try:
|
||||
# Make sure we can run the memory efficient attention
|
||||
_ = xformers.ops.memory_efficient_attention(
|
||||
torch.randn((1, 2, 40), device="cuda"),
|
||||
torch.randn((1, 2, 40), device="cuda"),
|
||||
torch.randn((1, 2, 40), device="cuda"),
|
||||
)
|
||||
except Exception as e:
|
||||
raise e
|
||||
self.attn1._use_memory_efficient_attention_xformers = use_memory_efficient_attention_xformers
|
||||
self.attn2._use_memory_efficient_attention_xformers = use_memory_efficient_attention_xformers
|
||||
|
||||
def forward(self, hidden_states, context=None, timestep=None):
|
||||
# 1. Self-Attention
|
||||
norm_hidden_states = (
|
||||
self.norm1(hidden_states, timestep) if self.use_ada_layer_norm else self.norm1(hidden_states)
|
||||
)
|
||||
hidden_states = self.attn1(norm_hidden_states) + hidden_states
|
||||
|
||||
# 2. Cross-Attention
|
||||
norm_hidden_states = (
|
||||
self.norm2(hidden_states, timestep) if self.use_ada_layer_norm else self.norm2(hidden_states)
|
||||
)
|
||||
hidden_states = self.attn2(norm_hidden_states, context=context) + hidden_states
|
||||
|
||||
# 3. Feed-forward
|
||||
def forward(self, hidden_states, context=None):
|
||||
hidden_states = hidden_states.contiguous() if hidden_states.device.type == "mps" else hidden_states
|
||||
hidden_states = self.attn1(self.norm1(hidden_states)) + hidden_states
|
||||
hidden_states = self.attn2(self.norm2(hidden_states), context=context) + hidden_states
|
||||
hidden_states = self.ff(self.norm3(hidden_states)) + hidden_states
|
||||
|
||||
return hidden_states
|
||||
|
||||
|
||||
@@ -420,28 +219,20 @@ class CrossAttention(nn.Module):
|
||||
A cross attention layer.
|
||||
|
||||
Parameters:
|
||||
query_dim (`int`): The number of channels in the query.
|
||||
cross_attention_dim (`int`, *optional*):
|
||||
query_dim (:obj:`int`): The number of channels in the query.
|
||||
context_dim (:obj:`int`, *optional*):
|
||||
The number of channels in the context. If not given, defaults to `query_dim`.
|
||||
heads (`int`, *optional*, defaults to 8): The number of heads to use for multi-head attention.
|
||||
dim_head (`int`, *optional*, defaults to 64): The number of channels in each head.
|
||||
dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
|
||||
bias (`bool`, *optional*, defaults to False):
|
||||
Set to `True` for the query, key, and value linear layers to contain a bias parameter.
|
||||
heads (:obj:`int`, *optional*, defaults to 8): The number of heads to use for multi-head attention.
|
||||
dim_head (:obj:`int`, *optional*, defaults to 64): The number of channels in each head.
|
||||
dropout (:obj:`float`, *optional*, defaults to 0.0): The dropout probability to use.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
query_dim: int,
|
||||
cross_attention_dim: Optional[int] = None,
|
||||
heads: int = 8,
|
||||
dim_head: int = 64,
|
||||
dropout: float = 0.0,
|
||||
bias=False,
|
||||
self, query_dim: int, context_dim: Optional[int] = None, heads: int = 8, dim_head: int = 64, dropout: int = 0.0
|
||||
):
|
||||
super().__init__()
|
||||
inner_dim = dim_head * heads
|
||||
cross_attention_dim = cross_attention_dim if cross_attention_dim is not None else query_dim
|
||||
context_dim = context_dim if context_dim is not None else query_dim
|
||||
|
||||
self.scale = dim_head**-0.5
|
||||
self.heads = heads
|
||||
@@ -449,15 +240,12 @@ class CrossAttention(nn.Module):
|
||||
# is split across the batch axis to save memory
|
||||
# You can set slice_size with `set_attention_slice`
|
||||
self._slice_size = None
|
||||
self._use_memory_efficient_attention_xformers = False
|
||||
|
||||
self.to_q = nn.Linear(query_dim, inner_dim, bias=bias)
|
||||
self.to_k = nn.Linear(cross_attention_dim, inner_dim, bias=bias)
|
||||
self.to_v = nn.Linear(cross_attention_dim, inner_dim, bias=bias)
|
||||
self.to_q = nn.Linear(query_dim, inner_dim, bias=False)
|
||||
self.to_k = nn.Linear(context_dim, inner_dim, bias=False)
|
||||
self.to_v = nn.Linear(context_dim, inner_dim, bias=False)
|
||||
|
||||
self.to_out = nn.ModuleList([])
|
||||
self.to_out.append(nn.Linear(inner_dim, query_dim))
|
||||
self.to_out.append(nn.Dropout(dropout))
|
||||
self.to_out = nn.Sequential(nn.Linear(inner_dim, query_dim), nn.Dropout(dropout))
|
||||
|
||||
def reshape_heads_to_batch_dim(self, tensor):
|
||||
batch_size, seq_len, dim = tensor.shape
|
||||
@@ -490,35 +278,20 @@ class CrossAttention(nn.Module):
|
||||
# TODO(PVP) - mask is currently never used. Remember to re-implement when used
|
||||
|
||||
# attention, what we cannot get enough of
|
||||
if self._use_memory_efficient_attention_xformers:
|
||||
hidden_states = self._memory_efficient_attention_xformers(query, key, value)
|
||||
else:
|
||||
if self._slice_size is None or query.shape[0] // self._slice_size == 1:
|
||||
hidden_states = self._attention(query, key, value)
|
||||
else:
|
||||
hidden_states = self._sliced_attention(query, key, value, sequence_length, dim)
|
||||
|
||||
# linear proj
|
||||
hidden_states = self.to_out[0](hidden_states)
|
||||
# dropout
|
||||
hidden_states = self.to_out[1](hidden_states)
|
||||
return hidden_states
|
||||
if self._slice_size is None or query.shape[0] // self._slice_size == 1:
|
||||
hidden_states = self._attention(query, key, value)
|
||||
else:
|
||||
hidden_states = self._sliced_attention(query, key, value, sequence_length, dim)
|
||||
|
||||
return self.to_out(hidden_states)
|
||||
|
||||
def _attention(self, query, key, value):
|
||||
# TODO: use baddbmm for better performance
|
||||
if query.device.type == "mps":
|
||||
# Better performance on mps (~20-25%)
|
||||
attention_scores = torch.einsum("b i d, b j d -> b i j", query, key) * self.scale
|
||||
else:
|
||||
attention_scores = torch.matmul(query, key.transpose(-1, -2)) * self.scale
|
||||
attention_scores = torch.matmul(query, key.transpose(-1, -2)) * self.scale
|
||||
attention_probs = attention_scores.softmax(dim=-1)
|
||||
# compute attention output
|
||||
|
||||
if query.device.type == "mps":
|
||||
hidden_states = torch.einsum("b i j, b j d -> b i d", attention_probs, value)
|
||||
else:
|
||||
hidden_states = torch.matmul(attention_probs, value)
|
||||
|
||||
hidden_states = torch.matmul(attention_probs, value)
|
||||
# reshape hidden_states
|
||||
hidden_states = self.reshape_batch_dim_to_heads(hidden_states)
|
||||
return hidden_states
|
||||
@@ -532,21 +305,11 @@ class CrossAttention(nn.Module):
|
||||
for i in range(hidden_states.shape[0] // slice_size):
|
||||
start_idx = i * slice_size
|
||||
end_idx = (i + 1) * slice_size
|
||||
if query.device.type == "mps":
|
||||
# Better performance on mps (~20-25%)
|
||||
attn_slice = (
|
||||
torch.einsum("b i d, b j d -> b i j", query[start_idx:end_idx], key[start_idx:end_idx])
|
||||
* self.scale
|
||||
)
|
||||
else:
|
||||
attn_slice = (
|
||||
torch.matmul(query[start_idx:end_idx], key[start_idx:end_idx].transpose(1, 2)) * self.scale
|
||||
) # TODO: use baddbmm for better performance
|
||||
attn_slice = (
|
||||
torch.matmul(query[start_idx:end_idx], key[start_idx:end_idx].transpose(1, 2)) * self.scale
|
||||
) # TODO: use baddbmm for better performance
|
||||
attn_slice = attn_slice.softmax(dim=-1)
|
||||
if query.device.type == "mps":
|
||||
attn_slice = torch.einsum("b i j, b j d -> b i d", attn_slice, value[start_idx:end_idx])
|
||||
else:
|
||||
attn_slice = torch.matmul(attn_slice, value[start_idx:end_idx])
|
||||
attn_slice = torch.matmul(attn_slice, value[start_idx:end_idx])
|
||||
|
||||
hidden_states[start_idx:end_idx] = attn_slice
|
||||
|
||||
@@ -554,53 +317,31 @@ class CrossAttention(nn.Module):
|
||||
hidden_states = self.reshape_batch_dim_to_heads(hidden_states)
|
||||
return hidden_states
|
||||
|
||||
def _memory_efficient_attention_xformers(self, query, key, value):
|
||||
hidden_states = xformers.ops.memory_efficient_attention(query, key, value, attn_bias=None)
|
||||
hidden_states = self.reshape_batch_dim_to_heads(hidden_states)
|
||||
return hidden_states
|
||||
|
||||
|
||||
class FeedForward(nn.Module):
|
||||
r"""
|
||||
A feed-forward layer.
|
||||
|
||||
Parameters:
|
||||
dim (`int`): The number of channels in the input.
|
||||
dim_out (`int`, *optional*): The number of channels in the output. If not given, defaults to `dim`.
|
||||
mult (`int`, *optional*, defaults to 4): The multiplier to use for the hidden dimension.
|
||||
dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
|
||||
activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
|
||||
dim (:obj:`int`): The number of channels in the input.
|
||||
dim_out (:obj:`int`, *optional*): The number of channels in the output. If not given, defaults to `dim`.
|
||||
mult (:obj:`int`, *optional*, defaults to 4): The multiplier to use for the hidden dimension.
|
||||
glu (:obj:`bool`, *optional*, defaults to :obj:`False`): Whether to use GLU activation.
|
||||
dropout (:obj:`float`, *optional*, defaults to 0.0): The dropout probability to use.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
dim: int,
|
||||
dim_out: Optional[int] = None,
|
||||
mult: int = 4,
|
||||
dropout: float = 0.0,
|
||||
activation_fn: str = "geglu",
|
||||
self, dim: int, dim_out: Optional[int] = None, mult: int = 4, glu: bool = False, dropout: float = 0.0
|
||||
):
|
||||
super().__init__()
|
||||
inner_dim = int(dim * mult)
|
||||
dim_out = dim_out if dim_out is not None else dim
|
||||
project_in = GEGLU(dim, inner_dim)
|
||||
|
||||
if activation_fn == "geglu":
|
||||
geglu = GEGLU(dim, inner_dim)
|
||||
elif activation_fn == "geglu-approximate":
|
||||
geglu = ApproximateGELU(dim, inner_dim)
|
||||
|
||||
self.net = nn.ModuleList([])
|
||||
# project in
|
||||
self.net.append(geglu)
|
||||
# project dropout
|
||||
self.net.append(nn.Dropout(dropout))
|
||||
# project out
|
||||
self.net.append(nn.Linear(inner_dim, dim_out))
|
||||
self.net = nn.Sequential(project_in, nn.Dropout(dropout), nn.Linear(inner_dim, dim_out))
|
||||
|
||||
def forward(self, hidden_states):
|
||||
for module in self.net:
|
||||
hidden_states = module(hidden_states)
|
||||
return hidden_states
|
||||
return self.net(hidden_states)
|
||||
|
||||
|
||||
# feedforward
|
||||
@@ -609,55 +350,14 @@ class GEGLU(nn.Module):
|
||||
A variant of the gated linear unit activation function from https://arxiv.org/abs/2002.05202.
|
||||
|
||||
Parameters:
|
||||
dim_in (`int`): The number of channels in the input.
|
||||
dim_out (`int`): The number of channels in the output.
|
||||
dim_in (:obj:`int`): The number of channels in the input.
|
||||
dim_out (:obj:`int`): The number of channels in the output.
|
||||
"""
|
||||
|
||||
def __init__(self, dim_in: int, dim_out: int):
|
||||
super().__init__()
|
||||
self.proj = nn.Linear(dim_in, dim_out * 2)
|
||||
|
||||
def gelu(self, gate):
|
||||
if gate.device.type != "mps":
|
||||
return F.gelu(gate)
|
||||
# mps: gelu is not implemented for float16
|
||||
return F.gelu(gate.to(dtype=torch.float32)).to(dtype=gate.dtype)
|
||||
|
||||
def forward(self, hidden_states):
|
||||
hidden_states, gate = self.proj(hidden_states).chunk(2, dim=-1)
|
||||
return hidden_states * self.gelu(gate)
|
||||
|
||||
|
||||
class ApproximateGELU(nn.Module):
|
||||
"""
|
||||
The approximate form of Gaussian Error Linear Unit (GELU)
|
||||
|
||||
For more details, see section 2: https://arxiv.org/abs/1606.08415
|
||||
"""
|
||||
|
||||
def __init__(self, dim_in: int, dim_out: int):
|
||||
super().__init__()
|
||||
self.proj = nn.Linear(dim_in, dim_out)
|
||||
|
||||
def forward(self, x):
|
||||
x = self.proj(x)
|
||||
return x * torch.sigmoid(1.702 * x)
|
||||
|
||||
|
||||
class AdaLayerNorm(nn.Module):
|
||||
"""
|
||||
Norm layer modified to incorporate timestep embeddings.
|
||||
"""
|
||||
|
||||
def __init__(self, embedding_dim, num_embeddings):
|
||||
super().__init__()
|
||||
self.emb = nn.Embedding(num_embeddings, embedding_dim)
|
||||
self.silu = nn.SiLU()
|
||||
self.linear = nn.Linear(embedding_dim, embedding_dim * 2)
|
||||
self.norm = nn.LayerNorm(embedding_dim, elementwise_affine=False)
|
||||
|
||||
def forward(self, x, timestep):
|
||||
emb = self.linear(self.silu(self.emb(timestep)))
|
||||
scale, shift = torch.chunk(emb, 2)
|
||||
x = self.norm(x) * (1 + scale) + shift
|
||||
return x
|
||||
return hidden_states * F.gelu(gate)
|
||||
|
||||
@@ -142,7 +142,7 @@ class FlaxBasicTransformerBlock(nn.Module):
|
||||
return hidden_states
|
||||
|
||||
|
||||
class FlaxTransformer2DModel(nn.Module):
|
||||
class FlaxSpatialTransformer(nn.Module):
|
||||
r"""
|
||||
A Spatial Transformer layer with Gated Linear Unit (GLU) activation function as described in:
|
||||
https://arxiv.org/pdf/1506.02025.pdf
|
||||
|
||||
@@ -101,93 +101,17 @@ class Timesteps(nn.Module):
|
||||
class GaussianFourierProjection(nn.Module):
|
||||
"""Gaussian Fourier embeddings for noise levels."""
|
||||
|
||||
def __init__(
|
||||
self, embedding_size: int = 256, scale: float = 1.0, set_W_to_weight=True, log=True, flip_sin_to_cos=False
|
||||
):
|
||||
def __init__(self, embedding_size: int = 256, scale: float = 1.0):
|
||||
super().__init__()
|
||||
self.weight = nn.Parameter(torch.randn(embedding_size) * scale, requires_grad=False)
|
||||
self.log = log
|
||||
self.flip_sin_to_cos = flip_sin_to_cos
|
||||
|
||||
if set_W_to_weight:
|
||||
# to delete later
|
||||
self.W = nn.Parameter(torch.randn(embedding_size) * scale, requires_grad=False)
|
||||
# to delete later
|
||||
self.W = nn.Parameter(torch.randn(embedding_size) * scale, requires_grad=False)
|
||||
|
||||
self.weight = self.W
|
||||
self.weight = self.W
|
||||
|
||||
def forward(self, x):
|
||||
if self.log:
|
||||
x = torch.log(x)
|
||||
|
||||
x = torch.log(x)
|
||||
x_proj = x[:, None] * self.weight[None, :] * 2 * np.pi
|
||||
|
||||
if self.flip_sin_to_cos:
|
||||
out = torch.cat([torch.cos(x_proj), torch.sin(x_proj)], dim=-1)
|
||||
else:
|
||||
out = torch.cat([torch.sin(x_proj), torch.cos(x_proj)], dim=-1)
|
||||
out = torch.cat([torch.sin(x_proj), torch.cos(x_proj)], dim=-1)
|
||||
return out
|
||||
|
||||
|
||||
class ImagePositionalEmbeddings(nn.Module):
|
||||
"""
|
||||
Converts latent image classes into vector embeddings. Sums the vector embeddings with positional embeddings for the
|
||||
height and width of the latent space.
|
||||
|
||||
For more details, see figure 10 of the dall-e paper: https://arxiv.org/abs/2102.12092
|
||||
|
||||
For VQ-diffusion:
|
||||
|
||||
Output vector embeddings are used as input for the transformer.
|
||||
|
||||
Note that the vector embeddings for the transformer are different than the vector embeddings from the VQVAE.
|
||||
|
||||
Args:
|
||||
num_embed (`int`):
|
||||
Number of embeddings for the latent pixels embeddings.
|
||||
height (`int`):
|
||||
Height of the latent image i.e. the number of height embeddings.
|
||||
width (`int`):
|
||||
Width of the latent image i.e. the number of width embeddings.
|
||||
embed_dim (`int`):
|
||||
Dimension of the produced vector embeddings. Used for the latent pixel, height, and width embeddings.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
num_embed: int,
|
||||
height: int,
|
||||
width: int,
|
||||
embed_dim: int,
|
||||
):
|
||||
super().__init__()
|
||||
|
||||
self.height = height
|
||||
self.width = width
|
||||
self.num_embed = num_embed
|
||||
self.embed_dim = embed_dim
|
||||
|
||||
self.emb = nn.Embedding(self.num_embed, embed_dim)
|
||||
self.height_emb = nn.Embedding(self.height, embed_dim)
|
||||
self.width_emb = nn.Embedding(self.width, embed_dim)
|
||||
|
||||
def forward(self, index):
|
||||
emb = self.emb(index)
|
||||
|
||||
height_emb = self.height_emb(torch.arange(self.height, device=index.device).view(1, self.height))
|
||||
|
||||
# 1 x H x D -> 1 x H x 1 x D
|
||||
height_emb = height_emb.unsqueeze(2)
|
||||
|
||||
width_emb = self.width_emb(torch.arange(self.width, device=index.device).view(1, self.width))
|
||||
|
||||
# 1 x W x D -> 1 x 1 x W x D
|
||||
width_emb = width_emb.unsqueeze(1)
|
||||
|
||||
pos_emb = height_emb + width_emb
|
||||
|
||||
# 1 x H x W x D -> 1 x L xD
|
||||
pos_emb = pos_emb.view(1, self.height * self.width, -1)
|
||||
|
||||
emb = emb + pos_emb[:, : emb.shape[1], :]
|
||||
|
||||
return emb
|
||||
|
||||
@@ -17,41 +17,23 @@ import flax.linen as nn
|
||||
import jax.numpy as jnp
|
||||
|
||||
|
||||
def get_sinusoidal_embeddings(
|
||||
timesteps: jnp.ndarray,
|
||||
embedding_dim: int,
|
||||
freq_shift: float = 1,
|
||||
min_timescale: float = 1,
|
||||
max_timescale: float = 1.0e4,
|
||||
flip_sin_to_cos: bool = False,
|
||||
scale: float = 1.0,
|
||||
) -> jnp.ndarray:
|
||||
"""Returns the positional encoding (same as Tensor2Tensor).
|
||||
Args:
|
||||
timesteps: a 1-D Tensor of N indices, one per batch element.
|
||||
These may be fractional.
|
||||
embedding_dim: The number of output channels.
|
||||
min_timescale: The smallest time unit (should probably be 0.0).
|
||||
max_timescale: The largest time unit.
|
||||
Returns:
|
||||
a Tensor of timing signals [N, num_channels]
|
||||
# This is like models.embeddings.get_timestep_embedding (PyTorch) but
|
||||
# less general (only handles the case we currently need).
|
||||
def get_sinusoidal_embeddings(timesteps, embedding_dim, freq_shift: float = 1):
|
||||
"""
|
||||
assert timesteps.ndim == 1, "Timesteps should be a 1d-array"
|
||||
assert embedding_dim % 2 == 0, f"Embedding dimension {embedding_dim} should be even"
|
||||
num_timescales = float(embedding_dim // 2)
|
||||
log_timescale_increment = math.log(max_timescale / min_timescale) / (num_timescales - freq_shift)
|
||||
inv_timescales = min_timescale * jnp.exp(jnp.arange(num_timescales, dtype=jnp.float32) * -log_timescale_increment)
|
||||
emb = jnp.expand_dims(timesteps, 1) * jnp.expand_dims(inv_timescales, 0)
|
||||
This matches the implementation in Denoising Diffusion Probabilistic Models: Create sinusoidal timestep embeddings.
|
||||
|
||||
# scale embeddings
|
||||
scaled_time = scale * emb
|
||||
|
||||
if flip_sin_to_cos:
|
||||
signal = jnp.concatenate([jnp.cos(scaled_time), jnp.sin(scaled_time)], axis=1)
|
||||
else:
|
||||
signal = jnp.concatenate([jnp.sin(scaled_time), jnp.cos(scaled_time)], axis=1)
|
||||
signal = jnp.reshape(signal, [jnp.shape(timesteps)[0], embedding_dim])
|
||||
return signal
|
||||
:param timesteps: a 1-D tensor of N indices, one per batch element.
|
||||
These may be fractional.
|
||||
:param embedding_dim: the dimension of the output. :param max_period: controls the minimum frequency of the
|
||||
embeddings. :return: an [N x dim] tensor of positional embeddings.
|
||||
"""
|
||||
half_dim = embedding_dim // 2
|
||||
emb = math.log(10000) / (half_dim - freq_shift)
|
||||
emb = jnp.exp(jnp.arange(half_dim) * -emb)
|
||||
emb = timesteps[:, None] * emb[None, :]
|
||||
emb = jnp.concatenate([jnp.cos(emb), jnp.sin(emb)], -1)
|
||||
return emb
|
||||
|
||||
|
||||
class FlaxTimestepEmbedding(nn.Module):
|
||||
@@ -88,4 +70,4 @@ class FlaxTimesteps(nn.Module):
|
||||
|
||||
@nn.compact
|
||||
def __call__(self, timesteps):
|
||||
return get_sinusoidal_embeddings(timesteps, embedding_dim=self.dim, freq_shift=self.freq_shift)
|
||||
return get_sinusoidal_embeddings(timesteps, self.dim, freq_shift=self.freq_shift)
|
||||
|
||||
@@ -48,10 +48,6 @@ class Upsample2D(nn.Module):
|
||||
if dtype == torch.bfloat16:
|
||||
hidden_states = hidden_states.to(torch.float32)
|
||||
|
||||
# upsample_nearest_nhwc fails with large batch sizes. see https://github.com/huggingface/diffusers/issues/984
|
||||
if hidden_states.shape[0] >= 64:
|
||||
hidden_states = hidden_states.contiguous()
|
||||
|
||||
# if `output_size` is passed we force the interpolation output
|
||||
# size and do not make use of `scale_factor=2`
|
||||
if output_size is None:
|
||||
@@ -380,10 +376,6 @@ class ResnetBlock2D(nn.Module):
|
||||
hidden_states = self.nonlinearity(hidden_states)
|
||||
|
||||
if self.upsample is not None:
|
||||
# upsample_nearest_nhwc fails with large batch sizes. see https://github.com/huggingface/diffusers/issues/984
|
||||
if hidden_states.shape[0] >= 64:
|
||||
input_tensor = input_tensor.contiguous()
|
||||
hidden_states = hidden_states.contiguous()
|
||||
input_tensor = self.upsample(input_tensor)
|
||||
hidden_states = self.upsample(hidden_states)
|
||||
elif self.downsample is not None:
|
||||
@@ -500,6 +492,10 @@ def upfirdn2d_native(tensor, kernel, up=1, down=1, pad=(0, 0)):
|
||||
kernel_h, kernel_w = kernel.shape
|
||||
|
||||
out = tensor.view(-1, in_h, 1, in_w, 1, minor)
|
||||
|
||||
# Temporary workaround for mps specific issue: https://github.com/pytorch/pytorch/issues/84535
|
||||
if tensor.device.type == "mps":
|
||||
out = out.to("cpu")
|
||||
out = F.pad(out, [0, 0, 0, up_x - 1, 0, 0, 0, up_y - 1])
|
||||
out = out.view(-1, in_h * up_y, in_w * up_x, minor)
|
||||
|
||||
|
||||
@@ -1,172 +0,0 @@
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional, Tuple, Union
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
from ..configuration_utils import ConfigMixin, register_to_config
|
||||
from ..modeling_utils import ModelMixin
|
||||
from ..utils import BaseOutput
|
||||
from .embeddings import GaussianFourierProjection, TimestepEmbedding, Timesteps
|
||||
from .unet_1d_blocks import get_down_block, get_mid_block, get_up_block
|
||||
|
||||
|
||||
@dataclass
|
||||
class UNet1DOutput(BaseOutput):
|
||||
"""
|
||||
Args:
|
||||
sample (`torch.FloatTensor` of shape `(batch_size, num_channels, sample_size)`):
|
||||
Hidden states output. Output of last layer of model.
|
||||
"""
|
||||
|
||||
sample: torch.FloatTensor
|
||||
|
||||
|
||||
class UNet1DModel(ModelMixin, ConfigMixin):
|
||||
r"""
|
||||
UNet1DModel is a 1D UNet model that takes in a noisy sample and a timestep and returns sample shaped output.
|
||||
|
||||
This model inherits from [`ModelMixin`]. Check the superclass documentation for the generic methods the library
|
||||
implements for all the model (such as downloading or saving, etc.)
|
||||
|
||||
Parameters:
|
||||
sample_size (`int`, *optionl*): Default length of sample. Should be adaptable at runtime.
|
||||
in_channels (`int`, *optional*, defaults to 2): Number of channels in the input sample.
|
||||
out_channels (`int`, *optional*, defaults to 2): Number of channels in the output.
|
||||
time_embedding_type (`str`, *optional*, defaults to `"fourier"`): Type of time embedding to use.
|
||||
freq_shift (`int`, *optional*, defaults to 0): Frequency shift for fourier time embedding.
|
||||
flip_sin_to_cos (`bool`, *optional*, defaults to :
|
||||
obj:`False`): Whether to flip sin to cos for fourier time embedding.
|
||||
down_block_types (`Tuple[str]`, *optional*, defaults to :
|
||||
obj:`("DownBlock1D", "DownBlock1DNoSkip", "AttnDownBlock1D")`): Tuple of downsample block types.
|
||||
up_block_types (`Tuple[str]`, *optional*, defaults to :
|
||||
obj:`("UpBlock1D", "UpBlock1DNoSkip", "AttnUpBlock1D")`): Tuple of upsample block types.
|
||||
block_out_channels (`Tuple[int]`, *optional*, defaults to :
|
||||
obj:`(32, 32, 64)`): Tuple of block output channels.
|
||||
"""
|
||||
|
||||
@register_to_config
|
||||
def __init__(
|
||||
self,
|
||||
sample_size: int = 65536,
|
||||
sample_rate: Optional[int] = None,
|
||||
in_channels: int = 2,
|
||||
out_channels: int = 2,
|
||||
extra_in_channels: int = 0,
|
||||
time_embedding_type: str = "fourier",
|
||||
freq_shift: int = 0,
|
||||
flip_sin_to_cos: bool = True,
|
||||
use_timestep_embedding: bool = False,
|
||||
down_block_types: Tuple[str] = ("DownBlock1DNoSkip", "DownBlock1D", "AttnDownBlock1D"),
|
||||
mid_block_type: str = "UNetMidBlock1D",
|
||||
up_block_types: Tuple[str] = ("AttnUpBlock1D", "UpBlock1D", "UpBlock1DNoSkip"),
|
||||
block_out_channels: Tuple[int] = (32, 32, 64),
|
||||
):
|
||||
super().__init__()
|
||||
|
||||
self.sample_size = sample_size
|
||||
|
||||
# time
|
||||
if time_embedding_type == "fourier":
|
||||
self.time_proj = GaussianFourierProjection(
|
||||
embedding_size=8, set_W_to_weight=False, log=False, flip_sin_to_cos=flip_sin_to_cos
|
||||
)
|
||||
timestep_input_dim = 2 * block_out_channels[0]
|
||||
elif time_embedding_type == "positional":
|
||||
self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift)
|
||||
timestep_input_dim = block_out_channels[0]
|
||||
|
||||
if use_timestep_embedding:
|
||||
time_embed_dim = block_out_channels[0] * 4
|
||||
self.time_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim)
|
||||
|
||||
self.down_blocks = nn.ModuleList([])
|
||||
self.mid_block = None
|
||||
self.up_blocks = nn.ModuleList([])
|
||||
self.out_block = None
|
||||
|
||||
# down
|
||||
output_channel = in_channels
|
||||
for i, down_block_type in enumerate(down_block_types):
|
||||
input_channel = output_channel
|
||||
output_channel = block_out_channels[i]
|
||||
|
||||
if i == 0:
|
||||
input_channel += extra_in_channels
|
||||
|
||||
down_block = get_down_block(
|
||||
down_block_type,
|
||||
in_channels=input_channel,
|
||||
out_channels=output_channel,
|
||||
)
|
||||
self.down_blocks.append(down_block)
|
||||
|
||||
# mid
|
||||
self.mid_block = get_mid_block(
|
||||
mid_block_type=mid_block_type,
|
||||
mid_channels=block_out_channels[-1],
|
||||
in_channels=block_out_channels[-1],
|
||||
out_channels=None,
|
||||
)
|
||||
|
||||
# up
|
||||
reversed_block_out_channels = list(reversed(block_out_channels))
|
||||
output_channel = reversed_block_out_channels[0]
|
||||
for i, up_block_type in enumerate(up_block_types):
|
||||
prev_output_channel = output_channel
|
||||
output_channel = reversed_block_out_channels[i + 1] if i < len(up_block_types) - 1 else out_channels
|
||||
|
||||
up_block = get_up_block(
|
||||
up_block_type,
|
||||
in_channels=prev_output_channel,
|
||||
out_channels=output_channel,
|
||||
)
|
||||
self.up_blocks.append(up_block)
|
||||
prev_output_channel = output_channel
|
||||
|
||||
# TODO(PVP, Nathan) placeholder for RL application to be merged shortly
|
||||
# Totally fine to add another layer with a if statement - no need for nn.Identity here
|
||||
|
||||
def forward(
|
||||
self,
|
||||
sample: torch.FloatTensor,
|
||||
timestep: Union[torch.Tensor, float, int],
|
||||
return_dict: bool = True,
|
||||
) -> Union[UNet1DOutput, Tuple]:
|
||||
r"""
|
||||
Args:
|
||||
sample (`torch.FloatTensor`): `(batch_size, sample_size, num_channels)` noisy inputs tensor
|
||||
timestep (`torch.FloatTensor` or `float` or `int): (batch) timesteps
|
||||
return_dict (`bool`, *optional*, defaults to `True`):
|
||||
Whether or not to return a [`~models.unet_1d.UNet1DOutput`] instead of a plain tuple.
|
||||
|
||||
Returns:
|
||||
[`~models.unet_1d.UNet1DOutput`] or `tuple`: [`~models.unet_1d.UNet1DOutput`] if `return_dict` is True,
|
||||
otherwise a `tuple`. When returning a tuple, the first element is the sample tensor.
|
||||
"""
|
||||
# 1. time
|
||||
if len(timestep.shape) == 0:
|
||||
timestep = timestep[None]
|
||||
|
||||
timestep_embed = self.time_proj(timestep)[..., None]
|
||||
timestep_embed = timestep_embed.repeat([1, 1, sample.shape[2]]).to(sample.dtype)
|
||||
|
||||
# 2. down
|
||||
down_block_res_samples = ()
|
||||
for downsample_block in self.down_blocks:
|
||||
sample, res_samples = downsample_block(hidden_states=sample, temb=timestep_embed)
|
||||
down_block_res_samples += res_samples
|
||||
|
||||
# 3. mid
|
||||
sample = self.mid_block(sample)
|
||||
|
||||
# 4. up
|
||||
for i, upsample_block in enumerate(self.up_blocks):
|
||||
res_samples = down_block_res_samples[-1:]
|
||||
down_block_res_samples = down_block_res_samples[:-1]
|
||||
sample = upsample_block(sample, res_samples)
|
||||
|
||||
if not return_dict:
|
||||
return (sample,)
|
||||
|
||||
return UNet1DOutput(sample=sample)
|
||||
@@ -1,384 +0,0 @@
|
||||
# Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import math
|
||||
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
from torch import nn
|
||||
|
||||
|
||||
_kernels = {
|
||||
"linear": [1 / 8, 3 / 8, 3 / 8, 1 / 8],
|
||||
"cubic": [-0.01171875, -0.03515625, 0.11328125, 0.43359375, 0.43359375, 0.11328125, -0.03515625, -0.01171875],
|
||||
"lanczos3": [
|
||||
0.003689131001010537,
|
||||
0.015056144446134567,
|
||||
-0.03399861603975296,
|
||||
-0.066637322306633,
|
||||
0.13550527393817902,
|
||||
0.44638532400131226,
|
||||
0.44638532400131226,
|
||||
0.13550527393817902,
|
||||
-0.066637322306633,
|
||||
-0.03399861603975296,
|
||||
0.015056144446134567,
|
||||
0.003689131001010537,
|
||||
],
|
||||
}
|
||||
|
||||
|
||||
class Downsample1d(nn.Module):
|
||||
def __init__(self, kernel="linear", pad_mode="reflect"):
|
||||
super().__init__()
|
||||
self.pad_mode = pad_mode
|
||||
kernel_1d = torch.tensor(_kernels[kernel])
|
||||
self.pad = kernel_1d.shape[0] // 2 - 1
|
||||
self.register_buffer("kernel", kernel_1d)
|
||||
|
||||
def forward(self, hidden_states):
|
||||
hidden_states = F.pad(hidden_states, (self.pad,) * 2, self.pad_mode)
|
||||
weight = hidden_states.new_zeros([hidden_states.shape[1], hidden_states.shape[1], self.kernel.shape[0]])
|
||||
indices = torch.arange(hidden_states.shape[1], device=hidden_states.device)
|
||||
weight[indices, indices] = self.kernel.to(weight)
|
||||
return F.conv1d(hidden_states, weight, stride=2)
|
||||
|
||||
|
||||
class Upsample1d(nn.Module):
|
||||
def __init__(self, kernel="linear", pad_mode="reflect"):
|
||||
super().__init__()
|
||||
self.pad_mode = pad_mode
|
||||
kernel_1d = torch.tensor(_kernels[kernel]) * 2
|
||||
self.pad = kernel_1d.shape[0] // 2 - 1
|
||||
self.register_buffer("kernel", kernel_1d)
|
||||
|
||||
def forward(self, hidden_states):
|
||||
hidden_states = F.pad(hidden_states, ((self.pad + 1) // 2,) * 2, self.pad_mode)
|
||||
weight = hidden_states.new_zeros([hidden_states.shape[1], hidden_states.shape[1], self.kernel.shape[0]])
|
||||
indices = torch.arange(hidden_states.shape[1], device=hidden_states.device)
|
||||
weight[indices, indices] = self.kernel.to(weight)
|
||||
return F.conv_transpose1d(hidden_states, weight, stride=2, padding=self.pad * 2 + 1)
|
||||
|
||||
|
||||
class SelfAttention1d(nn.Module):
|
||||
def __init__(self, in_channels, n_head=1, dropout_rate=0.0):
|
||||
super().__init__()
|
||||
self.channels = in_channels
|
||||
self.group_norm = nn.GroupNorm(1, num_channels=in_channels)
|
||||
self.num_heads = n_head
|
||||
|
||||
self.query = nn.Linear(self.channels, self.channels)
|
||||
self.key = nn.Linear(self.channels, self.channels)
|
||||
self.value = nn.Linear(self.channels, self.channels)
|
||||
|
||||
self.proj_attn = nn.Linear(self.channels, self.channels, 1)
|
||||
|
||||
self.dropout = nn.Dropout(dropout_rate, inplace=True)
|
||||
|
||||
def transpose_for_scores(self, projection: torch.Tensor) -> torch.Tensor:
|
||||
new_projection_shape = projection.size()[:-1] + (self.num_heads, -1)
|
||||
# move heads to 2nd position (B, T, H * D) -> (B, T, H, D) -> (B, H, T, D)
|
||||
new_projection = projection.view(new_projection_shape).permute(0, 2, 1, 3)
|
||||
return new_projection
|
||||
|
||||
def forward(self, hidden_states):
|
||||
residual = hidden_states
|
||||
batch, channel_dim, seq = hidden_states.shape
|
||||
|
||||
hidden_states = self.group_norm(hidden_states)
|
||||
hidden_states = hidden_states.transpose(1, 2)
|
||||
|
||||
query_proj = self.query(hidden_states)
|
||||
key_proj = self.key(hidden_states)
|
||||
value_proj = self.value(hidden_states)
|
||||
|
||||
query_states = self.transpose_for_scores(query_proj)
|
||||
key_states = self.transpose_for_scores(key_proj)
|
||||
value_states = self.transpose_for_scores(value_proj)
|
||||
|
||||
scale = 1 / math.sqrt(math.sqrt(key_states.shape[-1]))
|
||||
|
||||
attention_scores = torch.matmul(query_states * scale, key_states.transpose(-1, -2) * scale)
|
||||
attention_probs = torch.softmax(attention_scores, dim=-1)
|
||||
|
||||
# compute attention output
|
||||
hidden_states = torch.matmul(attention_probs, value_states)
|
||||
|
||||
hidden_states = hidden_states.permute(0, 2, 1, 3).contiguous()
|
||||
new_hidden_states_shape = hidden_states.size()[:-2] + (self.channels,)
|
||||
hidden_states = hidden_states.view(new_hidden_states_shape)
|
||||
|
||||
# compute next hidden_states
|
||||
hidden_states = self.proj_attn(hidden_states)
|
||||
hidden_states = hidden_states.transpose(1, 2)
|
||||
hidden_states = self.dropout(hidden_states)
|
||||
|
||||
output = hidden_states + residual
|
||||
|
||||
return output
|
||||
|
||||
|
||||
class ResConvBlock(nn.Module):
|
||||
def __init__(self, in_channels, mid_channels, out_channels, is_last=False):
|
||||
super().__init__()
|
||||
self.is_last = is_last
|
||||
self.has_conv_skip = in_channels != out_channels
|
||||
|
||||
if self.has_conv_skip:
|
||||
self.conv_skip = nn.Conv1d(in_channels, out_channels, 1, bias=False)
|
||||
|
||||
self.conv_1 = nn.Conv1d(in_channels, mid_channels, 5, padding=2)
|
||||
self.group_norm_1 = nn.GroupNorm(1, mid_channels)
|
||||
self.gelu_1 = nn.GELU()
|
||||
self.conv_2 = nn.Conv1d(mid_channels, out_channels, 5, padding=2)
|
||||
|
||||
if not self.is_last:
|
||||
self.group_norm_2 = nn.GroupNorm(1, out_channels)
|
||||
self.gelu_2 = nn.GELU()
|
||||
|
||||
def forward(self, hidden_states):
|
||||
residual = self.conv_skip(hidden_states) if self.has_conv_skip else hidden_states
|
||||
|
||||
hidden_states = self.conv_1(hidden_states)
|
||||
hidden_states = self.group_norm_1(hidden_states)
|
||||
hidden_states = self.gelu_1(hidden_states)
|
||||
hidden_states = self.conv_2(hidden_states)
|
||||
|
||||
if not self.is_last:
|
||||
hidden_states = self.group_norm_2(hidden_states)
|
||||
hidden_states = self.gelu_2(hidden_states)
|
||||
|
||||
output = hidden_states + residual
|
||||
return output
|
||||
|
||||
|
||||
def get_down_block(down_block_type, out_channels, in_channels):
|
||||
if down_block_type == "DownBlock1D":
|
||||
return DownBlock1D(out_channels=out_channels, in_channels=in_channels)
|
||||
elif down_block_type == "AttnDownBlock1D":
|
||||
return AttnDownBlock1D(out_channels=out_channels, in_channels=in_channels)
|
||||
elif down_block_type == "DownBlock1DNoSkip":
|
||||
return DownBlock1DNoSkip(out_channels=out_channels, in_channels=in_channels)
|
||||
raise ValueError(f"{down_block_type} does not exist.")
|
||||
|
||||
|
||||
def get_up_block(up_block_type, in_channels, out_channels):
|
||||
if up_block_type == "UpBlock1D":
|
||||
return UpBlock1D(in_channels=in_channels, out_channels=out_channels)
|
||||
elif up_block_type == "AttnUpBlock1D":
|
||||
return AttnUpBlock1D(in_channels=in_channels, out_channels=out_channels)
|
||||
elif up_block_type == "UpBlock1DNoSkip":
|
||||
return UpBlock1DNoSkip(in_channels=in_channels, out_channels=out_channels)
|
||||
raise ValueError(f"{up_block_type} does not exist.")
|
||||
|
||||
|
||||
def get_mid_block(mid_block_type, in_channels, mid_channels, out_channels):
|
||||
if mid_block_type == "UNetMidBlock1D":
|
||||
return UNetMidBlock1D(in_channels=in_channels, mid_channels=mid_channels, out_channels=out_channels)
|
||||
raise ValueError(f"{mid_block_type} does not exist.")
|
||||
|
||||
|
||||
class UNetMidBlock1D(nn.Module):
|
||||
def __init__(self, mid_channels, in_channels, out_channels=None):
|
||||
super().__init__()
|
||||
|
||||
out_channels = in_channels if out_channels is None else out_channels
|
||||
|
||||
# there is always at least one resnet
|
||||
self.down = Downsample1d("cubic")
|
||||
resnets = [
|
||||
ResConvBlock(in_channels, mid_channels, mid_channels),
|
||||
ResConvBlock(mid_channels, mid_channels, mid_channels),
|
||||
ResConvBlock(mid_channels, mid_channels, mid_channels),
|
||||
ResConvBlock(mid_channels, mid_channels, mid_channels),
|
||||
ResConvBlock(mid_channels, mid_channels, mid_channels),
|
||||
ResConvBlock(mid_channels, mid_channels, out_channels),
|
||||
]
|
||||
attentions = [
|
||||
SelfAttention1d(mid_channels, mid_channels // 32),
|
||||
SelfAttention1d(mid_channels, mid_channels // 32),
|
||||
SelfAttention1d(mid_channels, mid_channels // 32),
|
||||
SelfAttention1d(mid_channels, mid_channels // 32),
|
||||
SelfAttention1d(mid_channels, mid_channels // 32),
|
||||
SelfAttention1d(out_channels, out_channels // 32),
|
||||
]
|
||||
self.up = Upsample1d(kernel="cubic")
|
||||
|
||||
self.attentions = nn.ModuleList(attentions)
|
||||
self.resnets = nn.ModuleList(resnets)
|
||||
|
||||
def forward(self, hidden_states):
|
||||
hidden_states = self.down(hidden_states)
|
||||
for attn, resnet in zip(self.attentions, self.resnets):
|
||||
hidden_states = resnet(hidden_states)
|
||||
hidden_states = attn(hidden_states)
|
||||
|
||||
hidden_states = self.up(hidden_states)
|
||||
|
||||
return hidden_states
|
||||
|
||||
|
||||
class AttnDownBlock1D(nn.Module):
|
||||
def __init__(self, out_channels, in_channels, mid_channels=None):
|
||||
super().__init__()
|
||||
mid_channels = out_channels if mid_channels is None else mid_channels
|
||||
|
||||
self.down = Downsample1d("cubic")
|
||||
resnets = [
|
||||
ResConvBlock(in_channels, mid_channels, mid_channels),
|
||||
ResConvBlock(mid_channels, mid_channels, mid_channels),
|
||||
ResConvBlock(mid_channels, mid_channels, out_channels),
|
||||
]
|
||||
attentions = [
|
||||
SelfAttention1d(mid_channels, mid_channels // 32),
|
||||
SelfAttention1d(mid_channels, mid_channels // 32),
|
||||
SelfAttention1d(out_channels, out_channels // 32),
|
||||
]
|
||||
|
||||
self.attentions = nn.ModuleList(attentions)
|
||||
self.resnets = nn.ModuleList(resnets)
|
||||
|
||||
def forward(self, hidden_states, temb=None):
|
||||
hidden_states = self.down(hidden_states)
|
||||
|
||||
for resnet, attn in zip(self.resnets, self.attentions):
|
||||
hidden_states = resnet(hidden_states)
|
||||
hidden_states = attn(hidden_states)
|
||||
|
||||
return hidden_states, (hidden_states,)
|
||||
|
||||
|
||||
class DownBlock1D(nn.Module):
|
||||
def __init__(self, out_channels, in_channels, mid_channels=None):
|
||||
super().__init__()
|
||||
mid_channels = out_channels if mid_channels is None else mid_channels
|
||||
|
||||
self.down = Downsample1d("cubic")
|
||||
resnets = [
|
||||
ResConvBlock(in_channels, mid_channels, mid_channels),
|
||||
ResConvBlock(mid_channels, mid_channels, mid_channels),
|
||||
ResConvBlock(mid_channels, mid_channels, out_channels),
|
||||
]
|
||||
|
||||
self.resnets = nn.ModuleList(resnets)
|
||||
|
||||
def forward(self, hidden_states, temb=None):
|
||||
hidden_states = self.down(hidden_states)
|
||||
|
||||
for resnet in self.resnets:
|
||||
hidden_states = resnet(hidden_states)
|
||||
|
||||
return hidden_states, (hidden_states,)
|
||||
|
||||
|
||||
class DownBlock1DNoSkip(nn.Module):
|
||||
def __init__(self, out_channels, in_channels, mid_channels=None):
|
||||
super().__init__()
|
||||
mid_channels = out_channels if mid_channels is None else mid_channels
|
||||
|
||||
resnets = [
|
||||
ResConvBlock(in_channels, mid_channels, mid_channels),
|
||||
ResConvBlock(mid_channels, mid_channels, mid_channels),
|
||||
ResConvBlock(mid_channels, mid_channels, out_channels),
|
||||
]
|
||||
|
||||
self.resnets = nn.ModuleList(resnets)
|
||||
|
||||
def forward(self, hidden_states, temb=None):
|
||||
hidden_states = torch.cat([hidden_states, temb], dim=1)
|
||||
for resnet in self.resnets:
|
||||
hidden_states = resnet(hidden_states)
|
||||
|
||||
return hidden_states, (hidden_states,)
|
||||
|
||||
|
||||
class AttnUpBlock1D(nn.Module):
|
||||
def __init__(self, in_channels, out_channels, mid_channels=None):
|
||||
super().__init__()
|
||||
mid_channels = out_channels if mid_channels is None else mid_channels
|
||||
|
||||
resnets = [
|
||||
ResConvBlock(2 * in_channels, mid_channels, mid_channels),
|
||||
ResConvBlock(mid_channels, mid_channels, mid_channels),
|
||||
ResConvBlock(mid_channels, mid_channels, out_channels),
|
||||
]
|
||||
attentions = [
|
||||
SelfAttention1d(mid_channels, mid_channels // 32),
|
||||
SelfAttention1d(mid_channels, mid_channels // 32),
|
||||
SelfAttention1d(out_channels, out_channels // 32),
|
||||
]
|
||||
|
||||
self.attentions = nn.ModuleList(attentions)
|
||||
self.resnets = nn.ModuleList(resnets)
|
||||
self.up = Upsample1d(kernel="cubic")
|
||||
|
||||
def forward(self, hidden_states, res_hidden_states_tuple):
|
||||
res_hidden_states = res_hidden_states_tuple[-1]
|
||||
hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
|
||||
|
||||
for resnet, attn in zip(self.resnets, self.attentions):
|
||||
hidden_states = resnet(hidden_states)
|
||||
hidden_states = attn(hidden_states)
|
||||
|
||||
hidden_states = self.up(hidden_states)
|
||||
|
||||
return hidden_states
|
||||
|
||||
|
||||
class UpBlock1D(nn.Module):
|
||||
def __init__(self, in_channels, out_channels, mid_channels=None):
|
||||
super().__init__()
|
||||
mid_channels = in_channels if mid_channels is None else mid_channels
|
||||
|
||||
resnets = [
|
||||
ResConvBlock(2 * in_channels, mid_channels, mid_channels),
|
||||
ResConvBlock(mid_channels, mid_channels, mid_channels),
|
||||
ResConvBlock(mid_channels, mid_channels, out_channels),
|
||||
]
|
||||
|
||||
self.resnets = nn.ModuleList(resnets)
|
||||
self.up = Upsample1d(kernel="cubic")
|
||||
|
||||
def forward(self, hidden_states, res_hidden_states_tuple):
|
||||
res_hidden_states = res_hidden_states_tuple[-1]
|
||||
hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
|
||||
|
||||
for resnet in self.resnets:
|
||||
hidden_states = resnet(hidden_states)
|
||||
|
||||
hidden_states = self.up(hidden_states)
|
||||
|
||||
return hidden_states
|
||||
|
||||
|
||||
class UpBlock1DNoSkip(nn.Module):
|
||||
def __init__(self, in_channels, out_channels, mid_channels=None):
|
||||
super().__init__()
|
||||
mid_channels = in_channels if mid_channels is None else mid_channels
|
||||
|
||||
resnets = [
|
||||
ResConvBlock(2 * in_channels, mid_channels, mid_channels),
|
||||
ResConvBlock(mid_channels, mid_channels, mid_channels),
|
||||
ResConvBlock(mid_channels, mid_channels, out_channels, is_last=True),
|
||||
]
|
||||
|
||||
self.resnets = nn.ModuleList(resnets)
|
||||
|
||||
def forward(self, hidden_states, res_hidden_states_tuple):
|
||||
res_hidden_states = res_hidden_states_tuple[-1]
|
||||
hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
|
||||
|
||||
for resnet in self.resnets:
|
||||
hidden_states = resnet(hidden_states)
|
||||
|
||||
return hidden_states
|
||||
@@ -21,7 +21,7 @@ from ..configuration_utils import ConfigMixin, register_to_config
|
||||
from ..modeling_utils import ModelMixin
|
||||
from ..utils import BaseOutput
|
||||
from .embeddings import GaussianFourierProjection, TimestepEmbedding, Timesteps
|
||||
from .unet_2d_blocks import UNetMidBlock2D, get_down_block, get_up_block
|
||||
from .unet_blocks import UNetMidBlock2D, get_down_block, get_up_block
|
||||
|
||||
|
||||
@dataclass
|
||||
|
||||
@@ -22,7 +22,7 @@ from ..configuration_utils import ConfigMixin, register_to_config
|
||||
from ..modeling_utils import ModelMixin
|
||||
from ..utils import BaseOutput, logging
|
||||
from .embeddings import TimestepEmbedding, Timesteps
|
||||
from .unet_2d_blocks import (
|
||||
from .unet_blocks import (
|
||||
CrossAttnDownBlock2D,
|
||||
CrossAttnUpBlock2D,
|
||||
DownBlock2D,
|
||||
@@ -225,17 +225,6 @@ class UNet2DConditionModel(ModelMixin, ConfigMixin):
|
||||
if hasattr(block, "attentions") and block.attentions is not None:
|
||||
block.set_attention_slice(slice_size)
|
||||
|
||||
def set_use_memory_efficient_attention_xformers(self, use_memory_efficient_attention_xformers: bool):
|
||||
for block in self.down_blocks:
|
||||
if hasattr(block, "attentions") and block.attentions is not None:
|
||||
block.set_use_memory_efficient_attention_xformers(use_memory_efficient_attention_xformers)
|
||||
|
||||
self.mid_block.set_use_memory_efficient_attention_xformers(use_memory_efficient_attention_xformers)
|
||||
|
||||
for block in self.up_blocks:
|
||||
if hasattr(block, "attentions") and block.attentions is not None:
|
||||
block.set_use_memory_efficient_attention_xformers(use_memory_efficient_attention_xformers)
|
||||
|
||||
def _set_gradient_checkpointing(self, module, value=False):
|
||||
if isinstance(module, (CrossAttnDownBlock2D, DownBlock2D, CrossAttnUpBlock2D, UpBlock2D)):
|
||||
module.gradient_checkpointing = value
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user