tedlasai committed
Commit 199f9c2 · 1 Parent(s): 5aaa283
LICENSE-CODE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2023 Stability AI
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
app.py CHANGED
@@ -1,7 +1,118 @@
+ import os
+ import uuid
+ from pathlib import Path
+ import argparse
+
  import gradio as gr
-
- def greet(name):
-     return "hi " + name + "!!"
-
- demo = gr.Interface(fn=greet, inputs="text", outputs="text")
- demo.launch()
+ from PIL import Image
+ from diffusers.utils import export_to_video
+
+ from inference import load_model, inference_on_image
+
+ # -----------------------
+ # 1. Load model
+ # -----------------------
+ args = argparse.Namespace()
+ args.blur2vid_hf_repo_path = "tedlasai/blur2vid"
+ args.pretrained_model_path = "THUDM/CogVideoX-2b"
+ args.model_config_path = "training/configs/outsidephotos.yaml"
+ args.video_width = 1280
+ args.video_height = 720
+ args.seed = None
+
+ pipe, model_config = load_model(args)
+
+ OUTPUT_DIR = Path("/tmp/generated_videos")
+ OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
+
+
+ def generate_video_from_image(image: Image.Image, interval_key: str, num_inference_steps: int) -> str:
+     """
+     Wrapper for Gradio. Takes an image and returns a video path.
+     """
+     if image is None:
+         raise gr.Error("Please upload an image first.")
+
+     print("Generating video")
+     import torch
+     print("CUDA:", torch.cuda.is_available())
+     if torch.cuda.is_available():
+         print("Device:", torch.cuda.get_device_name(0), "| bf16 supported:", torch.cuda.is_bf16_supported())
+
+     args.num_inference_steps = num_inference_steps
+
+     video_id = uuid.uuid4().hex
+     output_path = OUTPUT_DIR / f"{video_id}.mp4"
+
+     args.device = "cuda"
+
+     pipe.to(args.device)
+     processed_image, video = inference_on_image(pipe, image, interval_key, model_config, args)
+     export_to_video(video, str(output_path), fps=20)
+
+     if not os.path.exists(output_path):
+         raise gr.Error("Video generation failed: output file not found.")
+
+     return str(output_path)
+
+
+ with gr.Blocks(css="footer {visibility: hidden}") as demo:
+     gr.Markdown(
+         """
+         # 🖼️ ➜ 🎬 Recover Motion from a Blurry Image
+
+         This demo accompanies the paper **“Generating the Past, Present, and Future from a Motion-Blurred Image”**
+         by Tedla *et al.*, ACM Transactions on Graphics (SIGGRAPH Asia 2025).
+
+         - 🌐 **Project page:** <https://blur2vid.github.io/>
+         - 💻 **Code:** <https://github.com/tedlasai/blur2vid/>
+
+         Upload a blurry image and the model will generate a short video showing the recovered motion based on your selection.
+         Note: The image will be resized to 1280×720. We recommend uploading landscape-oriented images.
+         """
+     )
+
+     with gr.Row():
+         with gr.Column():
+             image_in = gr.Image(
+                 type="pil",
+                 label="Input image",
+                 interactive=True,
+             )
+
+             with gr.Row():
+                 tense_choice = gr.Radio(
+                     label="Select the interval to be generated:",
+                     choices=["present", "past, present and future"],
+                     value="past, present and future",
+                     interactive=True,
+                 )
+
+             num_inference_steps = gr.Slider(
+                 label="Number of inference steps",
+                 minimum=4,
+                 maximum=50,
+                 step=1,
+                 value=20,
+                 info="More steps = better quality but slower",
+             )
+
+             generate_btn = gr.Button("Generate video", variant="primary")
+
+         with gr.Column():
+             video_out = gr.Video(
+                 label="Generated video",
+                 format="mp4",
+                 autoplay=True,
+                 loop=True,
+             )
+
+     generate_btn.click(
+         fn=generate_video_from_image,
+         inputs=[image_in, tense_choice, num_inference_steps],
+         outputs=video_out,
+         api_name="predict",
+     )
+
+ if __name__ == "__main__":
+     demo.launch()
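For reference, since the click handler is registered with `api_name="predict"`, the Space can also be driven programmatically. A minimal sketch using `gradio_client`, assuming the Space is published as `tedlasai/blur2vid` and that a local `blurry_photo.jpg` exists:

```python
# Sketch: call the deployed demo remotely via gradio_client.
# The Space id and the local image path are assumptions.
from gradio_client import Client, handle_file

client = Client("tedlasai/blur2vid")
result = client.predict(
    handle_file("blurry_photo.jpg"),   # image_in
    "past, present and future",        # tense_choice
    20,                                # num_inference_steps
    api_name="/predict",
)
print(result)  # reference to the generated mp4 returned by the endpoint
```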
extra/compute_metrics.py ADDED
@@ -0,0 +1,136 @@
+ import torchmetrics
+ import os
+ import torch
+ from PIL import Image
+ import numpy as np
+ import csv
+ import sys
+
+ num_positions = 9
+
+ output_dir_path = "/datasets/sai/focal-burst-learning/metrics_output"
+
+
+
+ gt = "gt"
+ model = sys.argv[1]
+
+ gt_path = os.path.join(output_dir_path, gt)
+ model_path = os.path.join(output_dir_path, model)
+
+ device = sys.argv[2]
+
+ metrics_grid = []
+
+ for i in range(num_positions):
+     row = []
+     for j in range(num_positions):
+         metrics = {
+             "psnr": torchmetrics.image.PeakSignalNoiseRatio(data_range=1.0).to(device),
+             "ssim": torchmetrics.image.StructuralSimilarityIndexMeasure().to(device),
+             "lpips": torchmetrics.image.lpip.LearnedPerceptualImagePatchSimilarity(net_type='vgg', normalize=True).to(device),
+             "fid": torchmetrics.image.fid.FrechetInceptionDistance(normalize=True).to(device),
+             "vif": torchmetrics.image.VisualInformationFidelity().to(device),
+         }
+         row.append(metrics)
+     metrics_grid.append(row)
+     print("Created metrics for position", i)
+
+ # loop through each directory in gt_path
+ # get all directories in gt_path
+ position_dirs = os.listdir(gt_path)
+ position_dirs = sorted([dir for dir in position_dirs if os.path.isdir(os.path.join(gt_path, dir))])[0:num_positions]
+
+ for gt_dir in position_dirs:
+     position_number = int(gt_dir.split("_")[1])
+     # get the pngs inside that directory
+     gt_pngs = sorted(os.listdir(os.path.join(gt_path, gt_dir, "images")))
+     # confirm that the number of pngs == 164*9
+     assert len(gt_pngs) == 164*9
+     # loop through the 164 images
+     for i in range(164):
+         # get the 9 frames
+         gt_frames_names = gt_pngs[i*9:(i+1)*9]
+         # load the 9 frames
+         gt_frames = [Image.open(os.path.join(gt_path, gt_dir, "images", frame)) for frame in gt_frames_names]
+         # make into float tensors in [0, 1], shaped (1, C, H, W)
+         gt_frames = [torch.tensor(np.array(frame)/255).to(torch.float32).to(device).permute(2,0,1).unsqueeze(0) for frame in gt_frames]
+
+         # load model_frames from the same relative path, but under model_path
+         model_frames = [Image.open(os.path.join(model_path, gt_dir, "images", frame)) for frame in gt_frames_names]
+         # make into float tensors in [0, 1], shaped (1, C, H, W)
+         model_frames = [torch.tensor(np.array(frame)/255).to(torch.float32).to(device).permute(2,0,1).unsqueeze(0) for frame in model_frames]
+
+         # loop through the 9 frames
+         for j in range(num_positions):
+             # compute metrics
+             for key, metric in metrics_grid[position_number][j].items():
+                 # if frames have a 4th (alpha) channel, discard it
+                 if gt_frames[j].shape[1] == 4:
+                     gt_frames[j] = gt_frames[j][:,:3,:,:]
+                 if model_frames[j].shape[1] == 4:
+                     model_frames[j] = model_frames[j][:,:3,:,:]
+                 if key == "fid":
+                     metric.update(model_frames[j], real=False)
+                     metric.update(gt_frames[j], real=True)
+                 else:
+                     metric(gt_frames[j], model_frames[j])
+
+         print("Computed metrics for position", position_number, "frame", i)
+
+ # write the metrics to csv files (one csv per metric)
+
+ def write_metrics_to_csv(metrics_grid, metric_names, formatting_options=None, output_dir="metrics_output"):
+     """
+     Writes each metric in the metrics_grid to a separate CSV file.
+
+     Args:
+         metrics_grid (list): A 9x9 list of dictionaries containing metrics.
+         metric_names (list): List of metric names (e.g., ["psnr", "lpips", "fid"]).
+         output_dir (str): Directory where the CSV files will be saved.
+     """
+     import os
+     os.makedirs(output_dir, exist_ok=True)  # Create output directory if it doesn't exist
+
+     positions = list(range(1, num_positions+1))
+
+     for metric_name in metric_names:
+         output_file = os.path.join(output_dir, f"{metric_name}.csv")
+
+         # Get the formatting function for the current metric, or use the default
+         format_fn = formatting_options.get(metric_name, lambda x: f"{x}") if formatting_options else lambda x: f"{x}"
+
+
+         # Write the metric to the CSV
+         with open(output_file, mode='w', newline='') as csv_file:
+             writer = csv.writer(csv_file)
+
+             header = ["Starting Position/End Position"] + [f"Position {i}" for i in positions]
+             writer.writerow(header)
+
+             # Iterate over the grid and extract the metric values
+             for i, row in enumerate(metrics_grid):
+                 csv_row = [f"Position {positions[i]}"]  # Add the row label as the first column
+                 for cell in row:
+                     metric = cell[metric_name]
+                     # torchmetrics objects expose a `compute` method;
+                     # fall back to 0.0 if a cell was never updated
+                     value = 0.0 if not hasattr(metric, "compute") else metric.compute().item()
+                     csv_row.append(format_fn(value))  # Format the value
+                 writer.writerow(csv_row)
+                 print(f"Wrote row for position {positions[i]} with metric {metric_name}")
+
+         print(f"Saved {metric_name} metrics to {output_file}")
+
+ formatting_options = {
+     "psnr": lambda x: f"{x:.2f}",   # Two decimal places
+     "lpips": lambda x: f"{x:.4f}",  # Four decimal places
+     "fid": lambda x: f"{x:.2f}",    # Two decimal places
+     "ssim": lambda x: f"{x:.4f}",   # Four decimal places
+     "vif": lambda x: f"{x:.4f}"     # Four decimal places
+ }
+
+
+
+ write_metrics_to_csv(metrics_grid, ["psnr", "ssim", "lpips", "fid", "vif"], formatting_options=formatting_options, output_dir=f"{output_dir_path}/metrics_output/{model}")
+
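The metric loop above mixes two torchmetrics calling conventions: the pairwise metrics (PSNR, SSIM, LPIPS, VIF) are called directly on each (gt, model) pair, while FID accumulates separate real/fake distributions via `update(..., real=...)` and only yields a value on `compute()`. A small self-contained sketch of the two patterns on random placeholder tensors (the values on noise are meaningless; this only illustrates the API):

```python
# Sketch: the two torchmetrics usage patterns used in compute_metrics.py.
import torch
import torchmetrics

psnr = torchmetrics.image.PeakSignalNoiseRatio(data_range=1.0)
fid = torchmetrics.image.fid.FrechetInceptionDistance(normalize=True)

preds = torch.rand(8, 3, 128, 128)   # stand-in for model frames in [0, 1]
target = torch.rand(8, 3, 128, 128)  # stand-in for ground-truth frames in [0, 1]

psnr(preds, target)            # updates internal state and returns the per-batch value
fid.update(preds, real=False)  # accumulate generated images
fid.update(target, real=True)  # accumulate reference images

print("PSNR:", psnr.compute().item())  # aggregate over everything seen so far
print("FID:", fid.compute().item())    # distributional, so only compute() is meaningful
```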
extra/download_dataset.py ADDED
File without changes
setup/checkpoints_to_hf.py ADDED
@@ -0,0 +1,12 @@
+ from huggingface_hub import HfApi
+ import os
+ # Run with HF_TOKEN=your_hf_token set in the environment before the python command
+ api = HfApi(token=os.getenv("HF_TOKEN"))
+ folders = ["/datasets/sai/focal-burst-learning/svd/checkpoints/checkpoint-200000"]
+ for folder in folders:
+     api.upload_folder(
+         folder_path=folder,
+         repo_id="tedlasai/learn2refocus",
+         repo_type="model",
+         path_in_repo=os.path.basename(folder)
+     )
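A quick way to confirm the upload landed is to list the repo contents afterwards. A small sketch using `HfApi.list_repo_files`, assuming the same `HF_TOKEN` is set:

```python
# Sketch: verify the uploaded checkpoint folder is present in the model repo.
from huggingface_hub import HfApi
import os

api = HfApi(token=os.getenv("HF_TOKEN"))
files = api.list_repo_files("tedlasai/learn2refocus", repo_type="model")
for f in files:
    if f.startswith("checkpoint-200000/"):
        print(f)
```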
setup/download_checkpoints.py ADDED
@@ -0,0 +1,35 @@
+ from huggingface_hub import snapshot_download
+ import os
+ import sys
+ # Make sure HF_TOKEN is set in your env beforehand:
+ # export HF_TOKEN=your_hf_token
+ # The first command-line argument selects the mode (defaults to "outsidephotos")
+
+
+ mode = sys.argv[1] if len(sys.argv) > 1 else "outsidephotos"
+
+
+ REPO_ID = "tedlasai/learn2refocus"
+ REPO_TYPE = "model"
+
+
+ checkpoints = [
+     "checkpoint-200000",
+ ]
+
+ # This is the root local directory where you want everything saved
+ # (resolved relative to the location of this file)
+ LOCAL_TRAINING_ROOT = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "checkpoints")
+ os.makedirs(LOCAL_TRAINING_ROOT, exist_ok=True)
+
+ # Download only those folders from the repo and place them under LOCAL_TRAINING_ROOT
+ snapshot_download(
+     repo_id=REPO_ID,
+     repo_type=REPO_TYPE,
+     local_dir=LOCAL_TRAINING_ROOT,
+     local_dir_use_symlinks=False,
+     allow_patterns=[f"{name}/*" for name in checkpoints],
+     token=os.getenv("HF_TOKEN"),
+ )
+
+ print(f"Done! Checkpoints downloaded under: {LOCAL_TRAINING_ROOT}")
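Once downloaded, the UNet can be loaded from the local folder instead of the Hub, mirroring the `from_pretrained` call in `simplified_inference.py`. A sketch, assuming the snapshot above produced `checkpoints/checkpoint-200000/unet/`:

```python
# Sketch: load the downloaded UNet from the local checkpoints directory.
import os
from diffusers import UNetSpatioTemporalConditionModel

local_root = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "checkpoints")
unet = UNetSpatioTemporalConditionModel.from_pretrained(
    os.path.join(local_root, "checkpoint-200000"), subfolder="unet"
)
unet.eval()
print(sum(p.numel() for p in unet.parameters()) / 1e6, "M parameters")
```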
setup/download_svd_weights.py ADDED
@@ -0,0 +1,13 @@
+ from huggingface_hub import snapshot_download
+
+ save_dir = "./svdh"
+
+ # 1. Download the full model repo (weights + config + assets)
+ local_dir = snapshot_download(
+     repo_id="stabilityai/stable-video-diffusion-img2vid",
+     revision="main",
+     local_dir=save_dir,
+     local_dir_use_symlinks=False  # ensures files are fully copied, not symlinked
+ )
+
+ print(f"Model downloaded to: {local_dir}")
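The resulting folder can then be handed to diffusers as a local model path. A minimal sketch, assuming a CUDA device and the fp16 variant included in the snapshot above:

```python
# Sketch: load the locally downloaded SVD weights instead of the Hub id.
import torch
from diffusers import StableVideoDiffusionPipeline

pipe = StableVideoDiffusionPipeline.from_pretrained(
    "./svdh", torch_dtype=torch.float16, variant="fp16"
)
pipe.to("cuda")  # drop the fp16/cuda settings for CPU-only use
```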
setup/environment.yaml ADDED
@@ -0,0 +1,225 @@
+ name: refocus
+ channels:
+   - conda-forge
+   - defaults
+ dependencies:
+   - _libgcc_mutex=0.1=main
+   - _openmp_mutex=5.1=1_gnu
+   - asttokens=3.0.0=pyhd8ed1ab_1
+   - bzip2=1.0.8=h5eee18b_6
+   - ca-certificates=2025.4.26=hbd8a1cb_0
+   - comm=0.2.2=pyhd8ed1ab_1
+   - debugpy=1.6.0=py310hd8f1fbe_0
+   - entrypoints=0.4=pyhd8ed1ab_1
+   - exceptiongroup=1.2.2=pyhd8ed1ab_1
+   - executing=2.2.0=pyhd8ed1ab_0
+   - ffmpeg=4.3.2=hca11adc_0
+   - freetype=2.10.4=h0708190_1
+   - gmp=6.2.1=h58526e2_0
+   - gnutls=3.6.13=h85f3911_1
+   - ipykernel=6.20.2=pyh210e3f2_0
+   - ipython=8.36.0=pyh907856f_0
+   - jedi=0.19.2=pyhd8ed1ab_1
+   - jupyter_client=7.3.4=pyhd8ed1ab_0
+   - jupyter_core=5.7.2=pyh31011fe_1
+   - lame=3.100=h7f98852_1001
+   - ld_impl_linux-64=2.40=h12ee557_0
+   - libevent=2.1.12=hdbd6064_1
+   - libffi=3.4.4=h6a678d5_1
+   - libgcc-ng=11.2.0=h1234567_1
+   - libgomp=11.2.0=h1234567_1
+   - libpng=1.6.37=h21135ba_2
+   - libsodium=1.0.18=h36c2ea0_1
+   - libstdcxx-ng=11.2.0=h1234567_1
+   - libuuid=1.41.5=h5eee18b_0
+   - matplotlib-inline=0.1.7=pyhd8ed1ab_1
+   - ncurses=6.4=h6a678d5_0
+   - nest-asyncio=1.6.0=pyhd8ed1ab_1
+   - nettle=3.6=he412f7d_0
+   - openh264=2.1.1=h780b84a_0
+   - openssl=3.0.16=h5eee18b_0
+   - parso=0.8.4=pyhd8ed1ab_1
+   - pexpect=4.9.0=pyhd8ed1ab_1
+   - pickleshare=0.7.5=pyhd8ed1ab_1004
+   - pip=25.0=py310h06a4308_0
+   - platformdirs=4.3.7=pyh29332c3_0
+   - prompt-toolkit=3.0.51=pyha770c72_0
+   - ptyprocess=0.7.0=pyhd8ed1ab_1
+   - pure_eval=0.2.3=pyhd8ed1ab_1
+   - pygments=2.19.1=pyhd8ed1ab_0
+   - python=3.10.16=he870216_1
+   - python-dateutil=2.9.0.post0=pyhff2d567_1
+   - python_abi=3.10=2_cp310
+   - pyzmq=23.0.0=py310h330234f_0
+   - readline=8.2=h5eee18b_0
+   - setuptools=75.8.0=py310h06a4308_0
+   - six=1.17.0=pyhd8ed1ab_0
+   - sqlite=3.45.3=h5eee18b_0
+   - stack_data=0.6.3=pyhd8ed1ab_1
+   - tk=8.6.14=h39e8969_0
+   - tmux=3.3a=h5eee18b_1
+   - tornado=6.1=py310h5764c6d_3
+   - traitlets=5.14.3=pyhd8ed1ab_1
+   - typing_extensions=4.13.2=pyh29332c3_0
+   - wcwidth=0.2.13=pyhd8ed1ab_1
+   - wheel=0.45.1=py310h06a4308_0
+   - x264=1!161.3030=h7f98852_1
+   - xz=5.6.4=h5eee18b_1
+   - zeromq=4.3.4=h9c3ff4c_1
+   - zlib=1.2.13=h5eee18b_1
+   - pip:
+     - absl-py==2.2.0
+     - accelerate==1.5.2
+     - aiofiles==23.2.1
+     - aiohappyeyeballs==2.6.1
+     - aiohttp==3.12.14
+     - aiosignal==1.4.0
+     - annotated-types==0.7.0
+     - anyio==4.9.0
+     - async-timeout==5.0.1
+     - atomicwrites==1.4.1
+     - attrs==25.3.0
+     - beautifulsoup4==4.13.4
+     - certifi==2025.1.31
+     - cffi==1.17.1
+     - charset-normalizer==3.4.1
+     - click==8.1.8
+     - colour-science==0.4.6
+     - contourpy==1.3.1
+     - controlnet-aux==0.0.9
+     - cycler==0.12.1
+     - decorator==4.4.2
+     - decord==0.6.0
+     - denku==0.0.51
+     - diffusers==0.32.0
+     - distro==1.9.0
+     - docker-pycreds==0.4.0
+     - einops==0.8.1
+     - einops-exts==0.0.4
+     - fastapi==0.115.11
+     - ffmpeg-python==0.2.0
+     - ffmpy==0.5.0
+     - filelock==3.18.0
+     - flatbuffers==25.2.10
+     - fonttools==4.56.0
+     - frozenlist==1.7.0
+     - fsspec==2025.3.0
+     - future==1.0.0
+     - gdown==5.2.0
+     - gitdb==4.0.12
+     - gitpython==3.1.44
+     - gradio==5.22.0
+     - gradio-client==1.8.0
+     - groovy==0.1.2
+     - h11==0.14.0
+     - hf-transfer==0.1.9
+     - httpcore==1.0.7
+     - httpx==0.28.1
+     - huggingface-hub==0.29.3
+     - idna==3.10
+     - imageio==2.37.0
+     - imageio-ffmpeg==0.6.0
+     - importlib-metadata==8.6.1
+     - jax==0.5.3
+     - jaxlib==0.5.3
+     - jinja2==3.1.6
+     - jiter==0.9.0
+     - kiwisolver==1.4.8
+     - lazy-loader==0.4
+     - lightning==2.5.2
+     - lightning-utilities==0.14.3
+     - markdown-it-py==3.0.0
+     - markupsafe==3.0.2
+     - matplotlib==3.10.1
+     - mdurl==0.1.2
+     - mediapipe==0.10.21
+     - ml-dtypes==0.5.1
+     - moviepy==1.0.3
+     - mpmath==1.3.0
+     - multidict==6.6.3
+     - networkx==3.4.2
+     - numpy==1.26.0
+     - nvidia-cublas-cu12==12.4.5.8
+     - nvidia-cuda-cupti-cu12==12.4.127
+     - nvidia-cuda-nvrtc-cu12==12.4.127
+     - nvidia-cuda-runtime-cu12==12.4.127
+     - nvidia-cudnn-cu12==9.1.0.70
+     - nvidia-cufft-cu12==11.2.1.3
+     - nvidia-curand-cu12==10.3.5.147
+     - nvidia-cusolver-cu12==11.6.1.9
+     - nvidia-cusparse-cu12==12.3.1.170
+     - nvidia-cusparselt-cu12==0.6.2
+     - nvidia-ml-py==12.570.86
+     - nvidia-nccl-cu12==2.21.5
+     - nvidia-nvjitlink-cu12==12.4.127
+     - nvidia-nvtx-cu12==12.4.127
+     - nvitop==1.4.2
+     - openai==1.68.2
+     - opencv-contrib-python==4.11.0.86
+     - opencv-python==4.11.0.86
+     - opencv-python-headless==4.11.0.86
+     - opt-einsum==3.4.0
+     - orjson==3.10.15
+     - packaging==24.2
+     - pandas==2.2.3
+     - peft==0.15.0
+     - pillow==9.5.0
+     - proglog==0.1.10
+     - propcache==0.3.2
+     - protobuf==4.25.6
+     - psutil==5.9.8
+     - ptflops==0.7.4
+     - pycparser==2.22
+     - pydantic==2.10.6
+     - pydantic-core==2.27.2
+     - pydub==0.25.1
+     - pyparsing==3.2.1
+     - pysocks==1.7.1
+     - python-dotenv==1.0.1
+     - python-multipart==0.0.20
+     - pytorch-lightning==2.5.2
+     - pytz==2025.1
+     - pyyaml==6.0.2
+     - regex==2024.11.6
+     - requests==2.32.3
+     - rich==13.9.4
+     - ruff==0.11.2
+     - safehttpx==0.1.6
+     - safetensors==0.5.3
+     - scikit-image==0.24.0
+     - scikit-video==1.1.11
+     - scipy==1.15.2
+     - semantic-version==2.10.0
+     - sentencepiece==0.2.0
+     - sentry-sdk==2.24.0
+     - setproctitle==1.3.5
+     - shellingham==1.5.4
+     - smmap==5.0.2
+     - sniffio==1.3.1
+     - sounddevice==0.5.1
+     - soupsieve==2.7
+     - spaces==0.32.0
+     - spandrel==0.4.1
+     - starlette==0.46.1
+     - sympy==1.13.1
+     - tifffile==2025.3.13
+     - timm==0.6.7
+     - tokenizers==0.21.1
+     - tomlkit==0.13.2
+     - torch==2.6.0
+     - torch-fidelity==0.3.0
+     - torchmetrics==1.7.4
+     - torchvision==0.21.0
+     - tqdm==4.67.1
+     - transformers==4.50.0
+     - triton==3.2.0
+     - typer==0.15.2
+     - typing-extensions==4.12.2
+     - tzdata==2025.1
+     - urllib3==2.3.0
+     - uvicorn==0.34.0
+     - videoio==0.3.0
+     - wandb==0.19.8
+     - websockets==15.0.1
+     - yarl==1.20.1
+     - zipp==3.21.0
simplified_inference.py ADDED
@@ -0,0 +1,190 @@
+ #!/usr/bin/env python
+ # coding=utf-8
+ # Copyright 2023 The HuggingFace Inc. team. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ """Simplified inference script for the fine-tuned Stable Video Diffusion model."""
+
+ import math
+ import os
+ from torch.utils.data import Dataset
+ import accelerate
+ import numpy as np
+ import torch
+ import torch.nn.functional as F
+ import torch.utils.checkpoint
+ from accelerate.logging import get_logger
+ from accelerate.utils import set_seed
+ from packaging import version
+ from tqdm.auto import tqdm
+ from transformers import CLIPVisionModelWithProjection
+ from simplified_validation import valid_net
+ from diffusers import AutoencoderKLTemporalDecoder, UNetSpatioTemporalConditionModel
+ from diffusers.utils import check_min_version
+ import argparse
+ # Will error if the minimal version of diffusers is not installed. Remove at your own risk.
+ check_min_version("0.24.0.dev0")
+
+ logger = get_logger(__name__, log_level="INFO")
+
+
+
+ def parse_args():
+     parser = argparse.ArgumentParser(description="Simplified SVD inference script")
+     parser.add_argument(
+         "--config",
+         type=str,
+         default="/datasets/sai/focal-burst-learning/svd/training/configs/outside_photos.yaml",
+         help="Path to the config file.",
+     )
+     # seed is an optional int, defaulting to 0
+
+     parser.add_argument(
+         "--image_path",
+         type=str,
+         required=True,
+         help="Path to image input or directory containing input images",
+     )
+     parser.add_argument(
+         "--seed",
+         type=int,
+         default=0,
+         help="A seed for reproducible inference.",
+     )
+
+     parser.add_argument(
+         "--learn2refocus_hf_repo_path",
+         type=str,
+         default="tedlasai/learn2refocus",
+         help="HF repo containing the weight files",
+     )
+
+     parser.add_argument(
+         "--pretrained_model_path",
+         type=str,
+         default="stabilityai/stable-video-diffusion-img2vid",
+         help="repo id or path for the pretrained Stable Video Diffusion model",
+     )
+     parser.add_argument(
+         "--output_dir",
+         type=str,
+         default="outputs/simple_inference",
+         help="path to output",
+     )
+
+     parser.add_argument(
+         "--num_inference_steps",
+         type=int,
+         default=25,
+         help="number of DDPM steps",
+     )
+
+     parser.add_argument(
+         "--device",
+         type=str,
+         default="cuda",
+         help="inference device",
+     )
+
+
+     args = parser.parse_args()
+
+     return args
+
+
+
+ def find_scale(height, width):
+     max_pixels = 500000
+
+     # Start with no scaling
+     scale = 1.0
+
+     while True:
+         # Calculate the scaled dimensions (snapped down to multiples of 64)
+         scaled_height = math.floor((height * scale) / 64) * 64
+         scaled_width = math.floor((width * scale) / 64) * 64
+
+         # Check if the scaled dimensions meet the pixel constraint
+         if scaled_height * scaled_width <= max_pixels:
+             return scaled_height, scaled_width
+
+         # Reduce the scale slightly
+         scale -= 0.01
+
+ def convert_to_batch(image, input_focal_position, sample_frames=9):
+     scene, focal_stack_num = image, input_focal_position
+     from PIL import Image
+     with Image.open(scene) as img:
+
+         icc_profile = img.info.get("icc_profile")
+         if icc_profile is None:
+             icc_profile = "none"
+         original_pixels = torch.from_numpy(np.array(img)).float().permute(2,0,1)
+         original_pixels = original_pixels / 255
+         width, height = img.size
+         scaled_width, scaled_height = find_scale(width, height)
+
+         img_resized = img.resize((scaled_width, scaled_height))
+         img_tensor = torch.from_numpy(np.array(img_resized)).float()
+         img_normalized = img_tensor / 127.5 - 1
+         img_normalized = img_normalized.permute(2, 0, 1)
+
+         pixels = torch.zeros((1, sample_frames, 3, scaled_height, scaled_width))
+         pixels[0, focal_stack_num] = img_normalized
+
+     name = os.path.splitext(os.path.basename(scene))[0]
+     return {"pixel_values": pixels, "focal_stack_num": focal_stack_num, "original_pixel_values": original_pixels, 'icc_profile': icc_profile, "name": name}
+
+ def main():
+     args = parse_args()
+
+     if args.seed is not None:
+         set_seed(args.seed)
+
+     if args.output_dir is not None:
+         os.makedirs(args.output_dir, exist_ok=True)
+
+     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+     # inference-only modules
+     image_encoder = CLIPVisionModelWithProjection.from_pretrained(
+         args.pretrained_model_path, subfolder="image_encoder"
+     )
+     vae = AutoencoderKLTemporalDecoder.from_pretrained(
+         args.pretrained_model_path, subfolder="vae", variant="fp16"
+     )
+
+     weight_dtype = torch.float32
+     image_encoder.requires_grad_(False).to(device, dtype=weight_dtype)
+     vae.requires_grad_(False).to(device, dtype=weight_dtype)
+
+     # ---- load UNet from the checkpoint root (this reads unet/config.json + diffusion_pytorch_model.safetensors)
+     unet = UNetSpatioTemporalConditionModel.from_pretrained(
+         args.learn2refocus_hf_repo_path, subfolder="checkpoint-200000/unet"
+     ).to(device)
+
+     batch = convert_to_batch(args.image_path, input_focal_position=6)
+
+     unet.eval(); image_encoder.eval(); vae.eval()
+     with torch.no_grad():
+         valid_net(args, batch, unet, image_encoder, vae, 0, weight_dtype, device, num_inference_steps=args.num_inference_steps)
+
+ if __name__ == "__main__":
+     main()
+
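For clarity, `convert_to_batch` normalizes the uint8 image to `[-1, 1]` via `x / 127.5 - 1` and places that single conditioning frame at index `focal_stack_num` of an otherwise-zero 9-frame clip. A self-contained sketch of that layout on a dummy image (shapes are illustrative, not taken from the script):

```python
# Sketch: the batch layout produced by convert_to_batch for one conditioning frame.
import torch

sample_frames = 9
focal_stack_num = 6
h, w = 576, 1024  # illustrative resolution (multiples of 64)

img_uint8 = torch.randint(0, 256, (h, w, 3), dtype=torch.uint8)  # dummy image
img_normalized = img_uint8.float() / 127.5 - 1                   # map [0, 255] -> [-1, 1]
img_normalized = img_normalized.permute(2, 0, 1)                 # HWC -> CHW

pixels = torch.zeros((1, sample_frames, 3, h, w))
pixels[0, focal_stack_num] = img_normalized  # only the conditioning slot is non-zero

print(pixels.shape)  # torch.Size([1, 9, 3, 576, 1024])
print(pixels[0, focal_stack_num].min().item(), pixels[0, focal_stack_num].max().item())
```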
simplified_pipeline.py ADDED
@@ -0,0 +1,807 @@
1
+ # Copyright 2024 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import inspect
16
+ from dataclasses import dataclass
17
+ import random
18
+ from typing import Callable, Dict, List, Optional, Union
19
+
20
+ import numpy as np
21
+ import PIL.Image
22
+ import torch
23
+ from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection
24
+
25
+ from diffusers.image_processor import PipelineImageInput
26
+ from diffusers.models import AutoencoderKLTemporalDecoder, UNetSpatioTemporalConditionModel
27
+ from diffusers.schedulers import EulerDiscreteScheduler
28
+ from diffusers.utils import BaseOutput, logging, replace_example_docstring
29
+ from diffusers.utils.torch_utils import is_compiled_module, randn_tensor
30
+ from diffusers.video_processor import VideoProcessor
31
+ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
32
+ import torch.nn.functional as F
33
+ from tqdm import tqdm
34
+ from einops import rearrange
35
+
36
+
37
+ def tensor_to_vae_latent(t, vae, otype="sample"):
38
+ video_length = t.shape[1]
39
+
40
+ t = rearrange(t, "b f c h w -> (b f) c h w")
41
+ if otype == "sample":
42
+ latents = vae.encode(t).latent_dist.sample()
43
+ else:
44
+ latents = vae.encode(t).latent_dist.mode()
45
+ latents = rearrange(latents, "(b f) c h w -> b f c h w", f=video_length)
46
+ latents = latents * vae.config.scaling_factor
47
+
48
+ return latents
49
+
50
+
51
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
52
+
53
+ EXAMPLE_DOC_STRING = """
54
+ Examples:
55
+ ```py
56
+ >>> from diffusers import StableVideoDiffusionPipeline
57
+ >>> from diffusers.utils import load_image, export_to_video
58
+
59
+ >>> pipe = StableVideoDiffusionPipeline.from_pretrained(
60
+ ... "stabilityai/stable-video-diffusion-img2vid-xt", torch_dtype=torch.float16, variant="fp16"
61
+ ... )
62
+ >>> pipe.to("cuda")
63
+
64
+ >>> image = load_image(
65
+ ... "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/svd-docstring-example.jpeg"
66
+ ... )
67
+ >>> image = image.resize((1024, 576))
68
+
69
+ >>> frames = pipe(image, num_frames=25, decode_chunk_size=8).frames[0]
70
+ >>> export_to_video(frames, "generated.mp4", fps=7)
71
+ ```
72
+ """
73
+
74
+
75
+ def _append_dims(x, target_dims):
76
+ """Appends dimensions to the end of a tensor until it has target_dims dimensions."""
77
+ dims_to_append = target_dims - x.ndim
78
+ if dims_to_append < 0:
79
+ raise ValueError(f"input has {x.ndim} dims but target_dims is {target_dims}, which is less")
80
+ return x[(...,) + (None,) * dims_to_append]
81
+
82
+
83
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
84
+ def retrieve_timesteps(
85
+ scheduler,
86
+ num_inference_steps: Optional[int] = None,
87
+ device: Optional[Union[str, torch.device]] = None,
88
+ timesteps: Optional[List[int]] = None,
89
+ sigmas: Optional[List[float]] = None,
90
+ **kwargs,
91
+ ):
92
+ r"""
93
+ Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
94
+ custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
95
+
96
+ Args:
97
+ scheduler (`SchedulerMixin`):
98
+ The scheduler to get timesteps from.
99
+ num_inference_steps (`int`):
100
+ The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
101
+ must be `None`.
102
+ device (`str` or `torch.device`, *optional*):
103
+ The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
104
+ timesteps (`List[int]`, *optional*):
105
+ Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
106
+ `num_inference_steps` and `sigmas` must be `None`.
107
+ sigmas (`List[float]`, *optional*):
108
+ Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
109
+ `num_inference_steps` and `timesteps` must be `None`.
110
+
111
+ Returns:
112
+ `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
113
+ second element is the number of inference steps.
114
+ """
115
+ if timesteps is not None and sigmas is not None:
116
+ raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
117
+ if timesteps is not None:
118
+ accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
119
+ if not accepts_timesteps:
120
+ raise ValueError(
121
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
122
+ f" timestep schedules. Please check whether you are using the correct scheduler."
123
+ )
124
+ scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
125
+ timesteps = scheduler.timesteps
126
+ num_inference_steps = len(timesteps)
127
+ elif sigmas is not None:
128
+ accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
129
+ if not accept_sigmas:
130
+ raise ValueError(
131
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
132
+ f" sigmas schedules. Please check whether you are using the correct scheduler."
133
+ )
134
+ scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
135
+ timesteps = scheduler.timesteps
136
+ num_inference_steps = len(timesteps)
137
+ else:
138
+ scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
139
+ timesteps = scheduler.timesteps
140
+ return timesteps, num_inference_steps
141
+
142
+
143
+ @dataclass
144
+ class StableVideoDiffusionPipelineOutput(BaseOutput):
145
+ r"""
146
+ Output class for Stable Video Diffusion pipeline.
147
+
148
+ Args:
149
+ frames (`[List[List[PIL.Image.Image]]`, `np.ndarray`, `torch.Tensor`]):
150
+ List of denoised PIL images of length `batch_size` or numpy array or torch tensor of shape `(batch_size,
151
+ num_frames, height, width, num_channels)`.
152
+ """
153
+
154
+ frames: Union[List[List[PIL.Image.Image]], np.ndarray, torch.Tensor]
155
+
156
+
157
+ class StableVideoDiffusionPipeline(DiffusionPipeline):
158
+ r"""
159
+ Pipeline to generate video from an input image using Stable Video Diffusion.
160
+
161
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
162
+ implemented for all pipelines (downloading, saving, running on a particular device, etc.).
163
+
164
+ Args:
165
+ vae ([`AutoencoderKLTemporalDecoder`]):
166
+ Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
167
+ image_encoder ([`~transformers.CLIPVisionModelWithProjection`]):
168
+ Frozen CLIP image-encoder
169
+ ([laion/CLIP-ViT-H-14-laion2B-s32B-b79K](https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K)).
170
+ unet ([`UNetSpatioTemporalConditionModel`]):
171
+ A `UNetSpatioTemporalConditionModel` to denoise the encoded image latents.
172
+ scheduler ([`EulerDiscreteScheduler`]):
173
+ A scheduler to be used in combination with `unet` to denoise the encoded image latents.
174
+ feature_extractor ([`~transformers.CLIPImageProcessor`]):
175
+ A `CLIPImageProcessor` to extract features from generated images.
176
+ """
177
+
178
+ model_cpu_offload_seq = "image_encoder->unet->vae"
179
+ _callback_tensor_inputs = ["latents"]
180
+
181
+ def __init__(
182
+ self,
183
+ vae: AutoencoderKLTemporalDecoder,
184
+ image_encoder: CLIPVisionModelWithProjection,
185
+ unet: UNetSpatioTemporalConditionModel,
186
+ scheduler: EulerDiscreteScheduler,
187
+ feature_extractor: CLIPImageProcessor,
188
+ ):
189
+ super().__init__()
190
+
191
+ self.register_modules(
192
+ vae=vae,
193
+ image_encoder=image_encoder,
194
+ unet=unet,
195
+ scheduler=scheduler,
196
+ feature_extractor=feature_extractor,
197
+ )
198
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
199
+ self.video_processor = VideoProcessor(do_resize=True, vae_scale_factor=self.vae_scale_factor)
200
+
201
+
202
+
203
+ def _encode_image(
204
+ self,
205
+ image: PipelineImageInput,
206
+ device: Union[str, torch.device],
207
+ num_videos_per_prompt: int,
208
+ do_classifier_free_guidance: bool,
209
+ ) -> torch.Tensor:
210
+ dtype = next(self.image_encoder.parameters()).dtype
211
+
212
+ if not isinstance(image, torch.Tensor):
213
+ image = self.video_processor.pil_to_numpy(image)
214
+ image = self.video_processor.numpy_to_pt(image)
215
+
216
+ # We normalize the image before resizing to match with the original implementation.
217
+ # Then we unnormalize it after resizing.
218
+ image = image * 2.0 - 1.0
219
+ image = _resize_with_antialiasing(image, (224, 224))
220
+ image = (image + 1.0) / 2.0
221
+
222
+
223
+ # Normalize the image with for CLIP input
224
+ image = self.feature_extractor(
225
+ images=image,
226
+ do_normalize=True,
227
+ do_center_crop=False,
228
+ do_resize=False,
229
+ do_rescale=False,
230
+ return_tensors="pt",
231
+ ).pixel_values
232
+
233
+ image = image.to(device=device, dtype=dtype)
234
+ image_embeddings = self.image_encoder(image).image_embeds
235
+ image_embeddings = image_embeddings.unsqueeze(1)
236
+
237
+ # duplicate image embeddings for each generation per prompt, using mps friendly method
238
+ bs_embed, seq_len, _ = image_embeddings.shape
239
+ image_embeddings = image_embeddings.repeat(1, num_videos_per_prompt, 1)
240
+ image_embeddings = image_embeddings.view(bs_embed * num_videos_per_prompt, seq_len, -1)
241
+
242
+ if do_classifier_free_guidance:
243
+ negative_image_embeddings = torch.zeros_like(image_embeddings)
244
+
245
+ # For classifier free guidance, we need to do two forward passes.
246
+ # Here we concatenate the unconditional and text embeddings into a single batch
247
+ # to avoid doing two forward passes
248
+ image_embeddings = torch.cat([negative_image_embeddings, image_embeddings])
249
+
250
+ return image_embeddings
251
+
252
+ def _encode_vae_image(
253
+ self,
254
+ image: torch.Tensor,
255
+ device: Union[str, torch.device],
256
+ num_videos_per_prompt: int,
257
+ do_classifier_free_guidance: bool,
258
+ ):
259
+ image = image.to(device=device)
260
+ image_latents = self.vae.encode(image).latent_dist.mode()
261
+
262
+ # duplicate image_latents for each generation per prompt, using mps friendly method
263
+ image_latents = image_latents.repeat(num_videos_per_prompt, 1, 1, 1)
264
+
265
+ if do_classifier_free_guidance:
266
+ negative_image_latents = torch.zeros_like(image_latents)
267
+
268
+ # For classifier free guidance, we need to do two forward passes.
269
+ # Here we concatenate the unconditional and text embeddings into a single batch
270
+ # to avoid doing two forward passes
271
+ image_latents = torch.cat([negative_image_latents, image_latents])
272
+
273
+ return image_latents
274
+
275
+ def _get_add_time_ids(
276
+ self,
277
+ fps: int,
278
+ motion_bucket_id: int,
279
+ noise_aug_strength: float,
280
+ dtype: torch.dtype,
281
+ batch_size: int,
282
+ num_videos_per_prompt: int,
283
+ do_classifier_free_guidance: bool,
284
+ ):
285
+ add_time_ids = [fps, motion_bucket_id, noise_aug_strength]
286
+
287
+ passed_add_embed_dim = self.unet.config.addition_time_embed_dim * len(add_time_ids)
288
+ expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features
289
+
290
+ if expected_add_embed_dim != passed_add_embed_dim:
291
+ raise ValueError(
292
+ f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`."
293
+ )
294
+
295
+ add_time_ids = torch.tensor([add_time_ids], dtype=dtype)
296
+ add_time_ids = add_time_ids.repeat(batch_size * num_videos_per_prompt, 1)
297
+
298
+ if do_classifier_free_guidance:
299
+ add_time_ids = torch.cat([add_time_ids, add_time_ids])
300
+
301
+ return add_time_ids
302
+
303
+ def decode_latents(self, latents: torch.Tensor, num_frames: int, decode_chunk_size: int = 14):
304
+ # [batch, frames, channels, height, width] -> [batch*frames, channels, height, width]
305
+ latents = latents.flatten(0, 1)
306
+
307
+ latents = 1 / self.vae.config.scaling_factor * latents
308
+
309
+ forward_vae_fn = self.vae._orig_mod.forward if is_compiled_module(self.vae) else self.vae.forward
310
+ accepts_num_frames = "num_frames" in set(inspect.signature(forward_vae_fn).parameters.keys())
311
+
312
+ # decode decode_chunk_size frames at a time to avoid OOM
313
+ frames = []
314
+ for i in range(0, latents.shape[0], decode_chunk_size):
315
+ num_frames_in = latents[i : i + decode_chunk_size].shape[0]
316
+ decode_kwargs = {}
317
+ if accepts_num_frames:
318
+ # we only pass num_frames_in if it's expected
319
+ decode_kwargs["num_frames"] = num_frames_in
320
+
321
+ frame = self.vae.decode(latents[i : i + decode_chunk_size], **decode_kwargs).sample
322
+ frames.append(frame)
323
+ frames = torch.cat(frames, dim=0)
324
+
325
+ # [batch*frames, channels, height, width] -> [batch, channels, frames, height, width]
326
+ frames = frames.reshape(-1, num_frames, *frames.shape[1:]).permute(0, 2, 1, 3, 4)
327
+
328
+ # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
329
+ frames = frames.float()
330
+ return frames
331
+
332
+ def check_inputs(self, image, height, width):
333
+ if (
334
+ not isinstance(image, torch.Tensor)
335
+ and not isinstance(image, PIL.Image.Image)
336
+ and not isinstance(image, list)
337
+ ):
338
+ raise ValueError(
339
+ "`image` has to be of type `torch.Tensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is"
340
+ f" {type(image)}"
341
+ )
342
+
343
+ if height % 8 != 0 or width % 8 != 0:
344
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
345
+
346
+ def prepare_latents(
347
+ self,
348
+ batch_size: int,
349
+ num_frames: int,
350
+ num_channels_latents: int,
351
+ height: int,
352
+ width: int,
353
+ dtype: torch.dtype,
354
+ device: Union[str, torch.device],
355
+ generator: torch.Generator,
356
+ latents: Optional[torch.Tensor] = None,
357
+ ):
358
+ shape = (
359
+ batch_size,
360
+ num_frames,
361
+ num_channels_latents // 2,
362
+ height // self.vae_scale_factor,
363
+ width // self.vae_scale_factor,
364
+ )
365
+ if isinstance(generator, list) and len(generator) != batch_size:
366
+ raise ValueError(
367
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
368
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
369
+ )
370
+
371
+ if latents is None:
372
+ latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
373
+ else:
374
+ latents = latents.to(device)
375
+
376
+ # scale the initial noise by the standard deviation required by the scheduler
377
+ latents = latents * self.scheduler.init_noise_sigma
378
+ return latents
379
+
380
+ @property
381
+ def guidance_scale(self):
382
+ return self._guidance_scale
383
+
384
+ # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
385
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
386
+ # corresponds to doing no classifier free guidance.
387
+ @property
388
+ def do_classifier_free_guidance(self):
389
+ if isinstance(self.guidance_scale, (int, float)):
390
+ return self.guidance_scale > 0
391
+ return self.guidance_scale.max() > 0
392
+
393
+ @property
394
+ def num_timesteps(self):
395
+ return self._num_timesteps
396
+
397
+ #@torch.no_grad()
398
+ @replace_example_docstring(EXAMPLE_DOC_STRING)
399
+ def __call__(
400
+ self,
401
+ image: Union[PIL.Image.Image, List[PIL.Image.Image], torch.Tensor],
402
+ height: int = 576,
403
+ width: int = 1024,
404
+ num_frames: Optional[int] = None,
405
+ num_inference_steps: int = 25,
406
+ sigmas: Optional[List[float]] = None,
407
+ min_guidance_scale: float = 1.0,
408
+ max_guidance_scale: float = 3.0,
409
+ fps: int = 7,
410
+ motion_bucket_id: int = 127,
411
+ noise_aug_strength: float = 0.02,
412
+ decode_chunk_size: Optional[int] = None,
413
+ num_videos_per_prompt: Optional[int] = 1,
414
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
415
+ latents: Optional[torch.Tensor] = None,
416
+ output_type: Optional[str] = "pil",
417
+ callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
418
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
419
+ return_dict: bool = True,
420
+ focal_stack_num: int = None,
421
+ ):
422
+ r"""
423
+ The call function to the pipeline for generation.
424
+
425
+ Args:
426
+ image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.Tensor`):
427
+ Image(s) to guide image generation. If you provide a tensor, the expected value range is between `[0,
428
+ 1]`.
429
+ height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
430
+ The height in pixels of the generated image.
431
+ width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
432
+ The width in pixels of the generated image.
433
+ num_frames (`int`, *optional*):
434
+ The number of video frames to generate. Defaults to `self.unet.config.num_frames` (14 for
435
+ `stable-video-diffusion-img2vid` and to 25 for `stable-video-diffusion-img2vid-xt`).
436
+ num_inference_steps (`int`, *optional*, defaults to 25):
437
+ The number of denoising steps. More denoising steps usually lead to a higher quality video at the
438
+ expense of slower inference. This parameter is modulated by `strength`.
439
+ sigmas (`List[float]`, *optional*):
440
+ Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
441
+ their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
442
+ will be used.
443
+ min_guidance_scale (`float`, *optional*, defaults to 1.0):
444
+ The minimum guidance scale. Used for the classifier free guidance with first frame.
445
+ max_guidance_scale (`float`, *optional*, defaults to 3.0):
446
+ The maximum guidance scale. Used for the classifier free guidance with last frame.
447
+ fps (`int`, *optional*, defaults to 7):
448
+ Frames per second. The rate at which the generated images shall be exported to a video after
449
+ generation. Note that Stable Diffusion Video's UNet was micro-conditioned on fps-1 during training.
450
+ motion_bucket_id (`int`, *optional*, defaults to 127):
451
+ Used for conditioning the amount of motion for the generation. The higher the number the more motion
452
+ will be in the video.
453
+ noise_aug_strength (`float`, *optional*, defaults to 0.02):
454
+ The amount of noise added to the init image, the higher it is the less the video will look like the
455
+ init image. Increase it for more motion.
456
+ decode_chunk_size (`int`, *optional*):
457
+ The number of frames to decode at a time. Higher chunk size leads to better temporal consistency at the
458
+ expense of more memory usage. By default, the decoder decodes all frames at once for maximal quality.
459
+ For lower memory usage, reduce `decode_chunk_size`.
460
+ num_videos_per_prompt (`int`, *optional*, defaults to 1):
461
+ The number of videos to generate per prompt.
462
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
463
+ A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
464
+ generation deterministic.
465
+ latents (`torch.Tensor`, *optional*):
466
+ Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video
467
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
468
+ tensor is generated by sampling using the supplied random `generator`.
469
+ output_type (`str`, *optional*, defaults to `"pil"`):
470
+ The output format of the generated image. Choose between `pil`, `np` or `pt`.
471
+ callback_on_step_end (`Callable`, *optional*):
472
+ A function that is called at the end of each denoising step during inference. The function is called
473
+ with the following arguments:
474
+ `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`.
475
+ `callback_kwargs` will include a list of all tensors as specified by
476
+ `callback_on_step_end_tensor_inputs`.
477
+ callback_on_step_end_tensor_inputs (`List`, *optional*):
478
+ The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
479
+ will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
480
+ `._callback_tensor_inputs` attribute of your pipeline class.
481
+ return_dict (`bool`, *optional*, defaults to `True`):
482
+ Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
483
+ plain tuple.
484
+
485
+ Examples:
486
+
487
+ Returns:
488
+ [`~pipelines.stable_diffusion.StableVideoDiffusionPipelineOutput`] or `tuple`:
489
+ If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableVideoDiffusionPipelineOutput`] is
490
+ returned, otherwise a `tuple` of (`List[List[PIL.Image.Image]]` or `np.ndarray` or `torch.Tensor`) is
491
+ returned.
492
+ """
493
+
494
+ with torch.no_grad():
495
+
496
+ # 0. Default height and width to unet
497
+ height = height or self.unet.config.sample_size * self.vae_scale_factor
498
+ width = width or self.unet.config.sample_size * self.vae_scale_factor
499
+
500
+ num_frames = num_frames if num_frames is not None else self.unet.config.num_frames
501
+ decode_chunk_size = decode_chunk_size if decode_chunk_size is not None else num_frames
502
+
503
+ # 1. Check inputs. Raise error if not correct
504
+ self.check_inputs(image, height, width)
505
+
506
+ # 2. Define call parameters
507
+ if isinstance(image, PIL.Image.Image):
508
+ batch_size = 1
509
+ elif isinstance(image, list):
510
+ batch_size = len(image)
511
+ else:
512
+ batch_size = image.shape[0]
513
+ device = self._execution_device
514
+ # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
515
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
516
+ # corresponds to doing no classifier free guidance.
517
+ self._guidance_scale = max_guidance_scale
518
+
519
+
520
+
521
+ # NOTE: Stable Video Diffusion was conditioned on fps - 1, which is why it is reduced here.
522
+ # See: https://github.com/Stability-AI/generative-models/blob/ed0997173f98eaf8f4edf7ba5fe8f15c6b877fd3/scripts/sampling/simple_video_sample.py#L188
523
+ fps = fps - 1
524
+
525
+ # 4. Encode input image using VAE
526
+ # first_image = image[0, 0:1]
527
+ # first_image = self.video_processor.preprocess(first_image*0.5+0.5, height=height, width=width).to(device)
528
+ # noise = randn_tensor(first_image.shape, generator=generator, device=device, dtype=image.dtype)
529
+ # first_image = first_image + noise_aug_strength * noise #you add this noise to have a version of the image that the vae can denoise
530
+
531
+ # first_image = self.video_processor.preprocess(first_image*0.5+0.5, height=height, width=width).to(device)
532
+
533
+ needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast
534
+ if needs_upcasting:
535
+ self.vae.to(dtype=torch.float32)
536
+
537
+
538
+ image_latents = tensor_to_vae_latent(image, self.vae, otype="mode")/self.vae.config.scaling_factor
539
+ #noise = randn_tensor(image_latents.shape, generator=generator, device=device, dtype=image.dtype)
540
+ #image_latents = image_latents + noise_aug_strength * noise #you add this noise to have a version of the image that the vae can denoise
541
+
542
+ # old_image_latents = self._encode_vae_image(
543
+ # first_image,
544
+ # device=device,
545
+ # num_videos_per_prompt=num_videos_per_prompt,
546
+ # do_classifier_free_guidance=self.do_classifier_free_guidance,
547
+ # )
548
+
549
+ if self.do_classifier_free_guidance:
550
+ negative_image_latents = torch.zeros_like(image_latents)
551
+
552
+ # For classifier free guidance, we need to do two forward passes.
553
+ # Here we concatenate the unconditional and text embeddings into a single batch
554
+ # to avoid doing two forward passes
555
+ image_latents = torch.cat([negative_image_latents, image_latents])
556
+
557
+ image_latents = image_latents.to(torch.float32)
558
+
559
+ # cast back to fp16 if needed
560
+ if needs_upcasting:
561
+ self.vae.to(dtype=torch.float16)
562
+
563
+ # Repeat the image latents for each frame so we can concatenate them with the noise
564
+ # image_latents [batch, channels, height, width] ->[batch, num_frames, channels, height, width]
565
+ #image_latents = image_latents.unsqueeze(1).repeat(1, num_frames, 1, 1, 1)
566
+ mask = torch.zeros_like(image_latents)
567
+
568
+ if focal_stack_num is not None:
569
+ frame_idx = focal_stack_num
570
+ mask[:, frame_idx] = 1
571
+
572
+ original_image_latents = image_latents.clone()
573
+ image_latents = image_latents * mask
574
+
575
+ mask = mask == 1 #mask is a boolean tensor
576
+
577
+
578
+ clip_image = image[0, frame_idx: frame_idx+1]
579
+ resized_clip_image = _resize_with_antialiasing(clip_image, (224, 224))
580
+ image_embeddings = self._encode_image(resized_clip_image, device, num_videos_per_prompt, self.do_classifier_free_guidance)
581
+
582
+ if motion_bucket_id is None: #this hits for ablation_time at validation time
583
+ motion_bucket_id = 0
584
+
585
+
586
+ # 5. Get Added Time IDs
587
+ added_time_ids = self._get_add_time_ids(
588
+ fps,
589
+ motion_bucket_id,
590
+ noise_aug_strength,
591
+ image_embeddings.dtype,
592
+ batch_size,
593
+ num_videos_per_prompt,
594
+ self.do_classifier_free_guidance,
595
+ )
596
+ added_time_ids = added_time_ids.to(device)
597
+
598
+ # 6. Prepare timesteps
599
+ timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, None, sigmas)
600
+
601
+ # 7. Prepare latent variables
602
+ num_channels_latents = self.unet.config.in_channels
603
+ latents = self.prepare_latents(
604
+ batch_size * num_videos_per_prompt,
605
+ num_frames,
606
+ num_channels_latents,
607
+ height,
608
+ width,
609
+ image_embeddings.dtype,
610
+ device,
611
+ generator,
612
+ latents,
613
+ )
614
+
615
+ # 8. Prepare guidance scale
616
+ guidance_scale = torch.linspace(min_guidance_scale, max_guidance_scale, num_frames).unsqueeze(0)
617
+ guidance_scale = guidance_scale.to(device, latents.dtype)
618
+ guidance_scale = guidance_scale.repeat(batch_size * num_videos_per_prompt, 1)
619
+ guidance_scale = _append_dims(guidance_scale, latents.ndim)
620
+
621
+ self._guidance_scale = guidance_scale
622
+
623
+ # 9. Denoising loop
624
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
625
+ self._num_timesteps = len(timesteps)
626
+
627
+
628
+ alphas_cumprod = 1 / (1 + self.scheduler.sigmas**2)
629
+ alphas = alphas_cumprod / torch.cat((torch.tensor([1.0]), alphas_cumprod[:-1]))
630
+
631
+
632
+ progress_bar = tqdm(range(num_inference_steps))
633
+ for i, t in enumerate(timesteps):
634
+ # expand the latents if we are doing classifier free guidance - this is because we have the unconditional and the conditional portion
635
+ #this is concatenation along the batch dimension
636
+ latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
637
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
638
+
639
+ # Concatenate image_latents over channels dimension
640
+ latent_model_input = torch.cat([latent_model_input, image_latents], dim=2)
641
+
642
+ # predict the noise residual
643
+ with torch.no_grad():
644
+ noise_pred_uncond = self.unet(
645
+ latent_model_input[0:1],
646
+ t,
647
+ encoder_hidden_states=image_embeddings[0:1],
648
+ added_time_ids=added_time_ids[0:1],
649
+ return_dict=False,
650
+ )[0]
651
+
652
+ noise_pred_cond = self.unet(
653
+ latent_model_input[1:2],
654
+ t,
655
+ encoder_hidden_states=image_embeddings[1:2],
656
+ added_time_ids=added_time_ids[1:2],
657
+ return_dict=False,
658
+ )[0]
659
+
660
+ with torch.no_grad():
661
+ noise_pred = torch.cat([noise_pred_uncond, noise_pred_cond])
662
+
663
+ # perform guidance
664
+ if self.do_classifier_free_guidance:
665
+ noise_pred_uncond, noise_pred_cond = noise_pred.chunk(2)
666
+ noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_cond - noise_pred_uncond)
667
+ # compute the previous noisy sample x_t -> x_t-1
668
+ latents = self.scheduler.step(noise_pred, t, latents).prev_sample
669
+
670
+
671
+ with torch.no_grad():
672
+ if callback_on_step_end is not None:
673
+ callback_kwargs = {}
674
+ for k in callback_on_step_end_tensor_inputs:
675
+ callback_kwargs[k] = locals()[k]
676
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
677
+ latents = callback_outputs.pop("latents", latents)
678
+
679
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
680
+ progress_bar.update()
681
+
682
+
683
+ with torch.no_grad():
684
+ if not output_type == "latent":
685
+ # cast back to fp16 if needed
686
+ if needs_upcasting:
687
+ self.vae.to(dtype=torch.float16)
688
+ frames = self.decode_latents(latents, num_frames, decode_chunk_size)
689
+ gt = self.decode_latents(original_image_latents[1:2]*self.vae.config.scaling_factor, num_frames, decode_chunk_size)
690
+ else:
691
+ frames = latents
692
+
693
+ self.maybe_free_model_hooks()
694
+
695
+ if not return_dict:
696
+ return frames
697
+
698
+ return StableVideoDiffusionPipelineOutput(frames=frames), gt
699
+
700
+
701
+ # resizing utils
702
+ # TODO: clean up later
703
+ def _resize_with_antialiasing(input, size, interpolation="bicubic", align_corners=True):
704
+ h, w = input.shape[-2:]
705
+ factors = (h / size[0], w / size[1])
706
+
707
+ # First, we have to determine sigma
708
+ # Taken from skimage: https://github.com/scikit-image/scikit-image/blob/v0.19.2/skimage/transform/_warps.py#L171
709
+ sigmas = (
710
+ max((factors[0] - 1.0) / 2.0, 0.001),
711
+ max((factors[1] - 1.0) / 2.0, 0.001),
712
+ )
713
+
714
+ # Now kernel size. Good results are for 3 sigma, but that is kind of slow. Pillow uses 1 sigma
715
+ # https://github.com/python-pillow/Pillow/blob/master/src/libImaging/Resample.c#L206
716
+ # But they do it in the 2 passes, which gives better results. Let's try 2 sigmas for now
717
+ ks = int(max(2.0 * 2 * sigmas[0], 3)), int(max(2.0 * 2 * sigmas[1], 3))
718
+
719
+ # Make sure it is odd
720
+ if (ks[0] % 2) == 0:
721
+ ks = ks[0] + 1, ks[1]
722
+
723
+ if (ks[1] % 2) == 0:
724
+ ks = ks[0], ks[1] + 1
725
+
726
+ input = _gaussian_blur2d(input, ks, sigmas)
727
+
728
+ output = torch.nn.functional.interpolate(input, size=size, mode=interpolation, align_corners=align_corners)
729
+ return output
730
+
731
+
732
+ def _compute_padding(kernel_size):
733
+ """Compute padding tuple."""
734
+ # 4 or 6 ints: (padding_left, padding_right,padding_top,padding_bottom)
735
+ # https://pytorch.org/docs/stable/nn.html#torch.nn.functional.pad
736
+ if len(kernel_size) < 2:
737
+ raise AssertionError(kernel_size)
738
+ computed = [k - 1 for k in kernel_size]
739
+
740
+ # for even kernels we need to do asymmetric padding :(
741
+ out_padding = 2 * len(kernel_size) * [0]
742
+
743
+ for i in range(len(kernel_size)):
744
+ computed_tmp = computed[-(i + 1)]
745
+
746
+ pad_front = computed_tmp // 2
747
+ pad_rear = computed_tmp - pad_front
748
+
749
+ out_padding[2 * i + 0] = pad_front
750
+ out_padding[2 * i + 1] = pad_rear
751
+
752
+ return out_padding
753
+
754
+
755
+ def _filter2d(input, kernel):
756
+ # prepare kernel
757
+ b, c, h, w = input.shape
758
+ tmp_kernel = kernel[:, None, ...].to(device=input.device, dtype=input.dtype)
759
+
760
+ tmp_kernel = tmp_kernel.expand(-1, c, -1, -1)
761
+
762
+ height, width = tmp_kernel.shape[-2:]
763
+
764
+ padding_shape: List[int] = _compute_padding([height, width])
765
+ input = torch.nn.functional.pad(input, padding_shape, mode="reflect")
766
+
767
+ # kernel and input tensor reshape to align element-wise or batch-wise params
768
+ tmp_kernel = tmp_kernel.reshape(-1, 1, height, width)
769
+ input = input.view(-1, tmp_kernel.size(0), input.size(-2), input.size(-1))
770
+
771
+ # convolve the tensor with the kernel.
772
+ output = torch.nn.functional.conv2d(input, tmp_kernel, groups=tmp_kernel.size(0), padding=0, stride=1)
773
+
774
+ out = output.view(b, c, h, w)
775
+ return out
776
+
777
+
778
+ def _gaussian(window_size: int, sigma):
779
+ if isinstance(sigma, float):
780
+ sigma = torch.tensor([[sigma]])
781
+
782
+ batch_size = sigma.shape[0]
783
+
784
+ x = (torch.arange(window_size, device=sigma.device, dtype=sigma.dtype) - window_size // 2).expand(batch_size, -1)
785
+
786
+ if window_size % 2 == 0:
787
+ x = x + 0.5
788
+
789
+ gauss = torch.exp(-x.pow(2.0) / (2 * sigma.pow(2.0)))
790
+
791
+ return gauss / gauss.sum(-1, keepdim=True)
792
+
793
+
794
+ def _gaussian_blur2d(input, kernel_size, sigma):
795
+ if isinstance(sigma, tuple):
796
+ sigma = torch.tensor([sigma], dtype=input.dtype)
797
+ else:
798
+ sigma = sigma.to(dtype=input.dtype)
799
+
800
+ ky, kx = int(kernel_size[0]), int(kernel_size[1])
801
+ bs = sigma.shape[0]
802
+ kernel_x = _gaussian(kx, sigma[:, 1].view(bs, 1))
803
+ kernel_y = _gaussian(ky, sigma[:, 0].view(bs, 1))
804
+ out_x = _filter2d(input, kernel_x[..., None, :])
805
+ out = _filter2d(out_x, kernel_y[..., None])
806
+
807
+ return out
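
The denoising loop in the pipeline above combines the unconditional and conditional UNet predictions with a per-frame guidance scale built from `min_guidance_scale`/`max_guidance_scale` and broadcast via `_append_dims`. Below is a minimal, self-contained sketch of just that combination; all shapes and values are illustrative assumptions, not values from this commit.

```py
import torch

# illustrative shapes: [batch, frames, channels, height, width]
batch, num_frames = 1, 9
noise_pred_uncond = torch.randn(batch, num_frames, 4, 90, 160)
noise_pred_cond = torch.randn(batch, num_frames, 4, 90, 160)

# per-frame guidance scale, shape [batch, num_frames]
guidance_scale = torch.linspace(1.0, 3.0, num_frames).unsqueeze(0).repeat(batch, 1)

# append trailing singleton dims so it broadcasts against the latents (what _append_dims does)
while guidance_scale.ndim < noise_pred_uncond.ndim:
    guidance_scale = guidance_scale[..., None]

# classifier-free guidance: uncond + scale * (cond - uncond)
noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_cond - noise_pred_uncond)
print(noise_pred.shape)  # torch.Size([1, 9, 4, 90, 160])
```
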
simplified_validation.py ADDED
@@ -0,0 +1,108 @@
1
+ from simplified_pipeline import StableVideoDiffusionPipeline
2
+ import os
3
+ import torch
4
+ import numpy as np
5
+ import videoio
6
+ import matplotlib.image
7
+ from PIL import Image
8
+
9
+
10
+
11
+ def valid_net(args, batch, unet, image_encoder, vae, global_step, weight_dtype, device):
12
+
13
+ # The models need unwrapping for compatibility with distributed training mode.
14
+
15
+ pipeline = StableVideoDiffusionPipeline.from_pretrained(
16
+ args.pretrained_model_path,
17
+ unet=unet,
18
+ image_encoder=image_encoder,
19
+ vae=vae,
20
+ torch_dtype=weight_dtype,
21
+ )
22
+
23
+ pipeline.set_progress_bar_config(disable=True)
24
+
25
+ # run inference
26
+ val_save_dir = os.path.join(
27
+ args.output_dir, "validation_images")
28
+
29
+ print("Validation images will be saved to ", val_save_dir)
30
+
31
+ os.makedirs(val_save_dir, exist_ok=True)
32
+
33
+
34
+ num_frames = 9
35
+ unet.eval()
36
+
37
+ # torch.no_grad() disables gradient tracking here, which is what keeps validation memory in check
38
+ with torch.no_grad():
39
+ torch.cuda.empty_cache()
40
+
41
+ pixel_values = batch["pixel_values"].to(device)
42
+ original_pixel_values = batch['original_pixel_values'].to(device)
43
+ focal_stack_num = batch["focal_stack_num"]
44
+
45
+ svd_output, gt_frames = pipeline(
46
+ pixel_values,
47
+ height=pixel_values.shape[3],
48
+ width=pixel_values.shape[4],
49
+ num_frames=num_frames,
50
+ decode_chunk_size=8,
51
+ motion_bucket_id=0,
52
+ min_guidance_scale=1.5,
53
+ max_guidance_scale=1.5,
54
+ fps=7,
55
+ noise_aug_strength=0,
56
+ focal_stack_num = focal_stack_num,
57
+ num_inference_steps=args.num_inference_steps,
58
+ )
59
+ video_frames = svd_output.frames[0]
60
+ gt_frames = gt_frames[0]
61
+
62
+
63
+ with torch.no_grad():
64
+
65
+ if len(original_pixel_values.shape) == 5:
66
+ pixel_values = original_pixel_values[0] #assuming batch size is 1
67
+ else:
68
+ pixel_values = original_pixel_values.repeat(num_frames, 1, 1, 1)
69
+ pixel_values_normalized = pixel_values*0.5 + 0.5
70
+ pixel_values_normalized = torch.clamp(pixel_values_normalized,0,1)
71
+
72
+
73
+
74
+
75
+ video_frames_normalized = video_frames*0.5 + 0.5
76
+ video_frames_normalized = torch.clamp(video_frames_normalized,0,1)
77
+ video_frames_normalized = video_frames_normalized.permute(1,0,2,3)
78
+
79
+
80
+ gt_frames = torch.clamp(gt_frames,0,1)
81
+ gt_frames = gt_frames.permute(1,0,2,3)
82
+
83
+ # Resize all outputs to even height/width
84
+ video_frames_normalized = torch.nn.functional.interpolate(video_frames_normalized, ((pixel_values.shape[2]//2)*2, (pixel_values.shape[3]//2)*2), mode='bilinear')
85
+ gt_frames = torch.nn.functional.interpolate(gt_frames, ((pixel_values.shape[2]//2)*2, (pixel_values.shape[3]//2)*2), mode='bilinear')
86
+ pixel_values_normalized = torch.nn.functional.interpolate(pixel_values_normalized, ((pixel_values.shape[2]//2)*2, (pixel_values.shape[3]//2)*2), mode='bilinear')
87
+
88
+ os.makedirs(os.path.join(val_save_dir, f"position_{focal_stack_num}/videos"), exist_ok=True)
89
+ videoio.videosave(os.path.join(
90
+ val_save_dir,
91
+ f"position_{focal_stack_num}/videos/{batch['name']}.mp4",
92
+ ), video_frames_normalized.permute(0,2,3,1).cpu().numpy(), fps=5)
93
+
94
+ #save images
95
+ os.makedirs(os.path.join(val_save_dir, f"position_{focal_stack_num}/images"), exist_ok=True)
96
+ for i in range(num_frames):
97
+ #use Pillow to save images
98
+ img = Image.fromarray((video_frames_normalized[i].permute(1,2,0).cpu().numpy()*255).astype(np.uint8))
99
+ # attach the ICC profile to the image if one was provided
100
+ if batch['icc_profile'] != "none":
101
+ img.info['icc_profile'] = batch['icc_profile']
102
+ path = os.path.join(val_save_dir, f"position_{focal_stack_num}/images/{batch['name']}_frame_{i}.png")
103
+ print("Saving image to ", path)
104
+ img.save(path)
105
+ del video_frames
106
+
107
+
108
+
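
`simplified_validation.py` maps the decoded frames from [-1, 1] back to [0, 1], reorders them to [frames, channels, H, W], and resizes to even spatial dimensions before writing the video and PNGs. The snippet below is a small sketch of just that post-processing with a random tensor standing in for the decoded output; the real script resizes to the original input's size, while here the frames' own size is reused for brevity.

```py
import torch

# stand-in for decoded frames in [-1, 1]: [channels, frames, height, width]
video_frames = torch.randn(3, 9, 725, 1281).clamp(-1, 1)

frames = video_frames * 0.5 + 0.5                 # [-1, 1] -> [0, 1]
frames = frames.clamp(0, 1).permute(1, 0, 2, 3)   # -> [frames, channels, H, W]

# round spatial dimensions down to even numbers, as the validation script does
h, w = (frames.shape[2] // 2) * 2, (frames.shape[3] // 2) * 2
frames = torch.nn.functional.interpolate(frames, (h, w), mode="bilinear")
print(frames.shape)  # torch.Size([9, 3, 724, 1280])
```
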
splits/test_scenes.pkl ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3b7d6ac77b97cf4b5fa62ffa13df88fc6dec2dfe4d5fbc981b79373c4766b86a
3
+ size 4936
splits/train_scenes.pkl ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ec9f60c6cee001b10f0f0928f24d4fafc54ec6c3d9ed1e34069b3c0da0e8e570
3
+ size 44238
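
The two entries above are Git LFS pointers, so the pickle contents are not visible in this diff. Assuming they hold the train/test scene lists consumed by the dataset code, loading them is a one-liner per split; this is a sketch, not part of the commit, and the actual structure of the pickles is an assumption.

```py
import pickle

# assumed: each pickle stores the scene identifiers for its split
with open("splits/train_scenes.pkl", "rb") as f:
    train_scenes = pickle.load(f)
with open("splits/test_scenes.pkl", "rb") as f:
    test_scenes = pickle.load(f)

print(len(train_scenes), len(test_scenes))
```
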
training/configs/accelerator_config.yaml ADDED
@@ -0,0 +1,25 @@
1
+ compute_environment: LOCAL_MACHINE
2
+ main_process_port: 29501
3
+ debug: false
4
+ deepspeed_config:
5
+ gradient_accumulation_steps: 1
6
+ gradient_clipping: 1.0
7
+ offload_optimizer_device: none
8
+ offload_param_device: none
9
+ zero3_init_flag: false
10
+ zero_stage: 2
11
+ distributed_type: DEEPSPEED
12
+ downcast_bf16: 'no'
13
+ enable_cpu_affinity: false
14
+ machine_rank: 0
15
+ main_training_function: main
16
+ dynamo_backend: 'no'
17
+ mixed_precision: 'no'
18
+ num_machines: 1
19
+ num_processes: 4
20
+ rdzv_backend: static
21
+ same_network: true
22
+ tpu_env: []
23
+ tpu_use_cluster: false
24
+ tpu_use_sudo: false
25
+ use_cpu: false
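
`accelerator_config.yaml` runs 4 processes with DeepSpeed ZeRO stage 2 and a gradient-accumulation factor of 1. Combined with the per-GPU batch size in the training configs below, the effective batch size works out as in this small sketch; the paths follow the repo layout in this diff and PyYAML is assumed to be installed.

```py
import yaml  # PyYAML, assumed available in this environment

with open("training/configs/accelerator_config.yaml") as f:
    accel = yaml.safe_load(f)
with open("training/configs/focal_stacks_train.yaml") as f:
    train = yaml.safe_load(f)

effective_batch_size = (
    train["per_gpu_batch_size"]                                  # 1
    * accel["num_processes"]                                     # 4
    * accel["deepspeed_config"]["gradient_accumulation_steps"]   # 1
)
print(effective_batch_size)  # 4
```
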
training/configs/focal_stacks_test.yaml ADDED
@@ -0,0 +1,47 @@
1
+ data_folder: "/datasets/sai/scenes_merged"
2
+ splits_dir: "./splits" #all split.pkl files are stored here
3
+ pretrained_model_name_or_path: "./svdh"
4
+ load_from_checkpoint: "./checkpoints/checkpoint-200000"
5
+ output_dir: "./outputs/focal_stacks_test"
6
+ wandb_project: "RefocusingSVD"
7
+ run_name: "focal_stacks_test"
8
+ test: true
9
+ revision: null
10
+ num_frames: 9
11
+ num_validation_images: 1
12
+ validation_steps: 1000
13
+ photos: false
14
+ conditioning: "random"
15
+ seed: 0
16
+ per_gpu_batch_size: 1
17
+ num_train_epochs: 600
18
+ max_train_steps: null
19
+ gradient_accumulation_steps: 1
20
+ gradient_checkpointing: false
21
+ learning_rate: 0.00001
22
+ reconstruction_guidance: 0
23
+ scale_lr: true
24
+ lr_scheduler: "constant"
25
+ lr_warmup_steps: 0
26
+ conditioning_dropout_prob: 0.1
27
+ use_8bit_adam: false
28
+ allow_tf32: false
29
+ use_ema: false
30
+ non_ema_revision: null
31
+ num_workers: 32
32
+ adam_beta1: 0.9
33
+ adam_beta2: 0.999
34
+ adam_weight_decay: 0.01
35
+ adam_epsilon: 0.0000001
36
+ max_grad_norm: 1.0
37
+ push_to_hub: false
38
+ hub_token: null
39
+ hub_model_id: null
40
+ logging_dir: "logs"
41
+ mixed_precision: null
42
+ report_to: "wandb"
43
+ local_rank: -1
44
+ checkpointing_steps: 500
45
+ checkpoints_total_limit: 2
46
+ enable_xformers_memory_efficient_attention: false
47
+ pretrain_unet: null
training/configs/focal_stacks_train.yaml ADDED
@@ -0,0 +1,47 @@
1
+ data_folder: "/datasets/sai/scenes_merged"
2
+ splits_dir: "./splits" #all split.pkl files are stored here
3
+ pretrained_model_name_or_path: "./svdh"
4
+ load_from_checkpoint: null
5
+ output_dir: "./outputs/focal_stacks_train"
6
+ wandb_project: "RefocusingSVD"
7
+ run_name: "focal_stacks_train"
8
+ test: false
9
+ revision: null
10
+ num_frames: 9
11
+ num_validation_images: 1
12
+ validation_steps: 1000
13
+ photos: false
14
+ conditioning: "random"
15
+ seed: 0
16
+ per_gpu_batch_size: 1
17
+ num_train_epochs: 600
18
+ max_train_steps: null
19
+ gradient_accumulation_steps: 1
20
+ gradient_checkpointing: false
21
+ learning_rate: 0.00001
22
+ reconstruction_guidance: 0
23
+ scale_lr: true
24
+ lr_scheduler: "constant"
25
+ lr_warmup_steps: 0
26
+ conditioning_dropout_prob: 0.1
27
+ use_8bit_adam: false
28
+ allow_tf32: false
29
+ use_ema: false
30
+ non_ema_revision: null
31
+ num_workers: 32
32
+ adam_beta1: 0.9
33
+ adam_beta2: 0.999
34
+ adam_weight_decay: 0.01
35
+ adam_epsilon: 0.0000001
36
+ max_grad_norm: 1.0
37
+ push_to_hub: false
38
+ hub_token: null
39
+ hub_model_id: null
40
+ logging_dir: "logs"
41
+ mixed_precision: null
42
+ report_to: "wandb"
43
+ local_rank: -1
44
+ checkpointing_steps: 500
45
+ checkpoints_total_limit: 2
46
+ enable_xformers_memory_efficient_attention: false
47
+ pretrain_unet: null
training/configs/outside_photos.yaml ADDED
@@ -0,0 +1,46 @@
1
+ photos: true # Use outside photos
2
+ data_folder: "./photos"
3
+ pretrained_model_name_or_path: "./svdh"
4
+ load_from_checkpoint: "./checkpoints/checkpoint-200000"
5
+ output_dir: "./outputs/outside_photos"
6
+ wandb_project: "RefocusingSVD"
7
+ run_name: "outside_photos"
8
+ test: true
9
+ revision: null
10
+ num_frames: 9
11
+ num_validation_images: 1
12
+ validation_steps: 1000
13
+ conditioning: "random"
14
+ seed: 0
15
+ per_gpu_batch_size: 1
16
+ num_train_epochs: 600
17
+ max_train_steps: null
18
+ gradient_accumulation_steps: 1
19
+ gradient_checkpointing: false
20
+ learning_rate: 0.00001
21
+ reconstruction_guidance: 0
22
+ scale_lr: true
23
+ lr_scheduler: "constant"
24
+ lr_warmup_steps: 0
25
+ conditioning_dropout_prob: 0.1
26
+ use_8bit_adam: false
27
+ allow_tf32: false
28
+ use_ema: false
29
+ non_ema_revision: null
30
+ num_workers: 32
31
+ adam_beta1: 0.9
32
+ adam_beta2: 0.999
33
+ adam_weight_decay: 0.01
34
+ adam_epsilon: 0.0000001
35
+ max_grad_norm: 1.0
36
+ push_to_hub: false
37
+ hub_token: null
38
+ hub_model_id: null
39
+ logging_dir: "logs"
40
+ mixed_precision: null
41
+ report_to: "wandb"
42
+ local_rank: -1
43
+ checkpointing_steps: 500
44
+ checkpoints_total_limit: 2
45
+ enable_xformers_memory_efficient_attention: false
46
+ pretrain_unet: null
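
All three training configs use `conditioning: "random"`, which the pipeline in `training/svd_pipeline.py` below turns into a one-frame conditioning mask over the latent stack. The snippet is a minimal sketch of that masking step; the tensor shapes are illustrative assumptions.

```py
import numpy as np
import torch

# illustrative latent stack: [batch, frames, channels, height, width]
num_frames = 9
image_latents = torch.randn(1, num_frames, 4, 90, 160)

# "random" conditioning: keep exactly one frame of the stack, zero the rest
mask = torch.zeros_like(image_latents)
frame_idx = np.random.randint(0, num_frames)
mask[:, frame_idx] = 1

conditioned_latents = image_latents * mask
print(frame_idx, conditioned_latents.abs().sum(dim=(2, 3, 4)))  # only one frame is non-zero
```
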
training/svd_pipeline.py ADDED
@@ -0,0 +1,828 @@
1
+ # Copyright 2024 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import inspect
16
+ from dataclasses import dataclass
17
+ import random
18
+ from typing import Callable, Dict, List, Optional, Union
19
+
20
+ import numpy as np
21
+ import PIL.Image
22
+ import torch
23
+ from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection
24
+
25
+ from diffusers.image_processor import PipelineImageInput
26
+ from diffusers.models import AutoencoderKLTemporalDecoder, UNetSpatioTemporalConditionModel
27
+ from diffusers.schedulers import EulerDiscreteScheduler
28
+ from diffusers.utils import BaseOutput, logging, replace_example_docstring
29
+ from diffusers.utils.torch_utils import is_compiled_module, randn_tensor
30
+ from diffusers.video_processor import VideoProcessor
31
+ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
32
+ import torch.nn.functional as F
33
+ from tqdm import tqdm
34
+ from utils import tensor_to_vae_latent
35
+
36
+
37
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
38
+
39
+ EXAMPLE_DOC_STRING = """
40
+ Examples:
41
+ ```py
42
+ >>> from diffusers import StableVideoDiffusionPipeline
43
+ >>> from diffusers.utils import load_image, export_to_video
44
+
45
+ >>> pipe = StableVideoDiffusionPipeline.from_pretrained(
46
+ ... "stabilityai/stable-video-diffusion-img2vid-xt", torch_dtype=torch.float16, variant="fp16"
47
+ ... )
48
+ >>> pipe.to("cuda")
49
+
50
+ >>> image = load_image(
51
+ ... "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/svd-docstring-example.jpeg"
52
+ ... )
53
+ >>> image = image.resize((1024, 576))
54
+
55
+ >>> frames = pipe(image, num_frames=25, decode_chunk_size=8).frames[0]
56
+ >>> export_to_video(frames, "generated.mp4", fps=7)
57
+ ```
58
+ """
59
+
60
+
61
+ def _append_dims(x, target_dims):
62
+ """Appends dimensions to the end of a tensor until it has target_dims dimensions."""
63
+ dims_to_append = target_dims - x.ndim
64
+ if dims_to_append < 0:
65
+ raise ValueError(f"input has {x.ndim} dims but target_dims is {target_dims}, which is less")
66
+ return x[(...,) + (None,) * dims_to_append]
67
+
68
+
69
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
70
+ def retrieve_timesteps(
71
+ scheduler,
72
+ num_inference_steps: Optional[int] = None,
73
+ device: Optional[Union[str, torch.device]] = None,
74
+ timesteps: Optional[List[int]] = None,
75
+ sigmas: Optional[List[float]] = None,
76
+ **kwargs,
77
+ ):
78
+ r"""
79
+ Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
80
+ custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
81
+
82
+ Args:
83
+ scheduler (`SchedulerMixin`):
84
+ The scheduler to get timesteps from.
85
+ num_inference_steps (`int`):
86
+ The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
87
+ must be `None`.
88
+ device (`str` or `torch.device`, *optional*):
89
+ The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
90
+ timesteps (`List[int]`, *optional*):
91
+ Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
92
+ `num_inference_steps` and `sigmas` must be `None`.
93
+ sigmas (`List[float]`, *optional*):
94
+ Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
95
+ `num_inference_steps` and `timesteps` must be `None`.
96
+
97
+ Returns:
98
+ `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
99
+ second element is the number of inference steps.
100
+ """
101
+ if timesteps is not None and sigmas is not None:
102
+ raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
103
+ if timesteps is not None:
104
+ accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
105
+ if not accepts_timesteps:
106
+ raise ValueError(
107
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
108
+ f" timestep schedules. Please check whether you are using the correct scheduler."
109
+ )
110
+ scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
111
+ timesteps = scheduler.timesteps
112
+ num_inference_steps = len(timesteps)
113
+ elif sigmas is not None:
114
+ accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
115
+ if not accept_sigmas:
116
+ raise ValueError(
117
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
118
+ f" sigmas schedules. Please check whether you are using the correct scheduler."
119
+ )
120
+ scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
121
+ timesteps = scheduler.timesteps
122
+ num_inference_steps = len(timesteps)
123
+ else:
124
+ scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
125
+ timesteps = scheduler.timesteps
126
+ return timesteps, num_inference_steps
127
+
128
+
129
+ @dataclass
130
+ class StableVideoDiffusionPipelineOutput(BaseOutput):
131
+ r"""
132
+ Output class for Stable Video Diffusion pipeline.
133
+
134
+ Args:
135
+ frames (`List[List[PIL.Image.Image]]`, `np.ndarray`, or `torch.Tensor`):
136
+ List of denoised PIL images of length `batch_size` or numpy array or torch tensor of shape `(batch_size,
137
+ num_frames, height, width, num_channels)`.
138
+ """
139
+
140
+ frames: Union[List[List[PIL.Image.Image]], np.ndarray, torch.Tensor]
141
+
142
+
143
+ class StableVideoDiffusionPipeline(DiffusionPipeline):
144
+ r"""
145
+ Pipeline to generate video from an input image using Stable Video Diffusion.
146
+
147
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
148
+ implemented for all pipelines (downloading, saving, running on a particular device, etc.).
149
+
150
+ Args:
151
+ vae ([`AutoencoderKLTemporalDecoder`]):
152
+ Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
153
+ image_encoder ([`~transformers.CLIPVisionModelWithProjection`]):
154
+ Frozen CLIP image-encoder
155
+ ([laion/CLIP-ViT-H-14-laion2B-s32B-b79K](https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K)).
156
+ unet ([`UNetSpatioTemporalConditionModel`]):
157
+ A `UNetSpatioTemporalConditionModel` to denoise the encoded image latents.
158
+ scheduler ([`EulerDiscreteScheduler`]):
159
+ A scheduler to be used in combination with `unet` to denoise the encoded image latents.
160
+ feature_extractor ([`~transformers.CLIPImageProcessor`]):
161
+ A `CLIPImageProcessor` to extract features from generated images.
162
+ """
163
+
164
+ model_cpu_offload_seq = "image_encoder->unet->vae"
165
+ _callback_tensor_inputs = ["latents"]
166
+
167
+ def __init__(
168
+ self,
169
+ vae: AutoencoderKLTemporalDecoder,
170
+ image_encoder: CLIPVisionModelWithProjection,
171
+ unet: UNetSpatioTemporalConditionModel,
172
+ scheduler: EulerDiscreteScheduler,
173
+ feature_extractor: CLIPImageProcessor,
174
+ ):
175
+ super().__init__()
176
+
177
+ self.register_modules(
178
+ vae=vae,
179
+ image_encoder=image_encoder,
180
+ unet=unet,
181
+ scheduler=scheduler,
182
+ feature_extractor=feature_extractor,
183
+ )
184
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
185
+ self.video_processor = VideoProcessor(do_resize=True, vae_scale_factor=self.vae_scale_factor)
186
+
187
+
188
+
189
+ def _encode_image(
190
+ self,
191
+ image: PipelineImageInput,
192
+ device: Union[str, torch.device],
193
+ num_videos_per_prompt: int,
194
+ do_classifier_free_guidance: bool,
195
+ ) -> torch.Tensor:
196
+ dtype = next(self.image_encoder.parameters()).dtype
197
+
198
+ if not isinstance(image, torch.Tensor):
199
+ image = self.video_processor.pil_to_numpy(image)
200
+ image = self.video_processor.numpy_to_pt(image)
201
+
202
+ # We normalize the image before resizing to match the original implementation.
203
+ # Then we unnormalize it after resizing.
204
+ image = image * 2.0 - 1.0
205
+ image = _resize_with_antialiasing(image, (224, 224))
206
+ image = (image + 1.0) / 2.0
207
+
208
+
209
+ # Normalize the image for CLIP input
210
+ image = self.feature_extractor(
211
+ images=image,
212
+ do_normalize=True,
213
+ do_center_crop=False,
214
+ do_resize=False,
215
+ do_rescale=False,
216
+ return_tensors="pt",
217
+ ).pixel_values
218
+
219
+ image = image.to(device=device, dtype=dtype)
220
+ image_embeddings = self.image_encoder(image).image_embeds
221
+ image_embeddings = image_embeddings.unsqueeze(1)
222
+
223
+ # duplicate image embeddings for each generation per prompt, using mps friendly method
224
+ bs_embed, seq_len, _ = image_embeddings.shape
225
+ image_embeddings = image_embeddings.repeat(1, num_videos_per_prompt, 1)
226
+ image_embeddings = image_embeddings.view(bs_embed * num_videos_per_prompt, seq_len, -1)
227
+
228
+ if do_classifier_free_guidance:
229
+ negative_image_embeddings = torch.zeros_like(image_embeddings)
230
+
231
+ # For classifier free guidance, we need to do two forward passes.
232
+ # Here we concatenate the unconditional and text embeddings into a single batch
233
+ # to avoid doing two forward passes
234
+ image_embeddings = torch.cat([negative_image_embeddings, image_embeddings])
235
+
236
+ return image_embeddings
237
+
238
+ def _encode_vae_image(
239
+ self,
240
+ image: torch.Tensor,
241
+ device: Union[str, torch.device],
242
+ num_videos_per_prompt: int,
243
+ do_classifier_free_guidance: bool,
244
+ ):
245
+ image = image.to(device=device)
246
+ image_latents = self.vae.encode(image).latent_dist.mode()
247
+
248
+ # duplicate image_latents for each generation per prompt, using mps friendly method
249
+ image_latents = image_latents.repeat(num_videos_per_prompt, 1, 1, 1)
250
+
251
+ if do_classifier_free_guidance:
252
+ negative_image_latents = torch.zeros_like(image_latents)
253
+
254
+ # For classifier free guidance, we need to do two forward passes.
255
+ # Here we concatenate the unconditional and text embeddings into a single batch
256
+ # to avoid doing two forward passes
257
+ image_latents = torch.cat([negative_image_latents, image_latents])
258
+
259
+ return image_latents
260
+
261
+ def _get_add_time_ids(
262
+ self,
263
+ fps: int,
264
+ motion_bucket_id: int,
265
+ noise_aug_strength: float,
266
+ dtype: torch.dtype,
267
+ batch_size: int,
268
+ num_videos_per_prompt: int,
269
+ do_classifier_free_guidance: bool,
270
+ ):
271
+ add_time_ids = [fps, motion_bucket_id, noise_aug_strength]
272
+
273
+ passed_add_embed_dim = self.unet.config.addition_time_embed_dim * len(add_time_ids)
274
+ expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features
275
+
276
+ if expected_add_embed_dim != passed_add_embed_dim:
277
+ raise ValueError(
278
+ f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`."
279
+ )
280
+
281
+ add_time_ids = torch.tensor([add_time_ids], dtype=dtype)
282
+ add_time_ids = add_time_ids.repeat(batch_size * num_videos_per_prompt, 1)
283
+
284
+ if do_classifier_free_guidance:
285
+ add_time_ids = torch.cat([add_time_ids, add_time_ids])
286
+
287
+ return add_time_ids
288
+
289
+ def decode_latents(self, latents: torch.Tensor, num_frames: int, decode_chunk_size: int = 14):
290
+ # [batch, frames, channels, height, width] -> [batch*frames, channels, height, width]
291
+ latents = latents.flatten(0, 1)
292
+
293
+ latents = 1 / self.vae.config.scaling_factor * latents
294
+
295
+ forward_vae_fn = self.vae._orig_mod.forward if is_compiled_module(self.vae) else self.vae.forward
296
+ accepts_num_frames = "num_frames" in set(inspect.signature(forward_vae_fn).parameters.keys())
297
+
298
+ # decode decode_chunk_size frames at a time to avoid OOM
299
+ frames = []
300
+ for i in range(0, latents.shape[0], decode_chunk_size):
301
+ num_frames_in = latents[i : i + decode_chunk_size].shape[0]
302
+ decode_kwargs = {}
303
+ if accepts_num_frames:
304
+ # we only pass num_frames_in if it's expected
305
+ decode_kwargs["num_frames"] = num_frames_in
306
+
307
+ frame = self.vae.decode(latents[i : i + decode_chunk_size], **decode_kwargs).sample
308
+ frames.append(frame)
309
+ frames = torch.cat(frames, dim=0)
310
+
311
+ # [batch*frames, channels, height, width] -> [batch, channels, frames, height, width]
312
+ frames = frames.reshape(-1, num_frames, *frames.shape[1:]).permute(0, 2, 1, 3, 4)
313
+
314
+ # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
315
+ frames = frames.float()
316
+ return frames
317
+
318
+ def check_inputs(self, image, height, width):
319
+ if (
320
+ not isinstance(image, torch.Tensor)
321
+ and not isinstance(image, PIL.Image.Image)
322
+ and not isinstance(image, list)
323
+ ):
324
+ raise ValueError(
325
+ "`image` has to be of type `torch.Tensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is"
326
+ f" {type(image)}"
327
+ )
328
+
329
+ if height % 8 != 0 or width % 8 != 0:
330
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
331
+
332
+ def prepare_latents(
333
+ self,
334
+ batch_size: int,
335
+ num_frames: int,
336
+ num_channels_latents: int,
337
+ height: int,
338
+ width: int,
339
+ dtype: torch.dtype,
340
+ device: Union[str, torch.device],
341
+ generator: torch.Generator,
342
+ latents: Optional[torch.Tensor] = None,
343
+ ):
344
+ shape = (
345
+ batch_size,
346
+ num_frames,
347
+ num_channels_latents // 2,
348
+ height // self.vae_scale_factor,
349
+ width // self.vae_scale_factor,
350
+ )
351
+ if isinstance(generator, list) and len(generator) != batch_size:
352
+ raise ValueError(
353
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
354
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
355
+ )
356
+
357
+ if latents is None:
358
+ latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
359
+ else:
360
+ latents = latents.to(device)
361
+
362
+ # scale the initial noise by the standard deviation required by the scheduler
363
+ latents = latents * self.scheduler.init_noise_sigma
364
+ return latents
365
+
366
+ @property
367
+ def guidance_scale(self):
368
+ return self._guidance_scale
369
+
370
+ # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
371
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
372
+ # corresponds to doing no classifier free guidance.
373
+ @property
374
+ def do_classifier_free_guidance(self):
375
+ if isinstance(self.guidance_scale, (int, float)):
376
+ return self.guidance_scale > 0
377
+ return self.guidance_scale.max() > 0
378
+
379
+ @property
380
+ def num_timesteps(self):
381
+ return self._num_timesteps
382
+
383
+ #@torch.no_grad()
384
+ @replace_example_docstring(EXAMPLE_DOC_STRING)
385
+ def __call__(
386
+ self,
387
+ image: Union[PIL.Image.Image, List[PIL.Image.Image], torch.Tensor],
388
+ height: int = 576,
389
+ width: int = 1024,
390
+ num_frames: Optional[int] = None,
391
+ num_inference_steps: int = 25,
392
+ sigmas: Optional[List[float]] = None,
393
+ min_guidance_scale: float = 1.0,
394
+ max_guidance_scale: float = 3.0,
395
+ reconstruction_guidance_scale: float = 2.0,
396
+ fps: int = 7,
397
+ motion_bucket_id: int = 127,
398
+ noise_aug_strength: float = 0.02,
399
+ decode_chunk_size: Optional[int] = None,
400
+ num_videos_per_prompt: Optional[int] = 1,
401
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
402
+ latents: Optional[torch.Tensor] = None,
403
+ output_type: Optional[str] = "pil",
404
+ callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
405
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
406
+ return_dict: bool = True,
407
+ conditioning: str = "zero",
408
+ focal_stack_num: int = None,
409
+ accelerator=None,
410
+ weight_dtype=None,
411
+ zero=0
412
+ ):
413
+ r"""
414
+ The call function to the pipeline for generation.
415
+
416
+ Args:
417
+ image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.Tensor`):
418
+ Image(s) to guide image generation. If you provide a tensor, the expected value range is between `[0,
419
+ 1]`.
420
+ height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
421
+ The height in pixels of the generated image.
422
+ width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
423
+ The width in pixels of the generated image.
424
+ num_frames (`int`, *optional*):
425
+ The number of video frames to generate. Defaults to `self.unet.config.num_frames` (14 for
426
+ `stable-video-diffusion-img2vid` and to 25 for `stable-video-diffusion-img2vid-xt`).
427
+ num_inference_steps (`int`, *optional*, defaults to 25):
428
+ The number of denoising steps. More denoising steps usually lead to a higher quality video at the
429
+ expense of slower inference. This parameter is modulated by `strength`.
430
+ sigmas (`List[float]`, *optional*):
431
+ Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
432
+ their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
433
+ will be used.
434
+ min_guidance_scale (`float`, *optional*, defaults to 1.0):
435
+ The minimum guidance scale. Used for the classifier free guidance with first frame.
436
+ max_guidance_scale (`float`, *optional*, defaults to 3.0):
437
+ The maximum guidance scale. Used for the classifier free guidance with last frame.
438
+ fps (`int`, *optional*, defaults to 7):
439
+ Frames per second. The rate at which the generated images shall be exported to a video after
440
+ generation. Note that Stable Diffusion Video's UNet was micro-conditioned on fps-1 during training.
441
+ motion_bucket_id (`int`, *optional*, defaults to 127):
442
+ Used for conditioning the amount of motion for the generation. The higher the number the more motion
443
+ will be in the video.
444
+ noise_aug_strength (`float`, *optional*, defaults to 0.02):
445
+ The amount of noise added to the init image, the higher it is the less the video will look like the
446
+ init image. Increase it for more motion.
447
+ decode_chunk_size (`int`, *optional*):
448
+ The number of frames to decode at a time. Higher chunk size leads to better temporal consistency at the
449
+ expense of more memory usage. By default, the decoder decodes all frames at once for maximal quality.
450
+ For lower memory usage, reduce `decode_chunk_size`.
451
+ num_videos_per_prompt (`int`, *optional*, defaults to 1):
452
+ The number of videos to generate per prompt.
453
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
454
+ A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
455
+ generation deterministic.
456
+ latents (`torch.Tensor`, *optional*):
457
+ Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video
458
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
459
+ tensor is generated by sampling using the supplied random `generator`.
460
+ output_type (`str`, *optional*, defaults to `"pil"`):
461
+ The output format of the generated image. Choose between `pil`, `np` or `pt`.
462
+ callback_on_step_end (`Callable`, *optional*):
463
+ A function that is called at the end of each denoising step during inference. The function is called
464
+ with the following arguments:
465
+ `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`.
466
+ `callback_kwargs` will include a list of all tensors as specified by
467
+ `callback_on_step_end_tensor_inputs`.
468
+ callback_on_step_end_tensor_inputs (`List`, *optional*):
469
+ The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
470
+ will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
471
+ `._callback_tensor_inputs` attribute of your pipeline class.
472
+ return_dict (`bool`, *optional*, defaults to `True`):
473
+ Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
474
+ plain tuple.
475
+
476
+ Examples:
477
+
478
+ Returns:
479
+ [`~pipelines.stable_diffusion.StableVideoDiffusionPipelineOutput`] or `tuple`:
480
+ If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableVideoDiffusionPipelineOutput`] is
481
+ returned, otherwise a `tuple` of (`List[List[PIL.Image.Image]]` or `np.ndarray` or `torch.Tensor`) is
482
+ returned.
483
+ """
484
+
485
+ with torch.no_grad():
486
+
487
+ # 0. Default height and width to unet
488
+ height = height or self.unet.config.sample_size * self.vae_scale_factor
489
+ width = width or self.unet.config.sample_size * self.vae_scale_factor
490
+
491
+ num_frames = num_frames if num_frames is not None else self.unet.config.num_frames
492
+ decode_chunk_size = decode_chunk_size if decode_chunk_size is not None else num_frames
493
+
494
+ # 1. Check inputs. Raise error if not correct
495
+ self.check_inputs(image, height, width)
496
+
497
+ # 2. Define call parameters
498
+ if isinstance(image, PIL.Image.Image):
499
+ batch_size = 1
500
+ elif isinstance(image, list):
501
+ batch_size = len(image)
502
+ else:
503
+ batch_size = image.shape[0]
504
+ device = self._execution_device
505
+ # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
506
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
507
+ # corresponds to doing no classifier free guidance.
508
+ self._guidance_scale = max_guidance_scale
509
+
510
+
511
+
512
+ # NOTE: Stable Video Diffusion was conditioned on fps - 1, which is why it is reduced here.
513
+ # See: https://github.com/Stability-AI/generative-models/blob/ed0997173f98eaf8f4edf7ba5fe8f15c6b877fd3/scripts/sampling/simple_video_sample.py#L188
514
+ fps = fps - 1
515
+
516
+ # 4. Encode input image using VAE
517
+ # first_image = image[0, 0:1]
518
+ # first_image = self.video_processor.preprocess(first_image*0.5+0.5, height=height, width=width).to(device)
519
+ # noise = randn_tensor(first_image.shape, generator=generator, device=device, dtype=image.dtype)
520
+ # first_image = first_image + noise_aug_strength * noise #you add this noise to have a version of the image that the vae can denoise
521
+
522
+ # first_image = self.video_processor.preprocess(first_image*0.5+0.5, height=height, width=width).to(device)
523
+
524
+ needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast
525
+ if needs_upcasting:
526
+ self.vae.to(dtype=torch.float32)
527
+
528
+
529
+ image_latents = tensor_to_vae_latent(image, self.vae, otype="mode")/self.vae.config.scaling_factor
530
+ #noise = randn_tensor(image_latents.shape, generator=generator, device=device, dtype=image.dtype)
531
+ #image_latents = image_latents + noise_aug_strength * noise #you add this noise to have a version of the image that the vae can denoise
532
+
533
+ # old_image_latents = self._encode_vae_image(
534
+ # first_image,
535
+ # device=device,
536
+ # num_videos_per_prompt=num_videos_per_prompt,
537
+ # do_classifier_free_guidance=self.do_classifier_free_guidance,
538
+ # )
539
+
540
+ if self.do_classifier_free_guidance:
541
+ negative_image_latents = torch.zeros_like(image_latents)
542
+
543
+ # For classifier free guidance, we need to do two forward passes.
544
+ # Here we concatenate the unconditional and text embeddings into a single batch
545
+ # to avoid doing two forward passes
546
+ image_latents = torch.cat([negative_image_latents, image_latents])
547
+
548
+ image_latents = image_latents.to(torch.float32)
549
+
550
+ # cast back to fp16 if needed
551
+ if needs_upcasting:
552
+ self.vae.to(dtype=torch.float16)
553
+
554
+ # Repeat the image latents for each frame so we can concatenate them with the noise
555
+ # image_latents [batch, channels, height, width] ->[batch, num_frames, channels, height, width]
556
+ #image_latents = image_latents.unsqueeze(1).repeat(1, num_frames, 1, 1, 1)
557
+ mask = torch.zeros_like(image_latents)
558
+
559
+ if focal_stack_num is not None:
560
+ frame_idx = focal_stack_num
561
+ mask[:, frame_idx] = 1
562
+ elif conditioning == "zero":
563
+ frame_idx = 0
564
+ mask[:, 0] = 1
565
+ elif conditioning == "random":
566
+ rand_idx = np.random.randint(0, num_frames) #randomly choose a frame index in [0, num_frames - 1] to condition on
567
+ frame_idx = rand_idx
568
+ mask[:, rand_idx] = 1
569
+ elif conditioning in ["ablate_position", "ablate_time"]:
570
+ frame_idx = 0 #zero for simple testing (this won't be hit at testing time)
571
+ elif conditioning == "five":
572
+ frame_idx = 4
573
+ mask[:, 4] = 1
574
+
575
+ original_image_latents = image_latents.clone()
576
+ if conditioning in ["ablate_position", "ablate_time"]:
577
+ image_latents = image_latents[:, frame_idx:frame_idx+1].repeat(1,num_frames, 1, 1, 1)
578
+ else:
579
+ image_latents = image_latents * mask
580
+
581
+ mask = mask == 1 #mask is a boolean tensor
582
+
583
+
584
+ clip_image = image[0, frame_idx: frame_idx+1]
585
+ resized_clip_image = _resize_with_antialiasing(clip_image, (224, 224))
586
+ image_embeddings = self._encode_image(resized_clip_image, device, num_videos_per_prompt, self.do_classifier_free_guidance)
587
+
588
+ if motion_bucket_id is None: #this hits for the "ablate_time" conditioning at validation time
589
+ motion_bucket_id = 0
590
+
591
+
592
+ # 5. Get Added Time IDs
593
+ added_time_ids = self._get_add_time_ids(
594
+ fps,
595
+ motion_bucket_id,
596
+ noise_aug_strength,
597
+ image_embeddings.dtype,
598
+ batch_size,
599
+ num_videos_per_prompt,
600
+ self.do_classifier_free_guidance,
601
+ )
602
+ added_time_ids = added_time_ids.to(device)
603
+
604
+ # 6. Prepare timesteps
605
+ timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, None, sigmas)
606
+
607
+ # 7. Prepare latent variables
608
+ num_channels_latents = self.unet.config.in_channels
609
+ latents = self.prepare_latents(
610
+ batch_size * num_videos_per_prompt,
611
+ num_frames,
612
+ num_channels_latents,
613
+ height,
614
+ width,
615
+ image_embeddings.dtype,
616
+ device,
617
+ generator,
618
+ latents,
619
+ )
620
+
621
+ # 8. Prepare guidance scale
622
+ guidance_scale = torch.linspace(min_guidance_scale, max_guidance_scale, num_frames).unsqueeze(0)
623
+ guidance_scale = guidance_scale.to(device, latents.dtype)
624
+ guidance_scale = guidance_scale.repeat(batch_size * num_videos_per_prompt, 1)
625
+ guidance_scale = _append_dims(guidance_scale, latents.ndim)
626
+
627
+ self._guidance_scale = guidance_scale
628
+
629
+ # 9. Denoising loop
630
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
631
+ self._num_timesteps = len(timesteps)
632
+
633
+
634
+ alphas_cumprod = 1 / (1 + self.scheduler.sigmas**2)
635
+ alphas = alphas_cumprod / torch.cat((torch.tensor([1.0]), alphas_cumprod[:-1]))
636
+
637
+
638
+ progress_bar = tqdm(range(num_inference_steps), disable=not accelerator.is_local_main_process)
639
+ for i, t in enumerate(timesteps):
640
+ # expand the latents if we are doing classifier free guidance - this is because we have the unconditional and the conditional portion
641
+ #this is concatenation along the batch dimension
642
+ latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
643
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
644
+
645
+ # Concatenate image_latents over channels dimension
646
+ latent_model_input = torch.cat([latent_model_input, image_latents], dim=2)
647
+
648
+ # predict the noise residual
649
+ with torch.no_grad():
650
+ noise_pred_uncond = self.unet(
651
+ latent_model_input[0:1],
652
+ t,
653
+ encoder_hidden_states=image_embeddings[0:1],
654
+ added_time_ids=added_time_ids[0:1],
655
+ return_dict=False,
656
+ )[0]
657
+
658
+ noise_pred_cond = self.unet(
659
+ latent_model_input[1:2],
660
+ t,
661
+ encoder_hidden_states=image_embeddings[1:2],
662
+ added_time_ids=added_time_ids[1:2],
663
+ return_dict=False,
664
+ )[0]
665
+
666
+ with torch.no_grad():
667
+ noise_pred = torch.cat([noise_pred_uncond, noise_pred_cond])
668
+
669
+ # perform guidance
670
+ if self.do_classifier_free_guidance:
671
+ noise_pred_uncond, noise_pred_cond = noise_pred.chunk(2)
672
+ noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_cond - noise_pred_uncond)
673
+ # compute the previous noisy sample x_t -> x_t-1
674
+ latents = self.scheduler.step(noise_pred, t, latents).prev_sample
675
+
676
+
677
+ if self.scheduler._step_index < len(timesteps) and reconstruction_guidance_scale > 0:
678
+ noise_pred = self.unet(
679
+ torch.cat([latents, image_latents[1:2]], dim=2),
680
+ t,
681
+ encoder_hidden_states=image_embeddings[1:2],
682
+ added_time_ids=added_time_ids[1:2],
683
+ return_dict=False,
684
+ )[0]
685
+ reconstructed_latent_cond = self.scheduler.step(noise_pred, t, latents).pred_original_sample #predicted x_0 given the current noise estimate
686
+ self.scheduler._step_index-=1 #undo the step-index increment from this extra scheduler.step call
687
+ reconstruction_loss = F.mse_loss((image_latents[1, mask[1]]).to(torch.float32)*self.vae.config.scaling_factor, reconstructed_latent_cond[mask[1:2]], reduction="mean") #mean squared error between the conditioning latent and its reconstruction
688
+ reconstruction_grad = torch.autograd.grad(reconstruction_loss, reconstructed_latent_cond, retain_graph=True)[0]
689
+ accelerator.backward(reconstruction_loss)
690
+ latents = latents - reconstruction_guidance_scale*alphas[self.scheduler.step_index]*reconstruction_grad
691
+
692
+ with torch.no_grad():
693
+ if callback_on_step_end is not None:
694
+ callback_kwargs = {}
695
+ for k in callback_on_step_end_tensor_inputs:
696
+ callback_kwargs[k] = locals()[k]
697
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
698
+ latents = callback_outputs.pop("latents", latents)
699
+
700
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
701
+ progress_bar.update()
702
+
703
+
704
+ with torch.no_grad():
705
+ if not output_type == "latent":
706
+ # cast back to fp16 if needed
707
+ if needs_upcasting:
708
+ self.vae.to(dtype=torch.float16)
709
+ frames = self.decode_latents(latents, num_frames, decode_chunk_size)
710
+ gt = self.decode_latents(original_image_latents[1:2]*self.vae.config.scaling_factor, num_frames, decode_chunk_size)
711
+ else:
712
+ frames = latents
713
+
714
+ self.maybe_free_model_hooks()
715
+
716
+ if not return_dict:
717
+ return frames
718
+
719
+ return StableVideoDiffusionPipelineOutput(frames=frames), gt
720
+
721
+
722
+ # resizing utils
723
+ # TODO: clean up later
724
+ def _resize_with_antialiasing(input, size, interpolation="bicubic", align_corners=True):
725
+ h, w = input.shape[-2:]
726
+ factors = (h / size[0], w / size[1])
727
+
728
+ # First, we have to determine sigma
729
+ # Taken from skimage: https://github.com/scikit-image/scikit-image/blob/v0.19.2/skimage/transform/_warps.py#L171
730
+ sigmas = (
731
+ max((factors[0] - 1.0) / 2.0, 0.001),
732
+ max((factors[1] - 1.0) / 2.0, 0.001),
733
+ )
734
+
735
+ # Now kernel size. Good results are for 3 sigma, but that is kind of slow. Pillow uses 1 sigma
736
+ # https://github.com/python-pillow/Pillow/blob/master/src/libImaging/Resample.c#L206
737
+ # But they do it in the 2 passes, which gives better results. Let's try 2 sigmas for now
738
+ ks = int(max(2.0 * 2 * sigmas[0], 3)), int(max(2.0 * 2 * sigmas[1], 3))
739
+
740
+ # Make sure it is odd
741
+ if (ks[0] % 2) == 0:
742
+ ks = ks[0] + 1, ks[1]
743
+
744
+ if (ks[1] % 2) == 0:
745
+ ks = ks[0], ks[1] + 1
746
+
747
+ input = _gaussian_blur2d(input, ks, sigmas)
748
+
749
+ output = torch.nn.functional.interpolate(input, size=size, mode=interpolation, align_corners=align_corners)
750
+ return output
751
+
752
+
753
+ def _compute_padding(kernel_size):
754
+ """Compute padding tuple."""
755
+ # 4 or 6 ints: (padding_left, padding_right,padding_top,padding_bottom)
756
+ # https://pytorch.org/docs/stable/nn.html#torch.nn.functional.pad
757
+ if len(kernel_size) < 2:
758
+ raise AssertionError(kernel_size)
759
+ computed = [k - 1 for k in kernel_size]
760
+
761
+ # for even kernels we need to do asymmetric padding :(
762
+ out_padding = 2 * len(kernel_size) * [0]
763
+
764
+ for i in range(len(kernel_size)):
765
+ computed_tmp = computed[-(i + 1)]
766
+
767
+ pad_front = computed_tmp // 2
768
+ pad_rear = computed_tmp - pad_front
769
+
770
+ out_padding[2 * i + 0] = pad_front
771
+ out_padding[2 * i + 1] = pad_rear
772
+
773
+ return out_padding
774
+
775
+
776
+ def _filter2d(input, kernel):
777
+ # prepare kernel
778
+ b, c, h, w = input.shape
779
+ tmp_kernel = kernel[:, None, ...].to(device=input.device, dtype=input.dtype)
780
+
781
+ tmp_kernel = tmp_kernel.expand(-1, c, -1, -1)
782
+
783
+ height, width = tmp_kernel.shape[-2:]
784
+
785
+ padding_shape: List[int] = _compute_padding([height, width])
786
+ input = torch.nn.functional.pad(input, padding_shape, mode="reflect")
787
+
788
+ # kernel and input tensor reshape to align element-wise or batch-wise params
789
+ tmp_kernel = tmp_kernel.reshape(-1, 1, height, width)
790
+ input = input.view(-1, tmp_kernel.size(0), input.size(-2), input.size(-1))
791
+
792
+ # convolve the tensor with the kernel.
793
+ output = torch.nn.functional.conv2d(input, tmp_kernel, groups=tmp_kernel.size(0), padding=0, stride=1)
794
+
795
+ out = output.view(b, c, h, w)
796
+ return out
797
+
798
+
799
+ def _gaussian(window_size: int, sigma):
800
+ if isinstance(sigma, float):
801
+ sigma = torch.tensor([[sigma]])
802
+
803
+ batch_size = sigma.shape[0]
804
+
805
+ x = (torch.arange(window_size, device=sigma.device, dtype=sigma.dtype) - window_size // 2).expand(batch_size, -1)
806
+
807
+ if window_size % 2 == 0:
808
+ x = x + 0.5
809
+
810
+ gauss = torch.exp(-x.pow(2.0) / (2 * sigma.pow(2.0)))
811
+
812
+ return gauss / gauss.sum(-1, keepdim=True)
813
+
814
+
815
+ def _gaussian_blur2d(input, kernel_size, sigma):
816
+ if isinstance(sigma, tuple):
817
+ sigma = torch.tensor([sigma], dtype=input.dtype)
818
+ else:
819
+ sigma = sigma.to(dtype=input.dtype)
820
+
821
+ ky, kx = int(kernel_size[0]), int(kernel_size[1])
822
+ bs = sigma.shape[0]
823
+ kernel_x = _gaussian(kx, sigma[:, 1].view(bs, 1))
824
+ kernel_y = _gaussian(ky, sigma[:, 0].view(bs, 1))
825
+ out_x = _filter2d(input, kernel_x[..., None, :])
826
+ out = _filter2d(out_x, kernel_y[..., None])
827
+
828
+ return out
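
The reconstruction-guidance branch in `__call__` above takes the gradient of an MSE between the known conditioning latent and the predicted clean latent (`pred_original_sample`) and nudges the noisy latents against it, scaled by `alpha_t`. The snippet below is a toy sketch of that update in which a differentiable surrogate stands in for the extra UNet pass; all tensors and constants are made up for illustration and this is not the pipeline's actual code path.

```py
import torch
import torch.nn.functional as F

latents = torch.randn(1, 9, 4, 16, 16)      # current noisy latents
target_latent = torch.randn(4, 16, 16)      # known conditioning latent for one frame
frame_idx, alpha_t, rec_scale = 4, 0.9, 2.0

# surrogate for pred_original_sample; in the pipeline this comes from an extra
# conditional UNet pass followed by scheduler.step(...).pred_original_sample
pred_x0 = (latents * 0.8).detach().requires_grad_(True)

loss = F.mse_loss(target_latent, pred_x0[0, frame_idx], reduction="mean")
grad = torch.autograd.grad(loss, pred_x0)[0]

# gradient step on the noisy latents, scaled as in the pipeline
latents = latents - rec_scale * alpha_t * grad
print(loss.item(), latents.shape)
```
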
training/svd_runner.py ADDED
@@ -0,0 +1,683 @@
1
+ #!/usr/bin/env python
2
+ # coding=utf-8
3
+ # Copyright 2023 The HuggingFace Inc. team. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ """Script to fine-tune Stable Video Diffusion."""
18
+
19
+ from datetime import datetime
20
+ import logging
21
+ import math
22
+ import os
23
+ import shutil
24
+ from pathlib import Path
25
+
26
+ import accelerate
27
+ import numpy as np
28
+ import torch
29
+ import torch.nn.functional as F
30
+ import torch.utils.checkpoint
31
+ from torch.utils.data import RandomSampler
32
+ import transformers
33
+ from accelerate import Accelerator
34
+ from accelerate.logging import get_logger
35
+ from accelerate.utils import ProjectConfiguration, set_seed
36
+ from huggingface_hub import create_repo, upload_folder
37
+ from packaging import version
38
+ from tqdm.auto import tqdm
39
+ from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection
40
+ from validation import valid_net
41
+ import diffusers
42
+ from svd_pipeline import StableVideoDiffusionPipeline
43
+ from diffusers.models.lora import LoRALinearLayer
44
+ from diffusers import AutoencoderKLTemporalDecoder, EulerDiscreteScheduler, UNetSpatioTemporalConditionModel
45
+ from diffusers.image_processor import VaeImageProcessor
46
+ from diffusers.optimization import get_scheduler
47
+ from diffusers.training_utils import EMAModel
48
+ from diffusers.utils import check_min_version, deprecate, is_wandb_available, load_image
49
+ from diffusers.utils.import_utils import is_xformers_available
50
+ from utils import parse_args, FocalStackDataset, OutsidePhotosDataset, rand_log_normal, tensor_to_vae_latent, load_image, _resize_with_antialiasing, encode_image, get_add_time_ids
51
+ import wandb
52
+ import random
53
+ from random import choices
54
+ # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
55
+ check_min_version("0.24.0.dev0")
56
+
57
+ logger = get_logger(__name__, log_level="INFO")
58
+
59
+ import numpy as np
60
+ import PIL.Image
61
+ import torch
62
+ from typing import Callable, Dict, List, Optional, Union
63
+ import os
64
+
65
+
66
+
67
+ def main():
68
+ args = parse_args()
69
+
70
+ # Set up the PyTorch CUDA allocator - without this I run into memory overflow
71
+ # PyTorch 2.4.1 is important for expandable_segments to work
72
+ os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
73
+
74
+ if not is_wandb_available():
75
+ raise ImportError(
76
+ "Make sure to install wandb if you want to use it for logging during training.")
77
+ import wandb
78
+
79
+
80
+ currentSecond= datetime.now().second
81
+ currentMinute = datetime.now().minute
82
+ currentHour = datetime.now().hour
83
+ currentDay = datetime.now().day
84
+ currentMonth = datetime.now().month
85
+ currentYear = datetime.now().year
86
+
87
+
88
+ if args.non_ema_revision is not None:
89
+ deprecate(
90
+ "non_ema_revision!=None",
91
+ "0.15.0",
92
+ message=(
93
+ "Downloading 'non_ema' weights from revision branches of the Hub is deprecated. Please make sure to"
94
+ " use `--variant=non_ema` instead."
95
+ ),
96
+ )
97
+ logging_dir = os.path.join(args.output_dir, args.logging_dir)
98
+ accelerator_project_config = ProjectConfiguration(
99
+ project_dir=args.output_dir, logging_dir=logging_dir)
100
+ ddp_kwargs = accelerate.DistributedDataParallelKwargs(find_unused_parameters=True)
101
+ accelerator = Accelerator(
102
+ gradient_accumulation_steps=args.gradient_accumulation_steps,
103
+ mixed_precision=args.mixed_precision,
104
+ log_with=args.report_to,
105
+ project_config=accelerator_project_config,
106
+ kwargs_handlers=[ddp_kwargs]
107
+ )
108
+
109
+ accelerator.init_trackers(
110
+ project_name=args.wandb_project,
111
+ init_kwargs={"wandb": { "name" : args.run_name}}
112
+ )
113
+
114
+ generator = torch.Generator(
115
+ device=accelerator.device).manual_seed(args.seed)
116
+
117
+
118
+
119
+
120
+ # Make one log on every process with the configuration for debugging.
121
+ logging.basicConfig(
122
+ format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
123
+ datefmt="%m/%d/%Y %H:%M:%S",
124
+ level=logging.INFO,
125
+ )
126
+ logger.info(accelerator.state, main_process_only=False)
127
+ if accelerator.is_local_main_process:
128
+ transformers.utils.logging.set_verbosity_warning()
129
+ diffusers.utils.logging.set_verbosity_info()
130
+ else:
131
+ transformers.utils.logging.set_verbosity_error()
132
+ diffusers.utils.logging.set_verbosity_error()
133
+
134
+ # If passed along, set the training seed now.
135
+ if args.seed is not None:
136
+ set_seed(args.seed)
137
+
138
+ # Handle the repository creation
139
+ if accelerator.is_main_process:
140
+ if args.output_dir is not None:
141
+ os.makedirs(args.output_dir, exist_ok=True)
142
+
143
+ if args.push_to_hub:
144
+ repo_id = create_repo(
145
+ repo_id=args.hub_model_id or Path(args.output_dir).name, exist_ok=True, token=args.hub_token
146
+ ).repo_id
147
+
148
+ # Load img encoder, tokenizer and models.
149
+ feature_extractor = CLIPImageProcessor.from_pretrained(
150
+ args.pretrained_model_name_or_path, subfolder="feature_extractor", revision=args.revision
151
+ )
152
+ image_encoder = CLIPVisionModelWithProjection.from_pretrained(
153
+ args.pretrained_model_name_or_path, subfolder="image_encoder", revision=args.revision
154
+ )
155
+ vae = AutoencoderKLTemporalDecoder.from_pretrained(
156
+ args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision, variant="fp16")
157
+ unet = UNetSpatioTemporalConditionModel.from_pretrained(
158
+ args.pretrained_model_name_or_path if args.pretrain_unet is None else args.pretrain_unet,
159
+ subfolder="unet",
160
+ low_cpu_mem_usage=True,
161
+ variant="fp16"
162
+ )
163
+
164
+ #unet= UNetSpatioTemporalConditionModel()
165
+
166
+ # Freeze vae and image_encoder
167
+ vae.requires_grad_(False)
168
+ image_encoder.requires_grad_(False)
169
+
170
+ # For mixed precision training we cast the text_encoder and vae weights to half-precision
171
+ # as these models are only used for inference, keeping weights in full precision is not required.
172
+ weight_dtype = torch.float32
173
+ if accelerator.mixed_precision == "fp16":
174
+ weight_dtype = torch.float16
175
+ elif accelerator.mixed_precision == "bf16":
176
+ weight_dtype = torch.bfloat16
177
+
178
+ # Move image_encoder and vae to gpu and cast to weight_dtype
179
+ image_encoder.to(accelerator.device, dtype=weight_dtype)
180
+ vae.to(accelerator.device, dtype=weight_dtype)
181
+
182
+ # Create EMA for the unet.
183
+ if args.use_ema:
184
+ ema_unet = EMAModel(unet.parameters(
185
+ ), model_cls=UNetSpatioTemporalConditionModel, model_config=unet.config, use_ema_warmup=True, inv_gamma=1, power=3/4)
186
+
187
+
188
+
189
+ if args.enable_xformers_memory_efficient_attention:
190
+ if is_xformers_available():
191
+ import xformers
192
+
193
+ xformers_version = version.parse(xformers.__version__)
194
+ if xformers_version == version.parse("0.0.16"):
195
+ logger.warn(
196
+ "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
197
+ )
198
+ unet.enable_xformers_memory_efficient_attention()
199
+ else:
200
+ raise ValueError(
201
+ "xformers is not available. Make sure it is installed correctly")
202
+
203
+ # `accelerate` 0.16.0 will have better support for customized saving
204
+ if version.parse(accelerate.__version__) >= version.parse("0.16.0"):
205
+ # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format
206
+ def save_model_hook(models, weights, output_dir):
207
+ if args.use_ema:
208
+ ema_unet.save_pretrained(os.path.join(output_dir, "unet_ema"))
209
+
210
+ for i, model in enumerate(models):
211
+ model.save_pretrained(os.path.join(output_dir, "unet"))
212
+
213
+
214
+ # make sure to pop weight so that corresponding model is not saved again
215
+ weights.pop()
216
+
217
+ def load_model_hook(models, input_dir):
218
+
219
+ if args.use_ema:
220
+ load_model = EMAModel.from_pretrained(os.path.join(
221
+ input_dir, "unet_ema"), UNetSpatioTemporalConditionModel)
222
+ ema_unet.load_state_dict(load_model.state_dict())
223
+ ema_unet.to(accelerator.device)
224
+ del load_model
225
+
226
+ for i in range(len(models)):
227
+ # pop models so that they are not loaded again
228
+ model = models.pop()
229
+
230
+ # load diffusers style into model
231
+ load_model = UNetSpatioTemporalConditionModel.from_pretrained(
232
+ input_dir, subfolder="unet")
233
+ model.register_to_config(**load_model.config)
234
+
235
+ model.load_state_dict(load_model.state_dict())
236
+ del load_model
237
+
238
+ accelerator.register_save_state_pre_hook(save_model_hook)
239
+ accelerator.register_load_state_pre_hook(load_model_hook)
240
+
241
+ if args.gradient_checkpointing:
242
+ unet.enable_gradient_checkpointing()
243
+
244
+ # Enable TF32 for faster training on Ampere GPUs,
245
+ # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
246
+ if args.allow_tf32:
247
+ torch.backends.cuda.matmul.allow_tf32 = True
248
+
249
+ if args.scale_lr:
250
+ args.learning_rate = (
251
+ args.learning_rate * args.gradient_accumulation_steps *
252
+ args.per_gpu_batch_size * accelerator.num_processes
253
+ )
254
+
255
+ optimizer_cls = torch.optim.AdamW
256
+
257
+ parameters_list = []
258
+
259
+
260
+ # Customize the parameters that need to be trained; if necessary, you can uncomment them yourself.
261
+ for name, param in unet.named_parameters():
262
+ # only the temporal transformer blocks are passed to the optimizer and left trainable
263
+ if 'temporal_transformer_block' in name: #or 'conv_norm_out' in name or 'conv_out' in name or 'conv_in' in name or 'spatial_res_block' in name or 'up_block' in name:
264
+ parameters_list.append(param)
265
+ param.requires_grad = True
266
+ else:
267
+ param.requires_grad = False
268
+ zero_latent = 0
269
+
270
+
271
+
272
+ optimizer = optimizer_cls(
273
+ parameters_list,
274
+ lr=args.learning_rate,
275
+ betas=(args.adam_beta1, args.adam_beta2),
276
+ weight_decay=args.adam_weight_decay,
277
+ eps=args.adam_epsilon,
278
+ )
279
+
280
+ # DataLoaders creation:
281
+ args.global_batch_size = args.per_gpu_batch_size * accelerator.num_processes
282
+
283
+ if args.photos:
284
+ train_dataset = OutsidePhotosDataset(data_folder=args.data_folder, sample_frames=args.num_frames)
285
+ val_dataset = OutsidePhotosDataset(data_folder=args.data_folder, sample_frames=args.num_frames)
286
+ else:
287
+ train_dataset = FocalStackDataset(args.data_folder, args.splits_dir, sample_frames=args.num_frames, split="train")
288
+ val_dataset = FocalStackDataset(args.data_folder, args.splits_dir, sample_frames=args.num_frames, split="val" if not args.test else "test")
289
+ sampler = RandomSampler(train_dataset)
290
+ train_dataloader = torch.utils.data.DataLoader(
291
+ train_dataset,
292
+ sampler=sampler,
293
+ batch_size=args.per_gpu_batch_size,
294
+ num_workers=args.num_workers,
295
+ drop_last=True
296
+ )
297
+ val_dataloader = torch.utils.data.DataLoader(
298
+ val_dataset,
299
+ batch_size=args.per_gpu_batch_size,
300
+ num_workers=args.num_workers,
301
+ shuffle=False,
302
+ )
303
+
304
+ # Scheduler and math around the number of training steps.
305
+ overrode_max_train_steps = False
306
+ num_update_steps_per_epoch = math.ceil(
307
+ len(train_dataloader) / args.gradient_accumulation_steps)
308
+ if args.max_train_steps is None:
309
+ args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
310
+ overrode_max_train_steps = True
311
+
312
+ lr_scheduler = get_scheduler(
313
+ args.lr_scheduler,
314
+ optimizer=optimizer,
315
+ num_warmup_steps=args.lr_warmup_steps * accelerator.num_processes,
316
+ num_training_steps=args.max_train_steps * accelerator.num_processes,
317
+ )
318
+
319
+
320
+
321
+
322
+ # Prepare everything with our `accelerator`.
323
+ unet, optimizer, lr_scheduler, train_dataloader, val_dataloader = accelerator.prepare(
324
+ unet, optimizer, lr_scheduler, train_dataloader, val_dataloader
325
+ )
326
+
327
+ if args.use_ema:
328
+ ema_unet.to(accelerator.device)
329
+
330
+
331
+
332
+ # attribute handling for models using DDP
333
+ if isinstance(unet, (torch.nn.DataParallel, torch.nn.parallel.DistributedDataParallel)):
334
+ unet = unet.module
335
+
336
+ # We need to recalculate our total training steps as the size of the training dataloader may have changed.
337
+ num_update_steps_per_epoch = math.ceil(
338
+ len(train_dataloader) / args.gradient_accumulation_steps)
339
+ if overrode_max_train_steps:
340
+ args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
341
+ # Afterwards we recalculate our number of training epochs
342
+ args.num_train_epochs = math.ceil(
343
+ args.max_train_steps / num_update_steps_per_epoch)
344
+
345
+ # We need to initialize the trackers we use, and also store our configuration.
346
+ # The trackers initialize automatically on the main process.
347
+ if accelerator.is_main_process:
348
+ accelerator.init_trackers("SVDXtend", config=vars(args))
349
+
350
+ # Train!
351
+ total_batch_size = args.per_gpu_batch_size * \
352
+ accelerator.num_processes * args.gradient_accumulation_steps
353
+
354
+ logger.info("***** Running training *****")
355
+ logger.info(f" Num examples = {len(train_dataset)}")
356
+ logger.info(f" Num Epochs = {args.num_train_epochs}")
357
+ logger.info(
358
+ f" Instantaneous batch size per device = {args.per_gpu_batch_size}")
359
+ logger.info(
360
+ f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
361
+ logger.info(
362
+ f" Gradient Accumulation steps = {args.gradient_accumulation_steps}")
363
+ logger.info(f" Total optimization steps = {args.max_train_steps}")
364
+ global_step = 0
365
+ first_epoch = 0
366
+
367
+
368
+ # Potentially load in the weights and states from a previous save
369
+ if args.load_from_checkpoint:
370
+
371
+ path = args.load_from_checkpoint
372
+ #
373
+ if path is None:
374
+ accelerator.print(
375
+ f"Checkpoint '{args.load_from_checkpoint}' does not exist. Starting a new training run."
376
+ )
377
+ args.load_from_checkpoint = None
378
+ else:
379
+ accelerator.print(f"Resuming from checkpoint {path}")
380
+ accelerator.load_state(path, strict=False)
381
+ global_step = int(os.path.basename(path).split("-")[1])
382
+
383
+ resume_global_step = global_step * args.gradient_accumulation_steps
384
+ first_epoch = global_step // num_update_steps_per_epoch
385
+
386
+ resume_step = resume_global_step % (
387
+ num_update_steps_per_epoch * args.gradient_accumulation_steps)
388
+
389
+ # Only show the progress bar once on each machine.
390
+ progress_bar = tqdm(range(global_step, args.max_train_steps),
391
+ disable=not accelerator.is_local_main_process)
392
+ progress_bar.set_description("Steps")
393
+
394
+ # print("ARGS PHOTOS: ", args.photos)
395
+ # if args.photos:
396
+ # print("MAKING OUTSIDE PHOTOS DATASET")
397
+ # train_dataset = OutsidePhotosDataset(data_folder=args.data_folder, sample_frames=args.num_frames)
398
+ # val_dataset = OutsidePhotosDataset(data_folder=args.data_folder, sample_frames=args.num_frames)
399
+
400
+ # sampler = RandomSampler(train_dataset)
401
+ # train_dataloader = torch.utils.data.DataLoader(
402
+ # train_dataset,
403
+ # sampler=sampler,
404
+ # batch_size=args.per_gpu_batch_size,
405
+ # num_workers=args.num_workers,
406
+ # drop_last=True
407
+ # )
408
+ # val_dataloader = torch.utils.data.DataLoader(
409
+ # val_dataset,
410
+ # batch_size=args.per_gpu_batch_size,
411
+ # num_workers=args.num_workers,
412
+ # shuffle=False,
413
+ # )
414
+
415
+ # train_dataloader, val_dataloader = accelerator.prepare(
416
+ # train_dataloader, val_dataloader)
417
+ if args.test:
418
+ first_epoch = 0 #just so I enter loop for test (regardless of training iterations)
419
+
420
+ for epoch in range(first_epoch, args.num_train_epochs):
421
+ train_loss = 0.0
422
+ for step, batch in enumerate(train_dataloader):
423
+ unet.train()
424
+ if not args.test:
425
+ with accelerator.accumulate(unet):
426
+ # first, convert images to latent space.
427
+ pixel_values = batch["pixel_values"].to(weight_dtype).to(
428
+ accelerator.device, non_blocking=True
429
+ )
430
+
431
+
432
+ conditional_pixel_values = pixel_values
433
+ latents = tensor_to_vae_latent(pixel_values, vae, otype="sample")
434
+
435
+ noise = torch.randn_like(latents)
436
+ bsz = latents.shape[0]
437
+
438
+ cond_sigmas = rand_log_normal(shape=[bsz,], loc=-3.0, scale=0.5).to(latents)
439
+ noise_aug_strength = cond_sigmas[0] # TODO: support batch > 1
440
+ cond_sigmas = cond_sigmas[:, None, None, None, None]
441
+
442
+ conditional_pixel_values = \
443
+ torch.randn_like(conditional_pixel_values) * cond_sigmas + conditional_pixel_values # noise augmentation on the conditioning frames; comment this line out to condition on clean frames
444
+ conditional_latents = tensor_to_vae_latent(conditional_pixel_values, vae, otype="sample")
445
+ conditional_latents = conditional_latents / vae.config.scaling_factor
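+ # tensor_to_vae_latent multiplies by vae.config.scaling_factor, so dividing here leaves the
+ # conditioning latents unscaled, matching how the SVD pipeline concatenates raw image latents.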
446
+
447
+ ## noisy conditioning: the conditioning frames receive the same noise augmentation used in SVD
448
+
449
+ # Sample a random timestep for each image
450
+ # P_mean=0.7 P_std=1.6
451
+ sigmas = rand_log_normal(shape=[bsz,], loc=0.7, scale=1.6).to(latents.device)
452
+ # Add noise to the latents according to the noise magnitude at each timestep
453
+ # (this is the forward diffusion process)
454
+ sigmas = sigmas[:, None, None, None, None]
455
+ noisy_latents = latents + noise * sigmas
456
+ timesteps = torch.Tensor(
457
+ [0.25 * sigma.log() for sigma in sigmas]).to(accelerator.device)
458
+
459
+ inp_noisy_latents = noisy_latents / ((sigmas**2 + 1) ** 0.5)
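+ # EDM preconditioning (https://arxiv.org/abs/2206.00364): the network input is scaled by
+ # c_in = 1 / sqrt(sigma^2 + 1) and the timestep embedding uses c_noise = 0.25 * ln(sigma).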
460
+
461
+
462
+ conditioning = args.conditioning
463
+ # Create a tensor of zeros with the same shape as the repeated conditional_latents
464
+ if conditioning == "zero":
465
+ random_frames = [0]
466
+ elif conditioning == "random":
467
+ #choose a random number between 0 and 8 inclusive
468
+ random_frames = [np.random.randint(0, args.num_frames)]
469
+ elif conditioning in ["ablate_position", "ablate_time"] :
470
+ random_frames = [np.random.randint(0, args.num_frames)]
471
+ elif conditioning == "ablate_single_frame":
472
+ input_random_frame = np.random.randint(0, args.num_frames)
473
+ output_random_frame = np.random.randint(0, args.num_frames)
474
+ elif conditioning == "random_single_double_triple":
475
+ num_imgs = random.randint(1, 3)
476
+ random_frames = choices(range(args.num_frames), k=num_imgs)
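+ # Conditioning modes: "zero" always conditions on frame 0, "random" and the position/time ablations
+ # pick one random focal position, "ablate_single_frame" draws separate input/output frame indices,
+ # and "random_single_double_triple" conditions on 1-3 randomly chosen frames (with replacement).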
477
+
478
+ # Get the text embedding for conditioning.
479
+ encoder_hidden_states = encode_image(
480
+ pixel_values[:, random_frames[0], :, :, :].float(),
481
+ feature_extractor, image_encoder, weight_dtype, accelerator)
482
+
483
+ # Here I input a fixed numerical value for 'motion_bucket_id', which is not reasonable.
484
+ # However, I am unable to fully align with the calculation method of the motion score,
485
+ # so I adopted this approach. The same applies to the 'fps' (frames per second).
486
+ conditioning_num = 0
487
+
488
+ if conditioning != "ablate_time":
489
+ conditioning_num = 0
490
+ else:
491
+ conditioning_num = random_frames[0]
492
+
493
+
494
+
495
+ added_time_ids = get_add_time_ids(
496
+ 7, # fixed
497
+ conditioning_num, # motion_bucket_id = 127, fixed
498
+ noise_aug_strength, # noise_aug_strength == cond_sigmas
499
+ encoder_hidden_states.dtype,
500
+ bsz,
501
+ unet
502
+ )
503
+ added_time_ids = added_time_ids.to(latents.device)
504
+
505
+
506
+
507
+ # Conditioning dropout to support classifier-free guidance during inference. For more details
508
+ # check out section 3.2.1 of the original paper https://arxiv.org/abs/2211.09800.
509
+ if args.conditioning_dropout_prob is not None:
510
+ random_p = torch.rand(
511
+ bsz, device=latents.device, generator=generator)
512
+ # Sample masks for the edit prompts. I'm not sure if prompts are used in this model; same with the text conditioning that comes next.
513
+
514
+ # encoder_hidden_states is derived from the image.
515
+
516
+ prompt_mask = random_p < 2 * args.conditioning_dropout_prob
517
+ prompt_mask = prompt_mask.reshape(bsz, 1, 1)
518
+ # Final text conditioning.
519
+ null_conditioning = torch.zeros_like(encoder_hidden_states)
520
+ encoder_hidden_states = torch.where(
521
+ prompt_mask, null_conditioning.unsqueeze(1), encoder_hidden_states.unsqueeze(1))
522
+ # Sample masks for the original images.
523
+ image_mask_dtype = conditional_latents.dtype
524
+ image_mask = 1 - (
525
+ (random_p >= args.conditioning_dropout_prob).to(
526
+ image_mask_dtype)
527
+ * (random_p < 3 * args.conditioning_dropout_prob).to(image_mask_dtype)
528
+ )
529
+ image_mask = image_mask.reshape(bsz, 1, 1, 1)
530
+ # Final image conditioning.
531
+ conditional_latents = image_mask * conditional_latents #this basically 0s out some of the image latents
532
+
533
+ # Concatenate the `conditional_latents` with the `noisy_latents`.
534
+ # conditional_latents = conditional_latents.unsqueeze(
535
+ # 1).repeat(1, noisy_latents.shape[1], 1, 1, 1)
536
+ if conditioning == "ablate_single_frame":
537
+ #put input frame at first frame
538
+ conditional_latents = conditional_latents[:, 0:1].repeat(1, args.num_frames, 1, 1, 1)
539
+ elif conditioning in ["ablate_position", "ablate_time"]:
540
+
541
+ conditional_latents = conditional_latents[:, random_frames[0]:random_frames[0]+1].repeat(1,args.num_frames, 1, 1, 1)
542
+ else:
543
+ mask = torch.zeros_like(conditional_latents)
544
+ #choose a random frame to allow for the model to learn to focus on different frames (set mask to 1 for that frame)
545
+ mask[:, random_frames] = 1
546
+ conditional_latents = conditional_latents * mask
547
+
548
+
549
+ inp_noisy_latents = torch.cat(
550
+ [inp_noisy_latents, conditional_latents], dim=2)
551
+
552
+ # check https://arxiv.org/abs/2206.00364 (the EDM framework) for more details.
553
+ target = latents
554
+ model_pred = unet(
555
+ inp_noisy_latents, timesteps, encoder_hidden_states, added_time_ids=added_time_ids).sample
556
+
557
+ # Denoise the latents
558
+ c_out = -sigmas / ((sigmas**2 + 1)**0.5)
559
+ c_skip = 1 / (sigmas**2 + 1)
560
+ denoised_latents = model_pred * c_out + c_skip * noisy_latents
561
+ weighing = (1 + sigmas ** 2) * (sigmas**-2.0)
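+ # EDM denoiser parameterisation: D(x) = c_skip * x + c_out * F(x) with c_skip = 1 / (sigma^2 + 1),
+ # c_out = -sigma / sqrt(sigma^2 + 1), and loss weight lambda(sigma) = (sigma^2 + 1) / sigma^2.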
562
+
563
+ # MSE loss
564
+ loss = torch.mean(
565
+ (weighing.float() * (denoised_latents.float() -
566
+ target.float()) ** 2).reshape(target.shape[0], -1),
567
+ dim=1,
568
+ )
569
+ loss = loss.mean()
570
+
571
+ # Gather the losses across all processes for logging (if we use distributed training).
572
+ avg_loss = accelerator.gather(
573
+ loss.repeat(args.per_gpu_batch_size)).mean()
574
+ train_loss += avg_loss.item() / args.gradient_accumulation_steps
575
+
576
+ # Backpropagate
577
+ accelerator.backward(loss)
+ optimizer.step()
578
+ lr_scheduler.step()
579
+ optimizer.zero_grad()
580
+
581
+
582
+
583
+
584
+ # Checks if the accelerator has performed an optimization step behind the scenes
585
+ if accelerator.sync_gradients:
586
+
587
+ if args.use_ema:
588
+ ema_unet.step(unet.parameters())
589
+ progress_bar.update(1)
590
+ global_step += 1
591
+ accelerator.log({"train_loss": train_loss}, step=global_step)
592
+ train_loss = 0.0
593
+
594
+ if accelerator.is_main_process:
595
+
596
+ # save checkpoints!
597
+ if global_step % args.checkpointing_steps == 0:
598
+ # _before_ saving state, check if this save would set us over the `checkpoints_total_limit`
599
+ if args.checkpoints_total_limit is not None:
600
+ checkpoints = os.listdir(args.output_dir)
601
+ checkpoints = [
602
+ d for d in checkpoints if d.startswith("checkpoint")]
603
+ checkpoints = sorted(
604
+ checkpoints, key=lambda x: int(x.split("-")[1]))
605
+
606
+ # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints
607
+ if len(checkpoints) >= args.checkpoints_total_limit:
608
+ num_to_remove = len(
609
+ checkpoints) - args.checkpoints_total_limit + 1
610
+ removing_checkpoints = checkpoints[0:num_to_remove]
611
+
612
+ logger.info(
613
+ f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints"
614
+ )
615
+ logger.info(
616
+ f"removing checkpoints: {', '.join(removing_checkpoints)}")
617
+
618
+ for removing_checkpoint in removing_checkpoints:
619
+ removing_checkpoint = os.path.join(
620
+ args.output_dir, removing_checkpoint)
621
+ shutil.rmtree(removing_checkpoint)
622
+
623
+ save_path = os.path.join(
624
+ args.output_dir, f"checkpoint-{global_step}")
625
+ accelerator.save_state(save_path)
626
+ logger.info(f"Saved state to {save_path}")
627
+ # sample images!
628
+ if args.test or (global_step % args.validation_steps == 0) or (global_step == 1):
629
+ if args.use_ema:
630
+ # Store the UNet parameters temporarily and load the EMA parameters to perform inference.
631
+ ema_unet.store(unet.parameters())
632
+ ema_unet.copy_to(unet.parameters())
633
+
634
+ valid_net(args, val_dataset, val_dataloader, unet, image_encoder, vae, zero_latent, accelerator, global_step, weight_dtype)
635
+ if args.use_ema:
636
+ # Switch back to the original UNet parameters.
637
+ ema_unet.restore(unet.parameters())
638
+ if args.test:
639
+ break
640
+
641
+ torch.cuda.empty_cache()
642
+
643
+
644
+
645
+
646
+ logs = {"step_loss": loss.detach().item(
647
+ ), "lr": lr_scheduler.get_last_lr()[0]}
648
+ progress_bar.set_postfix(**logs)
649
+
650
+ if global_step >= args.max_train_steps:
651
+ break
652
+ if args.test:
653
+ break
654
+ # Create the pipeline using the trained modules and save it.
655
+ accelerator.wait_for_everyone()
656
+ if accelerator.is_main_process and not args.test:
657
+
658
+ pipeline = StableVideoDiffusionPipeline.from_pretrained(
659
+ args.pretrained_model_name_or_path,
660
+ image_encoder=accelerator.unwrap_model(image_encoder),
661
+ vae=accelerator.unwrap_model(vae),
662
+ unet=accelerator.unwrap_model(ema_unet) if args.use_ema else unet,
663
+ revision=args.revision,
664
+ )
665
+ pipeline.save_pretrained(args.output_dir)
666
+
667
+ if args.use_ema:
668
+ ema_unet.copy_to(unet.parameters())
669
+
670
+ if args.push_to_hub:
671
+ upload_folder(
672
+ repo_id=repo_id,
673
+ folder_path=args.output_dir,
674
+ commit_message="End of training",
675
+ ignore_patterns=["step_*", "epoch_*"],
676
+ )
677
+ accelerator.end_training()
678
+
679
+
680
+ if __name__ == "__main__":
681
+ main()
682
+
683
+
training/utils.py ADDED
@@ -0,0 +1,509 @@
1
+ import pickle
2
+ from torch.utils.data import Dataset
3
+ import cv2
4
+ import argparse
5
+ import glob
6
+ import random
7
+ import logging
8
+ import torch
9
+ import os
10
+ import numpy as np
11
+ import PIL
12
+ from PIL import Image, ImageDraw
13
+ from einops import rearrange
14
+ from urllib.parse import urlparse
15
+ from diffusers.utils import load_image
16
+ import math
17
+
18
+ # copy from https://github.com/crowsonkb/k-diffusion.git
19
+ def rand_log_normal(shape, loc=0., scale=1., device='cpu', dtype=torch.float32):
20
+ """Draws samples from an lognormal distribution."""
21
+ u = torch.rand(shape, dtype=dtype, device=device) * (1 - 2e-7) + 1e-7
22
+ return torch.distributions.Normal(loc, scale).icdf(u).exp()
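+ # u is kept inside (1e-7, 1 - 1e-7) so the inverse CDF never returns +/- infinity before exponentiation.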
23
+
24
+ def encode_image(pixel_values, feature_extractor, image_encoder, weight_dtype, accelerator):
25
+ # pixel: [-1, 1]
26
+ pixel_values = _resize_with_antialiasing(pixel_values, (224, 224))
27
+ # We unnormalize it after resizing.
28
+ pixel_values = (pixel_values + 1.0) / 2.0
29
+
30
+ # Normalize the image for CLIP input
31
+ pixel_values = feature_extractor(
32
+ images=pixel_values,
33
+ do_normalize=True,
34
+ do_center_crop=False,
35
+ do_resize=False,
36
+ do_rescale=False,
37
+ return_tensors="pt",
38
+ ).pixel_values
39
+
40
+ pixel_values = pixel_values.to(
41
+ device=accelerator.device, dtype=weight_dtype)
42
+ image_embeddings = image_encoder(pixel_values).image_embeds
43
+ return image_embeddings
44
+
45
+ def get_add_time_ids(
46
+ fps,
47
+ motion_bucket_id,
48
+ noise_aug_strength,
49
+ dtype,
50
+ batch_size,
51
+ unet
52
+ ):
53
+ add_time_ids = [fps, motion_bucket_id, noise_aug_strength]
54
+
55
+ passed_add_embed_dim = unet.config.addition_time_embed_dim * \
56
+ len(add_time_ids)
57
+ expected_add_embed_dim = unet.add_embedding.linear_1.in_features
58
+
59
+ if expected_add_embed_dim != passed_add_embed_dim:
60
+ raise ValueError(
61
+ f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`."
62
+ )
63
+
64
+ add_time_ids = torch.tensor([add_time_ids], dtype=dtype)
65
+ add_time_ids = add_time_ids.repeat(batch_size, 1)
66
+ return add_time_ids
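+ # These are SVD's three "added time ids" (fps, motion_bucket_id, noise_aug_strength). In this repo
+ # the motion_bucket_id slot is repurposed to carry the focal position when conditioning == "ablate_time".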
67
+
68
+ def find_scale(height, width):
69
+ """
70
+ Finds a scale factor such that the number of pixels is less than 500,000
71
+ and the dimensions are rounded down to the nearest multiple of 64.
72
+
73
+ Args:
74
+ height (int): The original height of the image.
75
+ width (int): The original width of the image.
76
+
77
+ Returns:
78
+ tuple: The scaled height and width as integers.
79
+ """
80
+ max_pixels = 500000
81
+
82
+ # Start with no scaling
83
+ scale = 1.0
84
+
85
+ while True:
86
+ # Calculate the scaled dimensions
87
+ scaled_height = math.floor((height * scale) / 64) * 64
88
+ scaled_width = math.floor((width * scale) / 64) * 64
89
+
90
+ # Check if the scaled dimensions meet the pixel constraint
91
+ if scaled_height * scaled_width <= max_pixels:
92
+ return scaled_height, scaled_width
93
+
94
+ # Reduce the scale slightly
95
+ scale -= 0.01
96
+
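+ # Illustrative find_scale example: a 576x1024 (height x width) input has 589,824 pixels, so the
+ # first 1% reduction already fits and the result should be roughly (512, 960) after rounding each
+ # dimension down to a multiple of 64.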
97
+ class OutsidePhotosDataset(Dataset):
98
+ def __init__(self, data_folder, width=1024, height=576, sample_frames=9):
99
+ self.data_folder = data_folder
100
+ self.scenes = sorted(glob.glob(os.path.join(data_folder, "*")))
101
+
102
+ # keep only image files (.JPG, .jpg, .jpeg, .png)
103
+ self.scenes = [scene for scene in self.scenes if scene.endswith((".JPG", ".jpg", ".jpeg", ".png"))]
104
+ # expand each scene into 9 entries of (scene_path, focal position idx 0-8)
105
+
106
+ self.scenes = [(scene, idx) for scene in self.scenes for idx in range(9)]
107
+
108
+
109
+ self.num_scenes = len(self.scenes)
110
+ self.width = width
111
+ self.height = height
112
+ self.sample_frames = sample_frames
113
+ self.icc_profiles = [None]*self.num_scenes
114
+
115
+ def __len__(self):
116
+ return self.num_scenes
117
+
118
+ def __getitem__(self, idx):
119
+ #get the scene and the index
120
+ #create an empty tensor to store the pixel values and place the scene in the tensor (load and resize the image)
121
+
122
+ scene, focal_stack_num = self.scenes[idx]
123
+
124
+ with Image.open(scene) as img:
125
+
126
+ self.icc_profiles[idx] = img.info.get("icc_profile")
127
+ icc_profile = img.info.get("icc_profile")
128
+ if icc_profile is None:
129
+ icc_profile = "none"
130
+ original_pixels = torch.from_numpy(np.array(img)).float().permute(2,0,1)
131
+ original_pixels = original_pixels / 255
132
+ width, height = img.size
133
+ scaled_width, scaled_height = find_scale(width, height)
134
+
135
+ img_resized = img.resize((scaled_width, scaled_height))
136
+ img_tensor = torch.from_numpy(np.array(img_resized)).float()
137
+ img_normalized = img_tensor / 127.5 - 1
138
+ img_normalized = img_normalized.permute(2, 0, 1)
139
+
140
+ pixels = torch.zeros((self.sample_frames, 3, scaled_height, scaled_width))
141
+ pixels[focal_stack_num] = img_normalized
142
+
143
+ return {"pixel_values": pixels, "idx": idx//9, "focal_stack_num": focal_stack_num, "original_pixel_values": original_pixels, 'icc_profile': icc_profile}
144
+
145
+
146
+
147
+
148
+ class FocalStackDataset(Dataset):
149
+ def __init__(self, data_folder: str, splits_dir, split="train", num_samples=100000, width=640, height=896, sample_frames=9): #4.5
150
+ #800*600 - 480000
151
+ #896*672 - 602112
152
+ """
153
+ Args:
154
+ data_folder (str): Root folder containing the focal-stack captures.
156
+ splits_dir (str): Folder with the train/test split pickle files.
+ split (str): One of "train", "val" or "test".
+ sample_frames (int): Number of focal positions per stack (default 9).
156
+ """
157
+ self.num_samples = num_samples
158
+ self.sample_frames = sample_frames
159
+ # Define the path to the folder containing video frames
160
+ self.data_folder = data_folder
161
+ self.splits_dir = splits_dir
162
+
163
+ size = "midsize"
164
+ # Use glob to find matching folders
165
+ # List to store the desired paths
166
+ rig_directories = []
167
+
168
+ # Walk through the directory
169
+ for root, dirs, files in os.walk(data_folder):
170
+ # Check if the path matches "downscaled/undistorted/Rig*"
171
+ for directory in dirs:
172
+ if directory.startswith("RigCenter") and f"{size}/undistorted" in root.replace("\\", "/"):
173
+ rig_directory = os.path.join(root, directory)
174
+ #check that rig_directory contains all 9 images
175
+ if len(glob.glob(os.path.join(rig_directory, "*.jpg"))) == 9:
176
+ rig_directories.append(rig_directory)
177
+
178
+
179
+ self.scenes = sorted(rig_directories) #sort the files by name
180
+
181
+ if split == "train":
182
+ #shuffle the scenes
183
+ random.shuffle(self.scenes)
184
+ self.split = split
185
+
186
+ debug = False
187
+
188
+
189
+ if debug:
190
+ self.scenes = self.scenes[50:60]
191
+ elif split == "train":
192
+ pkl_file = os.path.join(self.splits_dir, "train_scenes.pkl")
193
+ #load the train scenes
194
+ with open(pkl_file, "rb") as f:
195
+ pkl_scenes = pickle.load(f)
196
+
197
+ #only get scenes that are found in pkl file
198
+ self.scenes = [scene for scene in self.scenes if scene.split('/')[-4] in pkl_scenes]
199
+
200
+ elif split == "val":
201
+ pkl_file = os.path.join(self.splits_dir, "test_scenes.pkl") #use first 10 test scenes for val (just for visualization)
202
+
203
+ #load the test scenes
204
+ with open(pkl_file, "rb") as f:
205
+ pkl_scenes = pickle.load(f)
206
+
207
+ #only get scenes that are found in pkl file
208
+ self.scenes = [scene for scene in self.scenes if scene.split('/')[-4] in pkl_scenes]
209
+ self.scenes = self.scenes[:10]
210
+ else:
211
+ pkl_file = os.path.join(self.splits_dir, "test_scenes.pkl")
212
+
213
+ #load the test scenes
214
+ with open(pkl_file, "rb") as f:
215
+ pkl_scenes = pickle.load(f)
216
+
217
+ #only get scenes that are found in pkl file
218
+ self.scenes = [scene for scene in self.scenes if scene.split('/')[-4] in pkl_scenes]
219
+
220
+
221
+
222
+ if split == "test":
223
+ self.scenes = [(scene, idx) for scene in self.scenes for idx in range(self.sample_frames)]
224
+
225
+ self.num_scenes = len(self.scenes)
226
+
227
+ max_trdata = 0
228
+ if max_trdata > 0:
229
+ self.scenes = self.scenes[:max_trdata]
230
+
231
+ self.data_store = {}
232
+
233
+ logging.info(f'Creating {split} dataset with {self.num_scenes} examples')
234
+
235
+ self.channels = 3
236
+ self.width = width
237
+ self.height = height
238
+
239
+
240
+ def __len__(self):
241
+ return self.num_scenes
242
+
243
+ def __getitem__(self, idx):
244
+ """
245
+ Args:
246
+ idx (int): Index of the sample to return.
247
+
248
+ Returns:
249
+ dict: A dictionary containing the 'pixel_values' tensor of shape (16, channels, 320, 512).
250
+ """
251
+ # Randomly select a folder (representing a video) from the base folder
252
+ if self.split == "test":
253
+ chosen_folder, focal_stack_num = self.scenes[idx]
254
+ else:
255
+ chosen_folder = self.scenes[idx]
256
+ frames = os.listdir(chosen_folder)
257
+ #get only frames that are jpg
258
+ frames = [frame for frame in frames if frame.endswith(".jpg")]
259
+ # Sort the frames by name
260
+ frames.sort()
261
+
262
+ # Take the first sample_frames frames of the (sorted) stack
263
+ selected_frames = frames[:self.sample_frames]
264
+ # Initialize a tensor to store the pixel values
265
+ pixel_values = torch.empty((self.sample_frames, self.channels, self.height, self.width))
266
+
267
+ original_pixel_values = torch.empty((self.sample_frames, self.channels, 896, 640))
268
+
269
+ # Load and process each frame
270
+ for i, frame_name in enumerate(selected_frames):
271
+ frame_path = os.path.join(chosen_folder, frame_name)
272
+ with Image.open(frame_path) as img:
273
+
274
+
275
+ # Resize the image and convert it to a tensor
276
+ img_resized = img.resize((self.width, self.height))
277
+ img_tensor = torch.from_numpy(np.array(img_resized)).float()
278
+ original_img_tensor = torch.from_numpy(np.array(img)).float()
279
+
280
+ # Normalize the image by scaling pixel values to [-1, 1]
281
+ img_normalized = img_tensor / 127.5 - 1
282
+ original_img_normalized = original_img_tensor / 127.5 - 1
283
+
284
+ # Rearrange channels if necessary
285
+ if self.channels == 3:
286
+ img_normalized = img_normalized.permute(
287
+ 2, 0, 1) # For RGB images
288
+ original_img_normalized = original_img_normalized.permute(2, 0, 1)
289
+
290
+ pixel_values[i] = img_normalized
291
+ original_pixel_values[i] = original_img_normalized
292
+
293
+ if self.sample_frames == 10: # special case for 10 frames: duplicate the 9th frame (sometimes reduces color artifacts)
294
+ pixel_values[9] = pixel_values[8]
295
+ original_pixel_values[9] = original_pixel_values[8]
296
+ out_dict = {'pixel_values': pixel_values, "idx": idx, "original_pixel_values": original_pixel_values}
297
+ if self.split == "test":
298
+ out_dict["focal_stack_num"] = focal_stack_num
299
+ out_dict["idx"] = idx//9
300
+ return out_dict
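+ # In "test" mode each scene appears once per focal position, so idx is divided by 9 (the focal
+ # stack size) to recover the scene index while focal_stack_num picks the conditioning frame.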
301
+
302
+ # resizing utils
303
+ # TODO: clean up later
304
+ def _resize_with_antialiasing(input, size, interpolation="bicubic", align_corners=True):
305
+ h, w = input.shape[-2:]
306
+ factors = (h / size[0], w / size[1])
307
+
308
+ # First, we have to determine sigma
309
+ # Taken from skimage: https://github.com/scikit-image/scikit-image/blob/v0.19.2/skimage/transform/_warps.py#L171
310
+ sigmas = (
311
+ max((factors[0] - 1.0) / 2.0, 0.001),
312
+ max((factors[1] - 1.0) / 2.0, 0.001),
313
+ )
314
+
315
+ # Now kernel size. Good results are for 3 sigma, but that is kind of slow. Pillow uses 1 sigma
316
+ # https://github.com/python-pillow/Pillow/blob/master/src/libImaging/Resample.c#L206
317
+ # But they do it in the 2 passes, which gives better results. Let's try 2 sigmas for now
318
+ ks = int(max(2.0 * 2 * sigmas[0], 3)), int(max(2.0 * 2 * sigmas[1], 3))
319
+
320
+ # Make sure it is odd
321
+ if (ks[0] % 2) == 0:
322
+ ks = ks[0] + 1, ks[1]
323
+
324
+ if (ks[1] % 2) == 0:
325
+ ks = ks[0], ks[1] + 1
326
+
327
+ input = _gaussian_blur2d(input, ks, sigmas)
328
+
329
+ output = torch.nn.functional.interpolate(
330
+ input, size=size, mode=interpolation, align_corners=align_corners)
331
+ return output
332
+
333
+
334
+ def _compute_padding(kernel_size):
335
+ """Compute padding tuple."""
336
+ # 4 or 6 ints: (padding_left, padding_right,padding_top,padding_bottom)
337
+ # https://pytorch.org/docs/stable/nn.html#torch.nn.functional.pad
338
+ if len(kernel_size) < 2:
339
+ raise AssertionError(kernel_size)
340
+ computed = [k - 1 for k in kernel_size]
341
+
342
+ # for even kernels we need to do asymmetric padding :(
343
+ out_padding = 2 * len(kernel_size) * [0]
344
+
345
+ for i in range(len(kernel_size)):
346
+ computed_tmp = computed[-(i + 1)]
347
+
348
+ pad_front = computed_tmp // 2
349
+ pad_rear = computed_tmp - pad_front
350
+
351
+ out_padding[2 * i + 0] = pad_front
352
+ out_padding[2 * i + 1] = pad_rear
353
+
354
+ return out_padding
355
+
356
+
357
+ def _filter2d(input, kernel):
358
+ # prepare kernel
359
+ b, c, h, w = input.shape
360
+ tmp_kernel = kernel[:, None, ...].to(
361
+ device=input.device, dtype=input.dtype)
362
+
363
+ tmp_kernel = tmp_kernel.expand(-1, c, -1, -1)
364
+
365
+ height, width = tmp_kernel.shape[-2:]
366
+
367
+ padding_shape: list[int] = _compute_padding([height, width])
368
+ input = torch.nn.functional.pad(input, padding_shape, mode="reflect")
369
+
370
+ # kernel and input tensor reshape to align element-wise or batch-wise params
371
+ tmp_kernel = tmp_kernel.reshape(-1, 1, height, width)
372
+ input = input.view(-1, tmp_kernel.size(0), input.size(-2), input.size(-1))
373
+
374
+ # convolve the tensor with the kernel.
375
+ output = torch.nn.functional.conv2d(
376
+ input, tmp_kernel, groups=tmp_kernel.size(0), padding=0, stride=1)
377
+
378
+ out = output.view(b, c, h, w)
379
+ return out
380
+
381
+
382
+ def _gaussian(window_size: int, sigma):
383
+ if isinstance(sigma, float):
384
+ sigma = torch.tensor([[sigma]])
385
+
386
+ batch_size = sigma.shape[0]
387
+
388
+ x = (torch.arange(window_size, device=sigma.device,
389
+ dtype=sigma.dtype) - window_size // 2).expand(batch_size, -1)
390
+
391
+ if window_size % 2 == 0:
392
+ x = x + 0.5
393
+
394
+ gauss = torch.exp(-x.pow(2.0) / (2 * sigma.pow(2.0)))
395
+
396
+ return gauss / gauss.sum(-1, keepdim=True)
397
+
398
+
399
+ def _gaussian_blur2d(input, kernel_size, sigma):
400
+ if isinstance(sigma, tuple):
401
+ sigma = torch.tensor([sigma], dtype=input.dtype)
402
+ else:
403
+ sigma = sigma.to(dtype=input.dtype)
404
+
405
+ ky, kx = int(kernel_size[0]), int(kernel_size[1])
406
+ bs = sigma.shape[0]
407
+ kernel_x = _gaussian(kx, sigma[:, 1].view(bs, 1))
408
+ kernel_y = _gaussian(ky, sigma[:, 0].view(bs, 1))
409
+ out_x = _filter2d(input, kernel_x[..., None, :])
410
+ out = _filter2d(out_x, kernel_y[..., None])
411
+
412
+ return out
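+ # Separable Gaussian blur: filter with a 1-D kernel along the width, then with another along the height.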
413
+
414
+
415
+ def export_to_video(video_frames, output_video_path, fps):
416
+ fourcc = cv2.VideoWriter_fourcc(*"mp4v")
417
+ h, w, _ = video_frames[0].shape
418
+ video_writer = cv2.VideoWriter(
419
+ output_video_path, fourcc, fps=fps, frameSize=(w, h))
420
+ for i in range(len(video_frames)):
421
+ img = cv2.cvtColor(video_frames[i], cv2.COLOR_RGB2BGR)
422
+ video_writer.write(img)
+ video_writer.release() # finalize the file so the video is playable
423
+
424
+
425
+ def export_to_gif(frames, output_gif_path, fps):
426
+ """
427
+ Export a list of frames to a GIF.
428
+
429
+ Args:
430
+ - frames (list): List of frames (as numpy arrays or PIL Image objects).
431
+ - output_gif_path (str): Path to save the output GIF.
432
+ - fps (int): Frames per second used to derive each frame's duration.
433
+
434
+ """
435
+ # Convert numpy arrays to PIL Images if needed
436
+ pil_frames = [Image.fromarray(frame) if isinstance(
437
+ frame, np.ndarray) else frame for frame in frames]
438
+
439
+ pil_frames[0].save(output_gif_path.replace('.mp4', '.gif'),
440
+ format='GIF',
441
+ append_images=pil_frames[1:],
442
+ save_all=True,
443
+ duration=int(1000 / fps),
444
+ loop=0)
445
+
446
+
447
+ def tensor_to_vae_latent(t, vae, otype="sample"):
448
+ video_length = t.shape[1]
449
+
450
+ t = rearrange(t, "b f c h w -> (b f) c h w")
451
+ if otype == "sample":
452
+ latents = vae.encode(t).latent_dist.sample()
453
+ else:
454
+ latents = vae.encode(t).latent_dist.mode()
455
+ latents = rearrange(latents, "(b f) c h w -> b f c h w", f=video_length)
456
+ latents = latents * vae.config.scaling_factor
457
+
458
+ return latents
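+ # Frames are folded into the batch dimension, encoded independently, unfolded back to (b, f, c, h, w)
+ # and scaled by vae.config.scaling_factor; otype="sample" draws from the latent posterior, any other
+ # value takes its mode.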
459
+
460
+ import yaml
461
+ def parse_config(config_path="config.yaml"):
462
+ with open(config_path, "r") as f:
463
+ config = yaml.safe_load(f)
464
+
465
+ # handle distributed training rank
466
+ env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
467
+ if env_local_rank != -1 and env_local_rank != config.get("local_rank", -1):
468
+ config["local_rank"] = env_local_rank
469
+
470
+ # default fallback: non_ema_revision = revision
471
+ if config.get("non_ema_revision") is None:
472
+ config["non_ema_revision"] = config.get("revision")
473
+
474
+ return config
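+ # A minimal illustrative config sketch (key names are taken from how `args` is consumed in the
+ # training and validation scripts; the values are placeholders, not the project's real settings):
+ #
+ # pretrained_model_name_or_path: stabilityai/stable-video-diffusion-img2vid
+ # output_dir: ./outputs/run1
+ # data_folder: /path/to/data
+ # splits_dir: /path/to/splits
+ # num_frames: 9
+ # per_gpu_batch_size: 1
+ # num_workers: 4
+ # learning_rate: 1e-5
+ # conditioning: random
+ # mixed_precision: fp16
+ # use_ema: true
+ # seed: 42
+ # report_to: wandb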
475
+
476
+ def parse_args():
477
+ parser = argparse.ArgumentParser(description="SVD Training Script")
478
+ parser.add_argument(
479
+ "--config",
480
+ type=str,
481
+ default="svd/scripts/training/configs/stage1_base.yaml",
482
+ help="Path to the config file.",
483
+ )
484
+
485
+ args = parser.parse_args()
486
+
487
+
488
+ # load YAML and merge into args
489
+ config = parse_config(args.config)
490
+ # combine yaml + command line args (command line has priority)
491
+ for k, v in vars(args).items():
492
+ if v is not None:
493
+ config[k] = v
494
+
495
+ # convert dict to argparse.Namespace for downstream compatibility
496
+ args = argparse.Namespace(**config)
497
+
498
+ print("OUTPUT DIR: ", args.output_dir)
499
+ return args
500
+
501
+
502
+ def download_image(url):
503
+ original_image = (
504
+ lambda image_url_or_path: load_image(image_url_or_path)
505
+ if urlparse(image_url_or_path).scheme
506
+ else PIL.Image.open(image_url_or_path).convert("RGB")
507
+ )(url)
508
+ return original_image
509
+
training/validation.py ADDED
@@ -0,0 +1,145 @@
1
+ from torchmetrics import MetricCollection
2
+ from svd_pipeline import StableVideoDiffusionPipeline
3
+ from accelerate.logging import get_logger
4
+ import os
5
+ from utils import load_image
6
+ import torch
7
+ import numpy as np
8
+ import videoio
9
+ import torchmetrics.image
10
+ import matplotlib.image
11
+ from PIL import Image
12
+
13
+ logger = get_logger(__name__, log_level="INFO")
14
+
15
+
16
+ def valid_net(args, val_dataset, val_dataloader, unet, image_encoder, vae, zero, accelerator, global_step, weight_dtype):
17
+ logger.info(
18
+ f"Running validation... \n Generating {args.num_validation_images} videos."
19
+ )
20
+
21
+ # The models need unwrapping for compatibility in distributed training mode.
22
+
23
+ pipeline = StableVideoDiffusionPipeline.from_pretrained(
24
+ args.pretrained_model_name_or_path,
25
+ unet=unet,
26
+ image_encoder=image_encoder,
27
+ vae=vae,
28
+ revision=args.revision,
29
+ torch_dtype=weight_dtype,
30
+ )
31
+
32
+ pipeline.set_progress_bar_config(disable=True)
33
+
34
+ # run inference
35
+ val_save_dir = os.path.join(
36
+ args.output_dir, "validation_images")
37
+
38
+ print("Validation images will be saved to ", val_save_dir)
39
+
40
+ os.makedirs(val_save_dir, exist_ok=True)
41
+
42
+
43
+ num_frames = args.num_frames
44
+ unet.eval()
45
+ with torch.autocast(
46
+ str(accelerator.device).replace(":0", ""), enabled=accelerator.mixed_precision == "fp16"
47
+ ):
48
+ for batch in val_dataloader:
49
+ # run under torch.no_grad() so no gradients are kept while freeing cached GPU memory
50
+ with torch.no_grad():
51
+ torch.cuda.empty_cache()
52
+
53
+ pixel_values = batch["pixel_values"].to(accelerator.device)
54
+ original_pixel_values = batch['original_pixel_values'].to(accelerator.device)
55
+ idx = batch["idx"].to(accelerator.device)
56
+ if "focal_stack_num" in batch:
57
+ focal_stack_num = batch["focal_stack_num"][0].item()
58
+ else:
59
+ focal_stack_num = None
60
+
61
+ svd_output, gt_frames = pipeline(
62
+ pixel_values,
63
+ height=pixel_values.shape[3],
64
+ width=pixel_values.shape[4],
65
+ num_frames=args.num_frames,
66
+ decode_chunk_size=8,
67
+ motion_bucket_id=0 if args.conditioning != "ablate_time" else focal_stack_num,
68
+ min_guidance_scale=1.5,
69
+ max_guidance_scale=1.5,
70
+ reconstruction_guidance_scale=args.reconstruction_guidance,
71
+ fps=7,
72
+ noise_aug_strength=0,
73
+ accelerator=accelerator,
74
+ weight_dtype=weight_dtype,
75
+ conditioning = args.conditioning,
76
+ focal_stack_num = focal_stack_num,
77
+ zero=zero
78
+ # generator=generator,
79
+ )
80
+ video_frames = svd_output.frames[0]
81
+ gt_frames = gt_frames[0]
82
+
83
+
84
+ with torch.no_grad():
85
+
86
+ if args.num_frames == 10:
87
+ #remove a frame at end from video_frames and gt_frames
88
+ video_frames = video_frames[:, :-1]
89
+ gt_frames = gt_frames[:, :-1]
90
+ original_pixel_values = original_pixel_values[:, :-1]
91
+
92
+ if len(original_pixel_values.shape) == 5:
93
+ pixel_values = original_pixel_values[0] #assuming batch size is 1
94
+ else:
95
+ pixel_values = original_pixel_values.repeat(num_frames, 1, 1, 1)
96
+ pixel_values_normalized = pixel_values*0.5 + 0.5
97
+ pixel_values_normalized = torch.clamp(pixel_values_normalized,0,1)
98
+
99
+
100
+
101
+
102
+ video_frames_normalized = video_frames*0.5 + 0.5
103
+ video_frames_normalized = torch.clamp(video_frames_normalized,0,1)
104
+ video_frames_normalized = video_frames_normalized.permute(1,0,2,3)
105
+
106
+
107
+ gt_frames = torch.clamp(gt_frames,0,1)
108
+ gt_frames = gt_frames.permute(1,0,2,3)
109
+
110
+ #RESIZE images
111
+ video_frames_normalized = torch.nn.functional.interpolate(video_frames_normalized, ((pixel_values.shape[2]//2)*2, (pixel_values.shape[3]//2)*2), mode='bilinear')
112
+ gt_frames = torch.nn.functional.interpolate(gt_frames, ((pixel_values.shape[2]//2)*2, (pixel_values.shape[3]//2)*2), mode='bilinear')
113
+ pixel_values_normalized = torch.nn.functional.interpolate(pixel_values_normalized, ((pixel_values.shape[2]//2)*2, (pixel_values.shape[3]//2)*2), mode='bilinear')
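+ # All three tensors are resized to the same even-valued height and width ((x//2)*2 rounding),
+ # presumably so the video encoder accepts the frames and the outputs stay directly comparable.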
114
+
115
+ os.makedirs(os.path.join(val_save_dir, f"position_{focal_stack_num}/videos"), exist_ok=True)
116
+ videoio.videosave(os.path.join(
117
+ val_save_dir,
118
+ f"position_{focal_stack_num}/videos/step_{global_step}_val_img_{idx[0].item()}.mp4",
119
+ ), video_frames_normalized.permute(0,2,3,1).cpu().numpy(), fps=5)
120
+
121
+ if args.test:
122
+ #save images
123
+ os.makedirs(os.path.join(val_save_dir, f"position_{focal_stack_num}/images"), exist_ok=True)
124
+ if not args.photos:
125
+ for i in range(num_frames):
126
+ matplotlib.image.imsave(os.path.join(val_save_dir, f"position_{focal_stack_num}/images/img_{idx[0].item()}_frame_{i}.png"), video_frames_normalized[i].permute(1,2,0).cpu().numpy())
127
+ else:
128
+ for i in range(num_frames):
129
+ #use Pillow to save images
130
+ img = Image.fromarray((video_frames_normalized[i].permute(1,2,0).cpu().numpy()*255).astype(np.uint8))
131
+ #use index to assign icc profile to img
132
+ if batch['icc_profile'][0] != "none":
133
+ img.info['icc_profile'] = batch['icc_profile'][0]
134
+ img.save(os.path.join(val_save_dir, f"position_{focal_stack_num}/images/img_{idx[0].item()}_frame_{i}.png"))
135
+ del video_frames
136
+
137
+ accelerator.wait_for_everyone()
138
+
139
+ # run under torch.no_grad() so no gradients are kept while freeing cached GPU memory
140
+ with torch.no_grad():
141
+ torch.cuda.empty_cache()
142
+
143
+ del pipeline
144
+
145
+ accelerator.wait_for_everyone() # important: every process must leave validation at the same time