├── .github └── workflows │ ├── close_issue.yaml │ └── pages-deploy.yml ├── .gitignore ├── .isort.cfg ├── .pre-commit-config.yaml ├── CONTRIBUTING.md ├── Dockerfile ├── LICENSE ├── README.md ├── assets ├── demo │ ├── sample_16s_224x448.gif │ ├── sample_16s_320x320.gif │ ├── sample_16x240x426_9.gif │ ├── sample_32x240x426_7.gif │ ├── sample_32x480x854_9.gif │ ├── sora_16x240x426_26.gif │ ├── sora_16x240x426_27.gif │ ├── sora_16x240x426_40.gif │ ├── sora_16x426x240_24.gif │ ├── sora_16x426x240_3.gif │ └── v1.2 │ │ ├── sample_0002.gif │ │ ├── sample_0004.gif │ │ ├── sample_0011.gif │ │ ├── sample_0013.gif │ │ ├── sample_0052.gif │ │ ├── sample_0061.gif │ │ ├── sample_0087.gif │ │ ├── sample_1718.gif │ │ └── sample_1719.gif ├── images │ ├── condition │ │ ├── cactus-happy.png │ │ ├── cactus-sad.png │ │ ├── cliff.png │ │ ├── ship.png │ │ ├── sunset1.png │ │ ├── sunset2.png │ │ └── wave.png │ ├── imagenet │ │ ├── train │ │ │ └── n01440764 │ │ │ │ └── n01440764_10026.JPEG │ │ └── val │ │ │ └── n01440764 │ │ │ └── ILSVRC2012_val_00000293.JPEG │ ├── ocr │ │ ├── demo_text_det.jpg │ │ ├── demo_text_ocr.jpg │ │ └── demo_text_recog.jpg │ └── watermark │ │ └── watermark.png ├── readme │ ├── colossal_ai.png │ ├── gradio_advanced.png │ ├── gradio_basic.png │ ├── gradio_option.png │ ├── icon.png │ ├── llava_vs_pllava_sample.gif │ ├── report-03_actions_count.png │ ├── report-03_objects_count.png │ ├── report-03_video_stats.png │ ├── report_3d_vae.png │ ├── report_arch.jpg │ ├── report_arch_comp.png │ ├── report_bucket.png │ ├── report_caption.png │ ├── report_data_pipeline.png │ ├── report_image_textlen.png │ ├── report_loss_curve_1.png │ ├── report_loss_curve_2.png │ ├── report_loss_curve_3.png │ ├── report_mask.png │ ├── report_mask_config.png │ ├── report_val_loss.png │ ├── report_vbench_score.png │ ├── report_vid_val_loss.png │ ├── report_video_duration.png │ ├── report_video_textlen.png │ ├── sample_0.gif │ ├── sample_1.gif │ ├── sample_2.gif │ ├── sample_3.gif │ ├── sample_4.gif │ ├── sample_5.gif │ └── sequence_parallelism.jpeg └── texts │ ├── VBench │ ├── all_category.txt │ ├── all_dimension.txt │ ├── all_i2v.txt │ ├── prompts_per_category │ │ ├── animal.txt │ │ ├── architecture.txt │ │ ├── food.txt │ │ ├── human.txt │ │ ├── lifestyle.txt │ │ ├── plant.txt │ │ ├── scenery.txt │ │ └── vehicles.txt │ └── prompts_per_dimension │ │ ├── appearance_style.txt │ │ ├── color.txt │ │ ├── human_action.txt │ │ ├── multiple_objects.txt │ │ ├── object_class.txt │ │ ├── overall_consistency.txt │ │ ├── scene.txt │ │ ├── spatial_relationship.txt │ │ ├── subject_consistency.txt │ │ ├── temporal_flickering.txt │ │ └── temporal_style.txt │ ├── imagenet_id.txt │ ├── imagenet_labels.txt │ ├── rand_types.txt │ ├── t2i_samples.txt │ ├── t2i_sigma.txt │ ├── t2v_car.txt │ ├── t2v_latte.txt │ ├── t2v_pllava.txt │ ├── t2v_ref.txt │ ├── t2v_samples.txt │ ├── t2v_short.txt │ ├── t2v_sora.txt │ ├── ucf101_id.txt │ └── ucf101_labels.txt ├── configs ├── dit │ ├── inference │ │ ├── 16x256x256.py │ │ ├── 1x256x256-class.py │ │ └── 1x256x256.py │ └── train │ │ ├── 16x256x256.py │ │ └── 1x256x256.py ├── latte │ ├── inference │ │ ├── 16x256x256-class.py │ │ └── 16x256x256.py │ └── train │ │ └── 16x256x256.py ├── opensora-v1-1 │ ├── inference │ │ ├── sample-ref.py │ │ └── sample.py │ └── train │ │ ├── benchmark.py │ │ ├── image.py │ │ ├── image_rflow.py │ │ ├── stage1.py │ │ ├── stage2.py │ │ ├── stage3.py │ │ └── video.py ├── opensora-v1-2 │ ├── inference │ │ ├── sample.py │ │ └── sample_hf.py │ ├── lambda │ │ ├── stage1.py │ │ ├── 
stage2.py │ │ ├── stage3.py │ │ ├── stage4.py │ │ ├── stage5.py │ │ └── stage6.py │ ├── misc │ │ ├── bs.py │ │ ├── eval_loss.py │ │ ├── extract.py │ │ └── feat.py │ └── train │ │ ├── adapt.py │ │ ├── demo_360p.py │ │ ├── demo_480p.py │ │ ├── stage1.py │ │ ├── stage1_feat.py │ │ ├── stage2.py │ │ └── stage3.py ├── opensora │ ├── inference │ │ ├── 16x256x256.py │ │ ├── 16x512x512-rflow.py │ │ ├── 16x512x512.py │ │ └── 64x512x512.py │ └── train │ │ ├── 16x256x256-mask.py │ │ ├── 16x256x256-spee-rflow.py │ │ ├── 16x256x256-spee.py │ │ ├── 16x256x256.py │ │ ├── 16x512x512.py │ │ ├── 360x512x512.py │ │ ├── 64x512x512-sp.py │ │ └── 64x512x512.py ├── pixart │ ├── inference │ │ ├── 16x256x256.py │ │ ├── 1x1024MS.py │ │ ├── 1x20481B.py │ │ ├── 1x2048MS.py │ │ ├── 1x256x256.py │ │ ├── 1x512x512-rflow.py │ │ └── 1x512x512.py │ └── train │ │ ├── 16x256x256.py │ │ ├── 1x2048x2048.py │ │ ├── 1x512x512-rflow.py │ │ ├── 1x512x512.py │ │ └── 64x512x512.py └── vae │ ├── inference │ ├── image.py │ └── video.py │ └── train │ ├── stage1.py │ ├── stage2.py │ └── stage3.py ├── docs ├── acceleration.md ├── commands.md ├── config.md ├── data_processing.md ├── datasets.md ├── installation.md ├── report_01.md ├── report_02.md ├── report_03.md ├── structure.md ├── tutorial │ ├── .nojekyll │ ├── Gemfile │ ├── _config.yml │ ├── _data │ │ ├── contact.yml │ │ ├── locales │ │ │ └── en-customized.yml │ │ └── share.yml │ ├── _includes │ │ ├── favicons.html │ │ ├── sidebar.html │ │ └── topbar.html │ ├── _plugins │ │ ├── details_tag.rb │ │ └── posts-lastmod-hook.rb │ ├── _posts │ │ └── .placeholder │ ├── _tabs │ │ ├── dataset.md │ │ ├── introduction.md │ │ ├── lessons.md │ │ ├── repository.md │ │ ├── setup.md │ │ └── training.md │ ├── assets │ │ ├── css │ │ │ ├── colors │ │ │ │ ├── typography-dark.scss │ │ │ │ └── typography-light.scss │ │ │ └── jekyll-theme-chirpy.scss │ │ ├── fails_loss.png │ │ ├── fails_weight_norm.png │ │ ├── img │ │ │ └── lambda-logo.svg │ │ ├── monitoring_tool.png │ │ └── pyspy_dump.png │ └── index.md ├── vae.md └── zh_CN │ ├── README.md │ ├── READMEv1.1.md │ ├── acceleration.md │ ├── commands.md │ ├── datasets.md │ ├── report_v1.md │ ├── report_v2.md │ ├── report_v3.md │ ├── structure.md │ └── vae.md ├── eval ├── README.md ├── human_eval │ ├── generate.sh │ └── launch.sh ├── loss │ ├── eval_loss.py │ ├── launch.sh │ └── tabulate_rl_loss.py ├── sample.sh ├── vae │ ├── cal_flolpips.py │ ├── cal_lpips.py │ ├── cal_psnr.py │ ├── cal_ssim.py │ ├── eval_common_metric.py │ ├── flolpips │ │ ├── correlation │ │ │ └── correlation.py │ │ ├── flolpips.py │ │ ├── pretrained_networks.py │ │ ├── pwcnet.py │ │ └── utils.py │ └── script │ │ └── eval.sh ├── vbench │ ├── VBench_full_info.json │ ├── calc_vbench.py │ ├── launch.sh │ ├── launch_calc.sh │ └── tabulate_vbench_scores.py └── vbench_i2v │ ├── calc_vbench_i2v.py │ ├── json_to_txt.py │ ├── launch.sh │ ├── launch_calc.sh │ ├── tabulate_vbench_i2v_scores.py │ └── vbench2_i2v_full_info.json ├── gradio ├── README.md ├── app.py └── requirements.txt ├── install-check-pytorch23.py ├── install-check.py ├── install-pytorch23.sh ├── install.sh ├── kill_process.sh ├── notebooks ├── inference.ipynb └── launch.ipynb ├── nvtop_all.py ├── opensora ├── __init__.py ├── acceleration │ ├── __init__.py │ ├── checkpoint.py │ ├── communications.py │ ├── parallel_states.py │ ├── plugin.py │ └── shardformer │ │ ├── __init__.py │ │ ├── modeling │ │ ├── __init__.py │ │ └── t5.py │ │ └── policy │ │ ├── __init__.py │ │ └── t5_encoder.py ├── datasets │ ├── __init__.py │ ├── aspect.py │ ├── 
bucket.py │ ├── dataloader.py │ ├── datasets.py │ ├── read_video.py │ ├── sampler.py │ ├── utils.py │ └── video_transforms.py ├── models │ ├── __init__.py │ ├── dit │ │ ├── __init__.py │ │ └── dit.py │ ├── latte │ │ ├── __init__.py │ │ └── latte.py │ ├── layers │ │ ├── __init__.py │ │ └── blocks.py │ ├── pixart │ │ ├── __init__.py │ │ ├── pixart.py │ │ └── pixart_sigma.py │ ├── stdit │ │ ├── __init__.py │ │ ├── stdit.py │ │ ├── stdit2.py │ │ └── stdit3.py │ ├── text_encoder │ │ ├── __init__.py │ │ ├── classes.py │ │ ├── clip.py │ │ └── t5.py │ └── vae │ │ ├── __init__.py │ │ ├── discriminator.py │ │ ├── losses.py │ │ ├── lpips.py │ │ ├── utils.py │ │ ├── vae.py │ │ ├── vae_temporal.py │ │ └── video_sdxl │ │ └── blocks.py ├── registry.py ├── schedulers │ ├── __init__.py │ ├── dpms │ │ ├── __init__.py │ │ └── dpm_solver.py │ ├── iddpm │ │ ├── __init__.py │ │ ├── diffusion_utils.py │ │ ├── gaussian_diffusion.py │ │ ├── respace.py │ │ ├── speed.py │ │ └── timestep_sampler.py │ └── rf │ │ ├── __init__.py │ │ └── rectified_flow.py └── utils │ ├── __init__.py │ ├── ckpt_utils.py │ ├── config_utils.py │ ├── inference_utils.py │ ├── lr_scheduler.py │ ├── misc.py │ └── train_utils.py ├── requirements ├── requirements-cu121.txt ├── requirements-data.txt ├── requirements-eval.txt ├── requirements-pllava.txt ├── requirements-vae.txt └── requirements.txt ├── scripts ├── clear_cache.sh ├── inference-server.py ├── inference.py ├── inference_vae.py ├── misc │ ├── extract_feat.py │ ├── launch_extract_feat.sh │ ├── launch_search_bs.sh │ ├── profile_train.py │ └── search_bs.py ├── train.py └── train_vae.py ├── setup.py ├── tests ├── test_attn.py ├── test_lr_scheduler.py ├── test_np_torch.py ├── test_pos_emb.py ├── test_seq_parallel_attention.py ├── test_stdit3_sequence_parallelism.py └── test_t5_shardformer.py └── tools ├── __init__.py ├── caption ├── README.md ├── __init__.py ├── acceleration │ ├── __init__.py │ └── llava │ │ ├── __init__.py │ │ └── policies │ │ ├── __init__.py │ │ ├── llama.py │ │ └── mistral.py ├── camera_motion │ ├── __init__.py │ ├── camera_motion.py │ ├── detect.py │ ├── requirements.txt │ ├── utils.py │ └── visualizer.py ├── camera_motion_detect.py ├── caption_gpt4.py ├── caption_llama3.py ├── caption_llava.py ├── pllava_dir │ └── caption_pllava.py └── utils.py ├── datasets ├── README.md ├── __init__.py ├── analyze.py ├── convert.py ├── datautil.py ├── ffmpeg_check_parallel.sh ├── ffmpeg_filter_without_errors.py ├── filter_large_videos.py ├── filter_panda10m.py ├── split.py ├── transform.py └── utils.py ├── frame_interpolation ├── README.md ├── __init__.py ├── interpolation.py ├── networks │ ├── __init__.py │ ├── amt_g.py │ └── blocks │ │ ├── __init__.py │ │ ├── feat_enc.py │ │ ├── ifrnet.py │ │ ├── multi_flow.py │ │ └── raft.py └── utils │ ├── __init__.py │ ├── dist_utils.py │ ├── flow_utils.py │ └── utils.py ├── scene_cut ├── README.md ├── __init__.py ├── convert_id_to_path.py ├── cut.py └── scene_detect.py └── scoring ├── README.md ├── __init__.py ├── aesthetic ├── __init__.py └── inference.py ├── matching ├── __init__.py └── inference.py ├── ocr ├── __init__.py ├── dbnetpp.py └── inference.py └── optical_flow ├── __init__.py ├── inference.py └── unimatch ├── __init__.py ├── attention.py ├── backbone.py ├── geometry.py ├── matching.py ├── position.py ├── reg_refine.py ├── transformer.py ├── trident_conv.py ├── unimatch.py └── utils.py /.github/workflows/close_issue.yaml: -------------------------------------------------------------------------------- 1 | name: Close inactive issues 
2 | on: 3 | schedule: 4 | - cron: "30 1 * * *" 5 | 6 | jobs: 7 | close-issues: 8 | runs-on: ubuntu-latest 9 | permissions: 10 | issues: write 11 | pull-requests: write 12 | steps: 13 | - uses: actions/stale@v9 14 | with: 15 | days-before-issue-stale: 7 16 | days-before-issue-close: 7 17 | stale-issue-label: "stale" 18 | stale-issue-message: "This issue is stale because it has been open for 7 days with no activity." 19 | close-issue-message: "This issue was closed because it has been inactive for 7 days since being marked as stale." 20 | days-before-pr-stale: -1 21 | days-before-pr-close: -1 22 | repo-token: ${{ secrets.GITHUB_TOKEN }} 23 | -------------------------------------------------------------------------------- /.github/workflows/pages-deploy.yml: -------------------------------------------------------------------------------- 1 | name: "Deploy Tutorial" 2 | on: 3 | push: 4 | branches: 5 | - main 6 | paths-ignore: 7 | - .gitignore 8 | - README.md 9 | - LICENSE 10 | 11 | # Allows you to run this workflow manually from the Actions tab 12 | workflow_dispatch: 13 | 14 | permissions: 15 | contents: read 16 | pages: write 17 | id-token: write 18 | 19 | # Allow one concurrent deployment 20 | concurrency: 21 | group: "pages" 22 | cancel-in-progress: true 23 | 24 | jobs: 25 | build: 26 | runs-on: ubuntu-latest 27 | 28 | steps: 29 | - name: Checkout 30 | uses: actions/checkout@v4 31 | with: 32 | fetch-depth: 0 33 | 34 | - name: Setup Pages 35 | id: pages 36 | uses: actions/configure-pages@v4 37 | 38 | - name: Setup Ruby 39 | uses: ruby/setup-ruby@v1 40 | with: 41 | ruby-version: 3.3 42 | bundler-cache: true 43 | 44 | - name: Install Dependencies 45 | run: | 46 | cd ./docs/tutorial 47 | bundle install 48 | 49 | - name: Build site 50 | run: | 51 | cd ./docs/tutorial 52 | bundle exec jekyll b -d "_site" 53 | env: 54 | JEKYLL_ENV: "production" 55 | 56 | - name: Upload site artifact 57 | uses: actions/upload-pages-artifact@v3 58 | with: 59 | path: "./docs/tutorial/_site" 60 | 61 | deploy: 62 | environment: 63 | name: github-pages 64 | url: ${{ steps.deployment.outputs.page_url }} 65 | runs-on: ubuntu-latest 66 | needs: build 67 | steps: 68 | - name: Deploy to GitHub Pages 69 | id: deployment 70 | uses: actions/deploy-pages@v4 71 | 72 | -------------------------------------------------------------------------------- /.isort.cfg: -------------------------------------------------------------------------------- 1 | [settings] 2 | line_length = 120 3 | multi_line_output=3 4 | include_trailing_comma = true 5 | ignore_comments = true 6 | profile = black 7 | honor_noqa = true 8 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | 3 | - repo: https://github.com/PyCQA/autoflake 4 | rev: v2.2.1 5 | hooks: 6 | - id: autoflake 7 | name: autoflake (python) 8 | args: ['--in-place', '--remove-unused-variables', '--remove-all-unused-imports', '--ignore-init-module-imports'] 9 | 10 | - repo: https://github.com/pycqa/isort 11 | rev: 5.12.0 12 | hooks: 13 | - id: isort 14 | name: sort all imports (python) 15 | 16 | - repo: https://github.com/psf/black-pre-commit-mirror 17 | rev: 23.9.1 18 | hooks: 19 | - id: black 20 | name: black formatter 21 | args: ['--line-length=120', '--target-version=py37', '--target-version=py38', '--target-version=py39','--target-version=py310'] 22 | 23 | - repo: https://github.com/pre-commit/pre-commit-hooks 24 | rev: v4.3.0 25 | hooks: 26 
| - id: check-yaml 27 | - id: check-merge-conflict 28 | - id: check-case-conflict 29 | - id: trailing-whitespace 30 | - id: end-of-file-fixer 31 | - id: mixed-line-ending 32 | args: ['--fix=lf'] 33 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM hpcaitech/pytorch-cuda:2.1.0-12.1.0 2 | 3 | # metainformation 4 | LABEL org.opencontainers.image.source = "https://github.com/hpcaitech/Open-Sora" 5 | LABEL org.opencontainers.image.licenses = "Apache License 2.0" 6 | LABEL org.opencontainers.image.base.name = "docker.io/library/hpcaitech/pytorch-cuda:2.1.0-12.1.0" 7 | 8 | # Set the working directory 9 | WORKDIR /workspace/Open-Sora 10 | # Copy the current directory contents into the container at /workspace/Open-Sora 11 | COPY . . 12 | 13 | # install library dependencies 14 | RUN apt-get update && apt-get install ffmpeg libsm6 libxext6 -y 15 | 16 | # install flash attention 17 | RUN pip install flash-attn --no-build-isolation 18 | 19 | # install apex 20 | RUN pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" git+https://github.com/NVIDIA/apex.git 21 | 22 | # install xformers 23 | RUN pip install xformers --index-url https://download.pytorch.org/whl/cu121 24 | 25 | # install this project 26 | RUN pip install -v . 27 | -------------------------------------------------------------------------------- /assets/demo/sample_16s_224x448.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/demo/sample_16s_224x448.gif -------------------------------------------------------------------------------- /assets/demo/sample_16s_320x320.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/demo/sample_16s_320x320.gif -------------------------------------------------------------------------------- /assets/demo/sample_16x240x426_9.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/demo/sample_16x240x426_9.gif -------------------------------------------------------------------------------- /assets/demo/sample_32x240x426_7.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/demo/sample_32x240x426_7.gif -------------------------------------------------------------------------------- /assets/demo/sample_32x480x854_9.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/demo/sample_32x480x854_9.gif -------------------------------------------------------------------------------- /assets/demo/sora_16x240x426_26.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/demo/sora_16x240x426_26.gif
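A minimal usage sketch for the Dockerfile shown above, which builds on the hpcaitech/pytorch-cuda:2.1.0-12.1.0 base image, copies the repository into /workspace/Open-Sora, and pre-installs flash-attn, apex, and xformers. The image tag, the bind mount, and the interactive shell below are illustrative assumptions rather than commands taken from the repository's docs:

# build from the repository root (the tag "open-sora:dev" is arbitrary)
docker build -t open-sora:dev .
# run with GPU access; mounting the local checkout over the baked-in copy is optional
docker run --gpus all -it -v "$(pwd)":/workspace/Open-Sora open-sora:dev /bin/bash
# the hooks from .pre-commit-config.yaml above can be installed for local development with
pip install pre-commit && pre-commit install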
-------------------------------------------------------------------------------- /assets/demo/sora_16x240x426_27.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/demo/sora_16x240x426_27.gif -------------------------------------------------------------------------------- /assets/demo/sora_16x240x426_40.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/demo/sora_16x240x426_40.gif -------------------------------------------------------------------------------- /assets/demo/sora_16x426x240_24.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/demo/sora_16x426x240_24.gif -------------------------------------------------------------------------------- /assets/demo/sora_16x426x240_3.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/demo/sora_16x426x240_3.gif -------------------------------------------------------------------------------- /assets/demo/v1.2/sample_0002.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/demo/v1.2/sample_0002.gif -------------------------------------------------------------------------------- /assets/demo/v1.2/sample_0004.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/demo/v1.2/sample_0004.gif -------------------------------------------------------------------------------- /assets/demo/v1.2/sample_0011.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/demo/v1.2/sample_0011.gif -------------------------------------------------------------------------------- /assets/demo/v1.2/sample_0013.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/demo/v1.2/sample_0013.gif -------------------------------------------------------------------------------- /assets/demo/v1.2/sample_0052.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/demo/v1.2/sample_0052.gif -------------------------------------------------------------------------------- /assets/demo/v1.2/sample_0061.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/demo/v1.2/sample_0061.gif -------------------------------------------------------------------------------- /assets/demo/v1.2/sample_0087.gif: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/demo/v1.2/sample_0087.gif -------------------------------------------------------------------------------- /assets/demo/v1.2/sample_1718.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/demo/v1.2/sample_1718.gif -------------------------------------------------------------------------------- /assets/demo/v1.2/sample_1719.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/demo/v1.2/sample_1719.gif -------------------------------------------------------------------------------- /assets/images/condition/cactus-happy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/images/condition/cactus-happy.png -------------------------------------------------------------------------------- /assets/images/condition/cactus-sad.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/images/condition/cactus-sad.png -------------------------------------------------------------------------------- /assets/images/condition/cliff.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/images/condition/cliff.png -------------------------------------------------------------------------------- /assets/images/condition/ship.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/images/condition/ship.png -------------------------------------------------------------------------------- /assets/images/condition/sunset1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/images/condition/sunset1.png -------------------------------------------------------------------------------- /assets/images/condition/sunset2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/images/condition/sunset2.png -------------------------------------------------------------------------------- /assets/images/condition/wave.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/images/condition/wave.png -------------------------------------------------------------------------------- /assets/images/imagenet/train/n01440764/n01440764_10026.JPEG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/images/imagenet/train/n01440764/n01440764_10026.JPEG 
-------------------------------------------------------------------------------- /assets/images/imagenet/val/n01440764/ILSVRC2012_val_00000293.JPEG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/images/imagenet/val/n01440764/ILSVRC2012_val_00000293.JPEG -------------------------------------------------------------------------------- /assets/images/ocr/demo_text_det.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/images/ocr/demo_text_det.jpg -------------------------------------------------------------------------------- /assets/images/ocr/demo_text_ocr.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/images/ocr/demo_text_ocr.jpg -------------------------------------------------------------------------------- /assets/images/ocr/demo_text_recog.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/images/ocr/demo_text_recog.jpg -------------------------------------------------------------------------------- /assets/images/watermark/watermark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/images/watermark/watermark.png -------------------------------------------------------------------------------- /assets/readme/colossal_ai.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/readme/colossal_ai.png -------------------------------------------------------------------------------- /assets/readme/gradio_advanced.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/readme/gradio_advanced.png -------------------------------------------------------------------------------- /assets/readme/gradio_basic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/readme/gradio_basic.png -------------------------------------------------------------------------------- /assets/readme/gradio_option.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/readme/gradio_option.png -------------------------------------------------------------------------------- /assets/readme/icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/readme/icon.png -------------------------------------------------------------------------------- /assets/readme/llava_vs_pllava_sample.gif: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/readme/llava_vs_pllava_sample.gif -------------------------------------------------------------------------------- /assets/readme/report-03_actions_count.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/readme/report-03_actions_count.png -------------------------------------------------------------------------------- /assets/readme/report-03_objects_count.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/readme/report-03_objects_count.png -------------------------------------------------------------------------------- /assets/readme/report-03_video_stats.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/readme/report-03_video_stats.png -------------------------------------------------------------------------------- /assets/readme/report_3d_vae.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/readme/report_3d_vae.png -------------------------------------------------------------------------------- /assets/readme/report_arch.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/readme/report_arch.jpg -------------------------------------------------------------------------------- /assets/readme/report_arch_comp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/readme/report_arch_comp.png -------------------------------------------------------------------------------- /assets/readme/report_bucket.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/readme/report_bucket.png -------------------------------------------------------------------------------- /assets/readme/report_caption.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/readme/report_caption.png -------------------------------------------------------------------------------- /assets/readme/report_data_pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/readme/report_data_pipeline.png -------------------------------------------------------------------------------- /assets/readme/report_image_textlen.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/readme/report_image_textlen.png 
-------------------------------------------------------------------------------- /assets/readme/report_loss_curve_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/readme/report_loss_curve_1.png -------------------------------------------------------------------------------- /assets/readme/report_loss_curve_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/readme/report_loss_curve_2.png -------------------------------------------------------------------------------- /assets/readme/report_loss_curve_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/readme/report_loss_curve_3.png -------------------------------------------------------------------------------- /assets/readme/report_mask.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/readme/report_mask.png -------------------------------------------------------------------------------- /assets/readme/report_mask_config.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/readme/report_mask_config.png -------------------------------------------------------------------------------- /assets/readme/report_val_loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/readme/report_val_loss.png -------------------------------------------------------------------------------- /assets/readme/report_vbench_score.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/readme/report_vbench_score.png -------------------------------------------------------------------------------- /assets/readme/report_vid_val_loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/readme/report_vid_val_loss.png -------------------------------------------------------------------------------- /assets/readme/report_video_duration.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/readme/report_video_duration.png -------------------------------------------------------------------------------- /assets/readme/report_video_textlen.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/readme/report_video_textlen.png -------------------------------------------------------------------------------- /assets/readme/sample_0.gif: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/readme/sample_0.gif -------------------------------------------------------------------------------- /assets/readme/sample_1.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/readme/sample_1.gif -------------------------------------------------------------------------------- /assets/readme/sample_2.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/readme/sample_2.gif -------------------------------------------------------------------------------- /assets/readme/sample_3.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/readme/sample_3.gif -------------------------------------------------------------------------------- /assets/readme/sample_4.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/readme/sample_4.gif -------------------------------------------------------------------------------- /assets/readme/sample_5.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/readme/sample_5.gif -------------------------------------------------------------------------------- /assets/readme/sequence_parallelism.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/readme/sequence_parallelism.jpeg -------------------------------------------------------------------------------- /assets/texts/VBench/prompts_per_dimension/color.txt: -------------------------------------------------------------------------------- 1 | a red bicycle 2 | a green bicycle 3 | a blue bicycle 4 | a yellow bicycle 5 | an orange bicycle 6 | a purple bicycle 7 | a pink bicycle 8 | a black bicycle 9 | a white bicycle 10 | a red car 11 | a green car 12 | a blue car 13 | a yellow car 14 | an orange car 15 | a purple car 16 | a pink car 17 | a black car 18 | a white car 19 | a red bird 20 | a green bird 21 | a blue bird 22 | a yellow bird 23 | an orange bird 24 | a purple bird 25 | a pink bird 26 | a black bird 27 | a white bird 28 | a black cat 29 | a white cat 30 | an orange cat 31 | a yellow cat 32 | a red umbrella 33 | a green umbrella 34 | a blue umbrella 35 | a yellow umbrella 36 | an orange umbrella 37 | a purple umbrella 38 | a pink umbrella 39 | a black umbrella 40 | a white umbrella 41 | a red suitcase 42 | a green suitcase 43 | a blue suitcase 44 | a yellow suitcase 45 | an orange suitcase 46 | a purple suitcase 47 | a pink suitcase 48 | a black suitcase 49 | a white suitcase 50 | a red bowl 51 | a green bowl 52 | a blue bowl 53 | a yellow bowl 54 | an orange bowl 55 | a purple bowl 56 | a pink bowl 57 | a black bowl 58 | a white bowl 59 | a red chair 60 | a green chair 61 | a blue chair 62 | a yellow chair 63 | an orange chair 64 | a purple chair 65 | a pink chair 66 | a black chair 67 | a white 
chair 68 | a red clock 69 | a green clock 70 | a blue clock 71 | a yellow clock 72 | an orange clock 73 | a purple clock 74 | a pink clock 75 | a black clock 76 | a white clock 77 | a red vase 78 | a green vase 79 | a blue vase 80 | a yellow vase 81 | an orange vase 82 | a purple vase 83 | a pink vase 84 | a black vase 85 | a white vase 86 | -------------------------------------------------------------------------------- /assets/texts/VBench/prompts_per_dimension/multiple_objects.txt: -------------------------------------------------------------------------------- 1 | a bird and a cat 2 | a cat and a dog 3 | a dog and a horse 4 | a horse and a sheep 5 | a sheep and a cow 6 | a cow and an elephant 7 | an elephant and a bear 8 | a bear and a zebra 9 | a zebra and a giraffe 10 | a giraffe and a bird 11 | a chair and a couch 12 | a couch and a potted plant 13 | a potted plant and a tv 14 | a tv and a laptop 15 | a laptop and a remote 16 | a remote and a keyboard 17 | a keyboard and a cell phone 18 | a cell phone and a book 19 | a book and a clock 20 | a clock and a backpack 21 | a backpack and an umbrella 22 | an umbrella and a handbag 23 | a handbag and a tie 24 | a tie and a suitcase 25 | a suitcase and a vase 26 | a vase and scissors 27 | scissors and a teddy bear 28 | a teddy bear and a frisbee 29 | a frisbee and skis 30 | skis and a snowboard 31 | a snowboard and a sports ball 32 | a sports ball and a kite 33 | a kite and a baseball bat 34 | a baseball bat and a baseball glove 35 | a baseball glove and a skateboard 36 | a skateboard and a surfboard 37 | a surfboard and a tennis racket 38 | a tennis racket and a bottle 39 | a bottle and a chair 40 | an airplane and a train 41 | a train and a boat 42 | a boat and an airplane 43 | a bicycle and a car 44 | a car and a motorcycle 45 | a motorcycle and a bus 46 | a bus and a traffic light 47 | a traffic light and a fire hydrant 48 | a fire hydrant and a stop sign 49 | a stop sign and a parking meter 50 | a parking meter and a truck 51 | a truck and a bicycle 52 | a toilet and a hair drier 53 | a hair drier and a toothbrush 54 | a toothbrush and a sink 55 | a sink and a toilet 56 | a wine glass and a chair 57 | a cup and a couch 58 | a fork and a potted plant 59 | a knife and a tv 60 | a spoon and a laptop 61 | a bowl and a remote 62 | a banana and a keyboard 63 | an apple and a cell phone 64 | a sandwich and a book 65 | an orange and a clock 66 | broccoli and a backpack 67 | a carrot and an umbrella 68 | a hot dog and a handbag 69 | a pizza and a tie 70 | a donut and a suitcase 71 | a cake and a vase 72 | an oven and scissors 73 | a toaster and a teddy bear 74 | a microwave and a frisbee 75 | a refrigerator and skis 76 | a bicycle and an airplane 77 | a car and a train 78 | a motorcycle and a boat 79 | a person and a toilet 80 | a person and a hair drier 81 | a person and a toothbrush 82 | a person and a sink 83 | -------------------------------------------------------------------------------- /assets/texts/VBench/prompts_per_dimension/object_class.txt: -------------------------------------------------------------------------------- 1 | a person 2 | a bicycle 3 | a car 4 | a motorcycle 5 | an airplane 6 | a bus 7 | a train 8 | a truck 9 | a boat 10 | a traffic light 11 | a fire hydrant 12 | a stop sign 13 | a parking meter 14 | a bench 15 | a bird 16 | a cat 17 | a dog 18 | a horse 19 | a sheep 20 | a cow 21 | an elephant 22 | a bear 23 | a zebra 24 | a giraffe 25 | a backpack 26 | an umbrella 27 | a handbag 28 | a tie 29 | a suitcase 30 | a 
frisbee 31 | skis 32 | a snowboard 33 | a sports ball 34 | a kite 35 | a baseball bat 36 | a baseball glove 37 | a skateboard 38 | a surfboard 39 | a tennis racket 40 | a bottle 41 | a wine glass 42 | a cup 43 | a fork 44 | a knife 45 | a spoon 46 | a bowl 47 | a banana 48 | an apple 49 | a sandwich 50 | an orange 51 | broccoli 52 | a carrot 53 | a hot dog 54 | a pizza 55 | a donut 56 | a cake 57 | a chair 58 | a couch 59 | a potted plant 60 | a bed 61 | a dining table 62 | a toilet 63 | a tv 64 | a laptop 65 | a remote 66 | a keyboard 67 | a cell phone 68 | a microwave 69 | an oven 70 | a toaster 71 | a sink 72 | a refrigerator 73 | a book 74 | a clock 75 | a vase 76 | scissors 77 | a teddy bear 78 | a hair drier 79 | a toothbrush 80 | -------------------------------------------------------------------------------- /assets/texts/VBench/prompts_per_dimension/scene.txt: -------------------------------------------------------------------------------- 1 | alley 2 | amusement park 3 | aquarium 4 | arch 5 | art gallery 6 | bathroom 7 | bakery shop 8 | ballroom 9 | bar 10 | barn 11 | basement 12 | beach 13 | bedroom 14 | bridge 15 | botanical garden 16 | cafeteria 17 | campsite 18 | campus 19 | carrousel 20 | castle 21 | cemetery 22 | classroom 23 | cliff 24 | crosswalk 25 | construction site 26 | corridor 27 | courtyard 28 | desert 29 | downtown 30 | driveway 31 | farm 32 | food court 33 | football field 34 | forest road 35 | fountain 36 | gas station 37 | glacier 38 | golf course 39 | indoor gymnasium 40 | harbor 41 | highway 42 | hospital 43 | house 44 | iceberg 45 | industrial area 46 | jail cell 47 | junkyard 48 | kitchen 49 | indoor library 50 | lighthouse 51 | laboratory 52 | mansion 53 | marsh 54 | mountain 55 | indoor movie theater 56 | indoor museum 57 | music studio 58 | nursery 59 | ocean 60 | office 61 | palace 62 | parking lot 63 | pharmacy 64 | phone booth 65 | raceway 66 | restaurant 67 | river 68 | science museum 69 | shower 70 | ski slope 71 | sky 72 | skyscraper 73 | baseball stadium 74 | staircase 75 | street 76 | supermarket 77 | indoor swimming pool 78 | tower 79 | outdoor track 80 | train railway 81 | train station platform 82 | underwater coral reef 83 | valley 84 | volcano 85 | waterfall 86 | windmill 87 | -------------------------------------------------------------------------------- /assets/texts/imagenet_id.txt: -------------------------------------------------------------------------------- 1 | 207 2 | 360 3 | 387 4 | 974 5 | 88 6 | 979 7 | 417 8 | 279 9 | -------------------------------------------------------------------------------- /assets/texts/imagenet_labels.txt: -------------------------------------------------------------------------------- 1 | golden retriever 2 | otter 3 | lesser panda 4 | geyser 5 | macaw 6 | valley 7 | balloon 8 | golden panda 9 | -------------------------------------------------------------------------------- /assets/texts/rand_types.txt: -------------------------------------------------------------------------------- 1 | 随机电影镜头 2 | 随机电影镜头 3 | 随机电影镜头 4 | 随机电影镜头 5 | 随机电影镜头 6 | 随机任务镜头 7 | 随机任务镜头 8 | 随机任务镜头 9 | 随机任务镜头 10 | 随机任务镜头 11 | 随机游戏镜头 12 | 随机游戏镜头 13 | 随机游戏镜头 14 | 随机游戏镜头 15 | 随机游戏镜头 16 | 随机开车镜头 17 | 随机开车镜头 18 | 随机开车镜头 19 | 随机开车镜头 20 | 随机开车镜头 21 | 随机动物镜头 22 | 随机动物镜头 23 | 随机动物镜头 24 | 随机动物镜头 25 | 随机动物镜头 26 | 随机森林镜头 27 | 随机森林镜头 28 | 随机森林镜头 29 | 随机森林镜头 30 | 随机森林镜头 31 | 随机动漫镜头 32 | 随机动漫镜头 33 | 随机动漫镜头 34 | 随机动漫镜头 35 | 随机动漫镜头 36 | 随机舞蹈镜头 37 | 随机舞蹈镜头 38 | 随机舞蹈镜头 39 | 随机舞蹈镜头 40 | 随机舞蹈镜头 41 | 
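The assets/texts files above are plain one-entry-per-line prompt and label lists; the inference configs later in this dump reference them through their prompt_path field (for example, configs/dit/inference/1x256x256-class.py sets prompt_path = "./assets/texts/imagenet_id.txt"). A purely illustrative way to inspect them from a shell:

# count the entries in the ImageNet class-ID and label lists used by the class-conditioned DiT config
wc -l assets/texts/imagenet_id.txt assets/texts/imagenet_labels.txt
# preview the matching human-readable labels
head -n 3 assets/texts/imagenet_labels.txt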
-------------------------------------------------------------------------------- /assets/texts/t2i_samples.txt: -------------------------------------------------------------------------------- 1 | A small cactus with a happy face in the Sahara desert. 2 | Bright scene, aerial view,ancient city, fantasy, gorgeous light, mirror reflection, high detail, wide angle lens. 3 | Nature vs human nature, surreal, UHD, 8k, hyper details, rich colors, photograph. 4 | Poster of a mechanical cat, techical Schematics viewed from front. 5 | Luffy from ONEPIECE, handsome face, fantasy. 6 | Real beautiful woman. 7 | A alpaca made of colorful building blocks, cyberpunk. 8 | artistic 9 | -------------------------------------------------------------------------------- /assets/texts/t2i_sigma.txt: -------------------------------------------------------------------------------- 1 | Eiffel Tower was Made up of more than 2 million translucent straws to look like a cloud, with the bell tower at the top of the building, Michel installed huge foam-making machines in the forest to blow huge amounts of unpredictable wet clouds in the building's classic architecture. 2 | A gorgeously rendered papercraft world of a coral reef, rife with colorful fish and sea creatures. 3 | Full body shot, a French woman, Photography, French Streets background, backlighting, rim light, Fujifilm. 4 | Close-up photos of models, hazy light and shadow, laser metal hair accessories, soft and beautiful, light gold pupils, white eyelashes, low saturation, real skin details, clear pores and fine lines, light reflection and refraction, ultra-clear, cinematography, award-winning works. 5 | A litter of golden retriever puppies playing in the snow. Their heads pop out of the snow, covered in. 6 | Lego model, future rocket station, intricate details, high resolution, unreal engine, UHD 7 | One giant, sharp, metal square mirror in the center of the frame, four young people on the foreground, background sunny palm oil planation, tropical, realistic style, photography, nostalgic, green tone, mysterious, dreamy, bright color. 8 | Modern luxury contemporary luxury home interiors house, in the style of mimicking ruined materials, ray tracing, haunting houses, and stone, capture the essence of nature, gray and bronze, dynamic outdoor shots. 9 | Over the shoulder game perspective, game screen of Diablo 4, Inside the gorgeous palace is the wet ground, The necromancer knelt before the king, and a horde of skeletons he summoned stood at his side, cinematic light. 10 | A curvy timber house near a sea, designed by Zaha Hadid, represent the image of a cold, modern architecture, at night, white lighting, highly detailed. 11 | -------------------------------------------------------------------------------- /assets/texts/t2v_car.txt: -------------------------------------------------------------------------------- 1 | |0|A car driving on the in forest.|2|A car driving in the desert.|4|A car driving near the coast.|6|A car driving in the city.|8|A car driving near a mountain.|10|A car driving on the surface of a river.|12|A car driving on the surface of the earch.|14|A car driving in the universe.{"reference_path": "https://cdn.openai.com/tmp/s/interp/d0.mp4", "mask_strategy": "0,0,0,0,16,0.4"} 2 | -------------------------------------------------------------------------------- /assets/texts/t2v_latte.txt: -------------------------------------------------------------------------------- 1 | Yellow and black tropical fish dart through the sea. 
2 | An epic tornado attacking above aglowing city at night. 3 | Slow pan upward of blazing oak fire in an indoor fireplace. 4 | a cat wearing sunglasses and working as a lifeguard at pool. 5 | Sunset over the sea. 6 | A dog in astronaut suit and sunglasses floating in space. 7 | A astronaut in flying in space, 4k, high resolution 8 | -------------------------------------------------------------------------------- /assets/texts/t2v_ref.txt: -------------------------------------------------------------------------------- 1 | Drone view of waves crashing against the rugged cliffs along Big Sur’s garay point beach. The crashing blue waters create white-tipped waves, while the golden light of the setting sun illuminates the rocky shore. A small island with a lighthouse sits in the distance, and green shrubbery covers the cliff’s edge. The steep drop from the road down to the beach is a dramatic feat, with the cliff's edges jutting out over the sea. This is a view that captures the raw beauty of the coast and the rugged landscape of the Pacific Coast Highway. 2 | In an ornate, historical hall, a massive tidal wave peaks and begins to crash. Two surfers, seizing the moment, skillfully navigate the face of the wave. 3 | Pirate ship in a cosmic maelstrom nebula. 4 | Drone view of waves crashing against the rugged cliffs along Big Sur’s garay point beach. The crashing blue waters create white-tipped waves, while the golden light of the setting sun illuminates the rocky shore. A small island with a lighthouse sits in the distance, and green shrubbery covers the cliff’s edge. The steep drop from the road down to the beach is a dramatic feat, with the cliff's edges jutting out over the sea. This is a view that captures the raw beauty of the coast and the rugged landscape of the Pacific Coast Highway. 5 | A sad small cactus with in the Sahara desert becomes happy. 6 | A car driving on a road in the middle of a desert. 
7 | -------------------------------------------------------------------------------- /assets/texts/t2v_short.txt: -------------------------------------------------------------------------------- 1 | A fat rabbit wearing a purple robe walking through a fantasy landscape 2 | Waves crashing against a lone lighthouse, ominous lighting 3 | A mystical forest showcasing the adventures of travelers who enter 4 | A blue-haired mage singing 5 | A surreal landscape with floating islands and waterfalls in the sky craft 6 | A blue bird standing in water 7 | A young man walks alone by the seaside 8 | Pink rose on a glass surface with droplets, close-up 9 | Drove viewpoint, a subway train coming out of a tunnel 10 | Space with all planets green and pink color with background of bright white stars 11 | A city floating in an astral space, with stars and nebulae 12 | Sunrise on top of a high-rise building 13 | Pink and cyan powder explosions 14 | Deers in the woods gaze into the camera under the sunlight 15 | In a flash of lightning, a wizard appeared from thin air, his long robes billowing in the wind 16 | A futuristic cyberpunk cityscape at night with towering neon-lit skyscrapers 17 | A scene where the trees, flowers, and animals come together to create a symphony of nature 18 | A ghostly ship sailing through the clouds, navigating through a sea under a moonlit sky 19 | A sunset with beautiful beach 20 | A young man walking alone in the forest 21 | -------------------------------------------------------------------------------- /assets/texts/ucf101_id.txt: -------------------------------------------------------------------------------- 1 | 0 2 | 1 3 | 2 4 | 3 5 | 4 6 | 5 7 | -------------------------------------------------------------------------------- /assets/texts/ucf101_labels.txt: -------------------------------------------------------------------------------- 1 | Apply Eye Makeup 2 | Apply Lipstick 3 | Archery 4 | Baby Crawling 5 | Balance Beam 6 | Band Marching 7 | -------------------------------------------------------------------------------- /configs/dit/inference/16x256x256.py: -------------------------------------------------------------------------------- 1 | num_frames = 16 2 | fps = 8 3 | image_size = (256, 256) 4 | 5 | # Define model 6 | model = dict( 7 | type="DiT-XL/2", 8 | condition="text", 9 | from_pretrained="PRETRAINED_MODEL", 10 | ) 11 | vae = dict( 12 | type="VideoAutoencoderKL", 13 | from_pretrained="stabilityai/sd-vae-ft-ema", 14 | ) 15 | text_encoder = dict( 16 | type="clip", 17 | from_pretrained="openai/clip-vit-base-patch32", 18 | model_max_length=77, 19 | ) 20 | scheduler = dict( 21 | type="dpm-solver", 22 | num_sampling_steps=20, 23 | cfg_scale=4.0, 24 | ) 25 | dtype = "bf16" 26 | 27 | # Others 28 | batch_size = 2 29 | seed = 42 30 | prompt_path = "./assets/texts/ucf101_labels.txt" 31 | save_dir = "./samples/samples/" 32 | -------------------------------------------------------------------------------- /configs/dit/inference/1x256x256-class.py: -------------------------------------------------------------------------------- 1 | num_frames = 1 2 | fps = 1 3 | image_size = (256, 256) 4 | 5 | # Define model 6 | model = dict( 7 | type="DiT-XL/2", 8 | no_temporal_pos_emb=True, 9 | condition="label_1000", 10 | from_pretrained="DiT-XL-2-256x256.pt", 11 | ) 12 | vae = dict( 13 | type="VideoAutoencoderKL", 14 | from_pretrained="stabilityai/sd-vae-ft-ema", 15 | ) 16 | text_encoder = dict( 17 | type="classes", 18 | num_classes=1000, 19 | ) 20 | scheduler = dict( 21 | 
type="dpm-solver", 22 | num_sampling_steps=20, 23 | cfg_scale=4.0, 24 | ) 25 | dtype = "bf16" 26 | 27 | # Others 28 | batch_size = 2 29 | seed = 42 30 | prompt_path = "./assets/texts/imagenet_id.txt" 31 | save_dir = "./samples/samples/" 32 | -------------------------------------------------------------------------------- /configs/dit/inference/1x256x256.py: -------------------------------------------------------------------------------- 1 | num_frames = 1 2 | fps = 1 3 | image_size = (256, 256) 4 | 5 | # Define model 6 | model = dict( 7 | type="DiT-XL/2", 8 | no_temporal_pos_emb=True, 9 | condition="text", 10 | from_pretrained="PRETRAINED_MODEL", 11 | ) 12 | vae = dict( 13 | type="VideoAutoencoderKL", 14 | from_pretrained="stabilityai/sd-vae-ft-ema", 15 | ) 16 | text_encoder = dict( 17 | type="clip", 18 | from_pretrained="openai/clip-vit-base-patch32", 19 | model_max_length=77, 20 | ) 21 | scheduler = dict( 22 | type="dpm-solver", 23 | num_sampling_steps=20, 24 | cfg_scale=4.0, 25 | ) 26 | dtype = "bf16" 27 | 28 | # Others 29 | batch_size = 2 30 | seed = 42 31 | prompt_path = "./assets/texts/imagenet_labels.txt" 32 | save_dir = "./samples/samples/" 33 | -------------------------------------------------------------------------------- /configs/dit/train/16x256x256.py: -------------------------------------------------------------------------------- 1 | # Define dataset 2 | dataset = dict( 3 | type="VideoTextDataset", 4 | data_path=None, 5 | num_frames=16, 6 | frame_interval=3, 7 | image_size=(256, 256), 8 | ) 9 | 10 | # Define acceleration 11 | num_workers = 4 12 | dtype = "bf16" 13 | grad_checkpoint = True 14 | plugin = "zero2" 15 | sp_size = 1 16 | 17 | # Define model 18 | model = dict( 19 | type="DiT-XL/2", 20 | from_pretrained="DiT-XL-2-256x256.pt", 21 | enable_flash_attn=True, 22 | enable_layernorm_kernel=True, 23 | ) 24 | vae = dict( 25 | type="VideoAutoencoderKL", 26 | from_pretrained="stabilityai/sd-vae-ft-ema", 27 | ) 28 | text_encoder = dict( 29 | type="clip", 30 | from_pretrained="openai/clip-vit-base-patch32", 31 | model_max_length=77, 32 | ) 33 | scheduler = dict( 34 | type="iddpm", 35 | timestep_respacing="", 36 | ) 37 | 38 | # Others 39 | seed = 42 40 | outputs = "outputs" 41 | wandb = False 42 | 43 | epochs = 1000 44 | log_every = 10 45 | ckpt_every = 1000 46 | load = None 47 | 48 | batch_size = 8 49 | lr = 2e-5 50 | grad_clip = 1.0 51 | -------------------------------------------------------------------------------- /configs/dit/train/1x256x256.py: -------------------------------------------------------------------------------- 1 | # Define dataset 2 | dataset = dict( 3 | type="VideoTextDataset", 4 | data_path=None, 5 | num_frames=1, 6 | frame_interval=1, 7 | image_size=(256, 256), 8 | transform_name="center", 9 | ) 10 | 11 | # Define acceleration 12 | num_workers = 4 13 | dtype = "bf16" 14 | grad_checkpoint = False 15 | plugin = "zero2" 16 | sp_size = 1 17 | 18 | # Define model 19 | model = dict( 20 | type="DiT-XL/2", 21 | no_temporal_pos_emb=True, 22 | enable_flash_attn=True, 23 | enable_layernorm_kernel=True, 24 | ) 25 | vae = dict( 26 | type="VideoAutoencoderKL", 27 | from_pretrained="stabilityai/sd-vae-ft-ema", 28 | ) 29 | text_encoder = dict( 30 | type="clip", 31 | from_pretrained="openai/clip-vit-base-patch32", 32 | model_max_length=77, 33 | ) 34 | scheduler = dict( 35 | type="iddpm", 36 | timestep_respacing="", 37 | ) 38 | 39 | # Others 40 | seed = 42 41 | outputs = "outputs" 42 | wandb = False 43 | 44 | epochs = 1000 45 | log_every = 10 46 | ckpt_every = 1000 47 | 
load = None 48 | 49 | batch_size = 128 50 | lr = 1e-4 # according to DiT repo 51 | grad_clip = 1.0 52 | -------------------------------------------------------------------------------- /configs/latte/inference/16x256x256-class.py: -------------------------------------------------------------------------------- 1 | num_frames = 16 2 | fps = 8 3 | image_size = (256, 256) 4 | 5 | # Define model 6 | model = dict( 7 | type="Latte-XL/2", 8 | condition="label_101", 9 | from_pretrained="Latte-XL-2-256x256-ucf101.pt", 10 | ) 11 | vae = dict( 12 | type="VideoAutoencoderKL", 13 | from_pretrained="stabilityai/sd-vae-ft-ema", 14 | ) 15 | text_encoder = dict( 16 | type="classes", 17 | num_classes=101, 18 | ) 19 | scheduler = dict( 20 | type="dpm-solver", 21 | num_sampling_steps=20, 22 | cfg_scale=4.0, 23 | ) 24 | dtype = "bf16" 25 | 26 | # Others 27 | batch_size = 2 28 | seed = 42 29 | prompt_path = "./assets/texts/ucf101_id.txt" 30 | save_dir = "./samples/samples/" 31 | -------------------------------------------------------------------------------- /configs/latte/inference/16x256x256.py: -------------------------------------------------------------------------------- 1 | num_frames = 16 2 | fps = 8 3 | image_size = (256, 256) 4 | 5 | # Define model 6 | model = dict( 7 | type="Latte-XL/2", 8 | condition="text", 9 | from_pretrained="PRETRAINED_MODEL", 10 | ) 11 | vae = dict( 12 | type="VideoAutoencoderKL", 13 | from_pretrained="stabilityai/sd-vae-ft-ema", 14 | ) 15 | text_encoder = dict( 16 | type="clip", 17 | from_pretrained="openai/clip-vit-base-patch32", 18 | model_max_length=77, 19 | ) 20 | scheduler = dict( 21 | type="dpm-solver", 22 | num_sampling_steps=20, 23 | cfg_scale=4.0, 24 | ) 25 | dtype = "bf16" 26 | 27 | # Others 28 | batch_size = 2 29 | seed = 42 30 | prompt_path = "./assets/texts/ucf101_labels.txt" 31 | save_dir = "./samples/samples/" 32 | -------------------------------------------------------------------------------- /configs/latte/train/16x256x256.py: -------------------------------------------------------------------------------- 1 | # Define dataset 2 | dataset = dict( 3 | type="VideoTextDataset", 4 | data_path=None, 5 | num_frames=16, 6 | frame_interval=3, 7 | image_size=(256, 256), 8 | ) 9 | 10 | # Define acceleration 11 | num_workers = 4 12 | dtype = "bf16" 13 | grad_checkpoint = True 14 | plugin = "zero2" 15 | sp_size = 1 16 | 17 | # Define model 18 | model = dict( 19 | type="Latte-XL/2", 20 | enable_flash_attn=True, 21 | enable_layernorm_kernel=True, 22 | ) 23 | vae = dict( 24 | type="VideoAutoencoderKL", 25 | from_pretrained="stabilityai/sd-vae-ft-ema", 26 | ) 27 | text_encoder = dict( 28 | type="clip", 29 | from_pretrained="openai/clip-vit-base-patch32", 30 | model_max_length=77, 31 | ) 32 | scheduler = dict( 33 | type="iddpm", 34 | timestep_respacing="", 35 | ) 36 | 37 | # Others 38 | seed = 42 39 | outputs = "outputs" 40 | wandb = False 41 | 42 | epochs = 1000 43 | log_every = 10 44 | ckpt_every = 1000 45 | load = None 46 | 47 | batch_size = 8 48 | lr = 2e-5 49 | grad_clip = 1.0 50 | -------------------------------------------------------------------------------- /configs/opensora-v1-1/inference/sample.py: -------------------------------------------------------------------------------- 1 | num_frames = 16 2 | frame_interval = 3 3 | fps = 24 4 | image_size = (240, 426) 5 | multi_resolution = "STDiT2" 6 | 7 | # Define model 8 | model = dict( 9 | type="STDiT2-XL/2", 10 | from_pretrained="hpcai-tech/OpenSora-STDiT-v2-stage3", 11 | input_sq_size=512, 12 | qk_norm=True, 13 | 
qk_norm_legacy=True, 14 | enable_flash_attn=True, 15 | enable_layernorm_kernel=True, 16 | ) 17 | vae = dict( 18 | type="VideoAutoencoderKL", 19 | from_pretrained="stabilityai/sd-vae-ft-ema", 20 | cache_dir=None, # "/mnt/hdd/cached_models", 21 | micro_batch_size=4, 22 | ) 23 | text_encoder = dict( 24 | type="t5", 25 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 26 | cache_dir=None, # "/mnt/hdd/cached_models", 27 | model_max_length=200, 28 | ) 29 | scheduler = dict( 30 | type="iddpm", 31 | num_sampling_steps=100, 32 | cfg_scale=7.0, 33 | cfg_channel=3, # or None 34 | ) 35 | dtype = "bf16" 36 | 37 | # Condition 38 | prompt_path = "./assets/texts/t2v_samples.txt" 39 | prompt = None # prompt has higher priority than prompt_path 40 | 41 | # Others 42 | batch_size = 1 43 | seed = 42 44 | save_dir = "./samples/samples/" 45 | -------------------------------------------------------------------------------- /configs/opensora-v1-1/train/image.py: -------------------------------------------------------------------------------- 1 | # Define dataset 2 | dataset = dict( 3 | type="VariableVideoTextDataset", 4 | data_path=None, 5 | num_frames=None, 6 | frame_interval=3, 7 | image_size=(None, None), 8 | transform_name="resize_crop", 9 | ) 10 | bucket_config = { # 6s/it 11 | "256": {1: (1.0, 256)}, 12 | "512": {1: (1.0, 80)}, 13 | "480p": {1: (1.0, 52)}, 14 | "1024": {1: (1.0, 20)}, 15 | "1080p": {1: (1.0, 8)}, 16 | } 17 | 18 | # Define acceleration 19 | num_workers = 4 20 | num_bucket_build_workers = 16 21 | dtype = "bf16" 22 | grad_checkpoint = True 23 | plugin = "zero2" 24 | sp_size = 1 25 | 26 | # Define model 27 | model = dict( 28 | type="STDiT2-XL/2", 29 | from_pretrained=None, 30 | input_sq_size=512, # pretrained model is trained on 512x512 31 | qk_norm=True, 32 | qk_norm_legacy=True, 33 | enable_flash_attn=True, 34 | enable_layernorm_kernel=True, 35 | ) 36 | vae = dict( 37 | type="VideoAutoencoderKL", 38 | from_pretrained="stabilityai/sd-vae-ft-ema", 39 | micro_batch_size=4, 40 | local_files_only=True, 41 | ) 42 | text_encoder = dict( 43 | type="t5", 44 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 45 | model_max_length=200, 46 | shardformer=True, 47 | local_files_only=True, 48 | ) 49 | scheduler = dict( 50 | type="iddpm", 51 | timestep_respacing="", 52 | ) 53 | 54 | # Others 55 | seed = 42 56 | outputs = "outputs" 57 | wandb = False 58 | 59 | epochs = 1000 60 | log_every = 10 61 | ckpt_every = 500 62 | load = None 63 | 64 | batch_size = 10 # only for logging 65 | lr = 2e-5 66 | grad_clip = 1.0 67 | -------------------------------------------------------------------------------- /configs/opensora-v1-1/train/image_rflow.py: -------------------------------------------------------------------------------- 1 | # Define dataset 2 | # dataset = dict( 3 | # type="VariableVideoTextDataset", 4 | # data_path=None, 5 | # num_frames=None, 6 | # frame_interval=3, 7 | # image_size=(None, None), 8 | # transform_name="resize_crop", 9 | # ) 10 | dataset = dict( 11 | type="VideoTextDataset", 12 | data_path=None, 13 | num_frames=1, 14 | frame_interval=1, 15 | image_size=(256, 256), 16 | transform_name="center", 17 | ) 18 | bucket_config = { # 6s/it 19 | "256": {1: (1.0, 256)}, 20 | "512": {1: (1.0, 80)}, 21 | "480p": {1: (1.0, 52)}, 22 | "1024": {1: (1.0, 20)}, 23 | "1080p": {1: (1.0, 8)}, 24 | } 25 | 26 | # Define acceleration 27 | num_workers = 16 28 | dtype = "bf16" 29 | grad_checkpoint = True 30 | plugin = "zero2" 31 | sp_size = 1 32 | 33 | # Define model 34 | # model = dict( 35 | # type="DiT-XL/2", 36 | # 
from_pretrained="/home/zhaowangbo/wangbo/PixArt-alpha/pretrained_models/PixArt-XL-2-512x512.pth", 37 | # # input_sq_size=512, # pretrained model is trained on 512x512 38 | # enable_flash_attn=True, 39 | # enable_layernorm_kernel=True, 40 | # ) 41 | model = dict( 42 | type="PixArt-XL/2", 43 | space_scale=1.0, 44 | time_scale=1.0, 45 | no_temporal_pos_emb=True, 46 | from_pretrained="PixArt-XL-2-512x512.pth", 47 | enable_flash_attn=True, 48 | enable_layernorm_kernel=True, 49 | ) 50 | # model = dict( 51 | # type="DiT-XL/2", 52 | # # space_scale=1.0, 53 | # # time_scale=1.0, 54 | # no_temporal_pos_emb=True, 55 | # # from_pretrained="PixArt-XL-2-512x512.pth", 56 | # from_pretrained="/home/zhaowangbo/wangbo/PixArt-alpha/pretrained_models/PixArt-XL-2-512x512.pth", 57 | # enable_flash_attn=True, 58 | # enable_layernorm_kernel=True, 59 | # ) 60 | vae = dict( 61 | type="VideoAutoencoderKL", 62 | from_pretrained="stabilityai/sd-vae-ft-ema", 63 | micro_batch_size=4, 64 | ) 65 | text_encoder = dict( 66 | type="t5", 67 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 68 | model_max_length=200, 69 | shardformer=True, 70 | ) 71 | scheduler = dict( 72 | type="rflow", 73 | # timestep_respacing="", 74 | ) 75 | 76 | # Others 77 | seed = 42 78 | outputs = "outputs" 79 | wandb = False 80 | 81 | epochs = 10 82 | log_every = 10 83 | ckpt_every = 500 84 | load = None 85 | 86 | batch_size = 100 # only for logging 87 | lr = 2e-5 88 | grad_clip = 1.0 89 | -------------------------------------------------------------------------------- /configs/opensora-v1-1/train/stage1.py: -------------------------------------------------------------------------------- 1 | # Define dataset 2 | dataset = dict( 3 | type="VariableVideoTextDataset", 4 | data_path=None, 5 | num_frames=None, 6 | frame_interval=3, 7 | image_size=(None, None), 8 | transform_name="resize_crop", 9 | ) 10 | # IMG: 1024 (20%) 512 (30%) 256 (50%) drop (50%) 11 | bucket_config = { # 1s/it 12 | "144p": {1: (0.5, 48), 16: (1.0, 6), 32: (1.0, 3), 96: (1.0, 1)}, 13 | "256": {1: (0.5, 24), 16: (0.5, 3), 48: (0.5, 1), 64: (0.0, None)}, 14 | "240p": {16: (0.3, 2), 32: (0.3, 1), 64: (0.0, None)}, 15 | "512": {1: (0.4, 12)}, 16 | "1024": {1: (0.3, 3)}, 17 | } 18 | mask_ratios = { 19 | "identity": 0.75, 20 | "quarter_random": 0.025, 21 | "quarter_head": 0.025, 22 | "quarter_tail": 0.025, 23 | "quarter_head_tail": 0.05, 24 | "image_random": 0.025, 25 | "image_head": 0.025, 26 | "image_tail": 0.025, 27 | "image_head_tail": 0.05, 28 | } 29 | 30 | # Define acceleration 31 | num_workers = 8 32 | num_bucket_build_workers = 16 33 | dtype = "bf16" 34 | grad_checkpoint = False 35 | plugin = "zero2" 36 | sp_size = 1 37 | 38 | # Define model 39 | model = dict( 40 | type="STDiT2-XL/2", 41 | from_pretrained=None, 42 | input_sq_size=512, # pretrained model is trained on 512x512 43 | qk_norm=True, 44 | qk_norm_legacy=True, 45 | enable_flash_attn=True, 46 | enable_layernorm_kernel=True, 47 | ) 48 | vae = dict( 49 | type="VideoAutoencoderKL", 50 | from_pretrained="stabilityai/sd-vae-ft-ema", 51 | micro_batch_size=4, 52 | local_files_only=True, 53 | ) 54 | text_encoder = dict( 55 | type="t5", 56 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 57 | model_max_length=200, 58 | shardformer=True, 59 | local_files_only=True, 60 | ) 61 | scheduler = dict( 62 | type="iddpm", 63 | timestep_respacing="", 64 | ) 65 | 66 | # Others 67 | seed = 42 68 | outputs = "outputs" 69 | wandb = False 70 | 71 | epochs = 1000 72 | log_every = 10 73 | ckpt_every = 500 74 | load = None 75 | 76 | batch_size = None 77 | lr = 2e-5 
78 | grad_clip = 1.0 79 | -------------------------------------------------------------------------------- /configs/opensora-v1-1/train/stage2.py: -------------------------------------------------------------------------------- 1 | # Define dataset 2 | dataset = dict( 3 | type="VariableVideoTextDataset", 4 | data_path=None, 5 | num_frames=None, 6 | frame_interval=3, 7 | image_size=(None, None), 8 | transform_name="resize_crop", 9 | ) 10 | bucket_config = { # 7s/it 11 | "144p": {1: (1.0, 48), 16: (1.0, 17), 32: (1.0, 9), 64: (1.0, 4), 128: (1.0, 1)}, 12 | "256": {1: (0.8, 254), 16: (0.5, 17), 32: (0.5, 9), 64: (0.5, 4), 128: (0.5, 1)}, 13 | "240p": {1: (0.1, 20), 16: (0.9, 17), 32: (0.8, 9), 64: (0.8, 4), 128: (0.8, 2)}, 14 | "512": {1: (0.5, 86), 16: (0.2, 4), 32: (0.2, 2), 64: (0.2, 1), 128: (0.0, None)}, 15 | "480p": {1: (0.4, 54), 16: (0.4, 4), 32: (0.0, None)}, 16 | "720p": {1: (0.1, 20), 16: (0.1, 2), 32: (0.0, None)}, 17 | "1024": {1: (0.3, 20)}, 18 | "1080p": {1: (0.4, 8)}, 19 | } 20 | mask_ratios = { 21 | "identity": 0.75, 22 | "quarter_random": 0.025, 23 | "quarter_head": 0.025, 24 | "quarter_tail": 0.025, 25 | "quarter_head_tail": 0.05, 26 | "image_random": 0.025, 27 | "image_head": 0.025, 28 | "image_tail": 0.025, 29 | "image_head_tail": 0.05, 30 | } 31 | 32 | # Define acceleration 33 | num_workers = 8 34 | num_bucket_build_workers = 16 35 | dtype = "bf16" 36 | grad_checkpoint = True 37 | plugin = "zero2" 38 | sp_size = 1 39 | 40 | # Define model 41 | model = dict( 42 | type="STDiT2-XL/2", 43 | from_pretrained=None, 44 | input_sq_size=512, # pretrained model is trained on 512x512 45 | qk_norm=True, 46 | qk_norm_legacy=True, 47 | enable_flash_attn=True, 48 | enable_layernorm_kernel=True, 49 | ) 50 | vae = dict( 51 | type="VideoAutoencoderKL", 52 | from_pretrained="stabilityai/sd-vae-ft-ema", 53 | micro_batch_size=4, 54 | local_files_only=True, 55 | ) 56 | text_encoder = dict( 57 | type="t5", 58 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 59 | model_max_length=200, 60 | shardformer=True, 61 | local_files_only=True, 62 | ) 63 | scheduler = dict( 64 | type="iddpm", 65 | timestep_respacing="", 66 | ) 67 | 68 | # Others 69 | seed = 42 70 | outputs = "outputs" 71 | wandb = False 72 | 73 | epochs = 1000 74 | log_every = 10 75 | ckpt_every = 500 76 | load = None 77 | 78 | batch_size = None 79 | lr = 2e-5 80 | grad_clip = 1.0 81 | -------------------------------------------------------------------------------- /configs/opensora-v1-1/train/stage3.py: -------------------------------------------------------------------------------- 1 | # Define dataset 2 | dataset = dict( 3 | type="VariableVideoTextDataset", 4 | data_path=None, 5 | num_frames=None, 6 | frame_interval=3, 7 | image_size=(None, None), 8 | transform_name="resize_crop", 9 | ) 10 | bucket_config = { # 13s/it 11 | "144p": {1: (1.0, 200), 16: (1.0, 36), 32: (1.0, 18), 64: (1.0, 9), 128: (1.0, 4)}, 12 | "256": {1: (0.8, 200), 16: (0.5, 22), 32: (0.5, 11), 64: (0.5, 6), 128: (0.8, 4)}, 13 | "240p": {1: (0.8, 200), 16: (0.5, 22), 32: (0.5, 10), 64: (0.5, 6), 128: (0.5, 3)}, 14 | "360p": {1: (0.5, 120), 16: (0.5, 9), 32: (0.5, 4), 64: (0.5, 2), 128: (0.5, 1)}, 15 | "512": {1: (0.5, 120), 16: (0.5, 9), 32: (0.5, 4), 64: (0.5, 2), 128: (0.8, 1)}, 16 | "480p": {1: (0.4, 80), 16: (0.6, 6), 32: (0.6, 3), 64: (0.6, 1), 128: (0.0, None)}, 17 | "720p": {1: (0.4, 40), 16: (0.6, 3), 32: (0.6, 1), 96: (0.0, None)}, 18 | "1024": {1: (0.3, 40)}, 19 | } 20 | mask_ratios = { 21 | "identity": 0.75, 22 | "quarter_random": 0.025, 23 | "quarter_head": 
0.025, 24 | "quarter_tail": 0.025, 25 | "quarter_head_tail": 0.05, 26 | "image_random": 0.025, 27 | "image_head": 0.025, 28 | "image_tail": 0.025, 29 | "image_head_tail": 0.05, 30 | } 31 | 32 | # Define acceleration 33 | num_workers = 8 34 | num_bucket_build_workers = 16 35 | dtype = "bf16" 36 | grad_checkpoint = True 37 | plugin = "zero2" 38 | sp_size = 1 39 | 40 | # Define model 41 | model = dict( 42 | type="STDiT2-XL/2", 43 | from_pretrained=None, 44 | input_sq_size=512, # pretrained model is trained on 512x512 45 | qk_norm=True, 46 | qk_norm_legacy=True, 47 | enable_flash_attn=True, 48 | enable_layernorm_kernel=True, 49 | ) 50 | vae = dict( 51 | type="VideoAutoencoderKL", 52 | from_pretrained="stabilityai/sd-vae-ft-ema", 53 | micro_batch_size=4, 54 | local_files_only=True, 55 | ) 56 | text_encoder = dict( 57 | type="t5", 58 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 59 | model_max_length=200, 60 | shardformer=True, 61 | local_files_only=True, 62 | ) 63 | scheduler = dict( 64 | type="iddpm", 65 | timestep_respacing="", 66 | ) 67 | 68 | # Others 69 | seed = 42 70 | outputs = "outputs" 71 | wandb = False 72 | 73 | epochs = 1000 74 | log_every = 10 75 | ckpt_every = 500 76 | load = None 77 | 78 | batch_size = None 79 | lr = 2e-5 80 | grad_clip = 1.0 81 | -------------------------------------------------------------------------------- /configs/opensora-v1-1/train/video.py: -------------------------------------------------------------------------------- 1 | # Define dataset 2 | dataset = dict( 3 | type="VariableVideoTextDataset", 4 | data_path=None, 5 | num_frames=None, 6 | frame_interval=3, 7 | image_size=(None, None), 8 | transform_name="resize_crop", 9 | ) 10 | bucket_config = { # 6s/it 11 | "240p": {16: (1.0, 16), 32: (1.0, 8), 64: (1.0, 4), 128: (1.0, 2)}, 12 | "256": {1: (1.0, 256)}, 13 | "512": {1: (0.5, 80)}, 14 | "480p": {1: (0.4, 52), 16: (0.4, 4), 32: (0.0, None)}, 15 | "720p": {16: (0.1, 2), 32: (0.0, None)}, # No examples now 16 | "1024": {1: (0.3, 20)}, 17 | "1080p": {1: (0.3, 8)}, 18 | } 19 | 20 | # Define acceleration 21 | num_workers = 4 22 | num_bucket_build_workers = 16 23 | dtype = "bf16" 24 | grad_checkpoint = True 25 | plugin = "zero2" 26 | sp_size = 1 27 | 28 | # Define model 29 | model = dict( 30 | type="STDiT2-XL/2", 31 | from_pretrained=None, 32 | input_sq_size=512, # pretrained model is trained on 512x512 33 | qk_norm=True, 34 | qk_norm_legacy=True, 35 | enable_flash_attn=True, 36 | enable_layernorm_kernel=True, 37 | ) 38 | vae = dict( 39 | type="VideoAutoencoderKL", 40 | from_pretrained="stabilityai/sd-vae-ft-ema", 41 | micro_batch_size=4, 42 | local_files_only=True, 43 | ) 44 | text_encoder = dict( 45 | type="t5", 46 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 47 | model_max_length=200, 48 | shardformer=True, 49 | local_files_only=True, 50 | ) 51 | scheduler = dict( 52 | type="iddpm", 53 | timestep_respacing="", 54 | ) 55 | 56 | # Others 57 | seed = 42 58 | outputs = "outputs" 59 | wandb = False 60 | 61 | epochs = 1000 62 | log_every = 10 63 | ckpt_every = 500 64 | load = None 65 | 66 | batch_size = 10 # only for logging 67 | lr = 2e-5 68 | grad_clip = 1.0 69 | -------------------------------------------------------------------------------- /configs/opensora-v1-2/inference/sample.py: -------------------------------------------------------------------------------- 1 | resolution = "240p" 2 | aspect_ratio = "9:16" 3 | num_frames = 51 4 | fps = 24 5 | frame_interval = 1 6 | save_fps = 24 7 | 8 | save_dir = "./samples/samples/" 9 | seed = 42 10 | batch_size = 1 11 | 
multi_resolution = "STDiT2" 12 | dtype = "bf16" 13 | condition_frame_length = 5 14 | align = 5 15 | 16 | model = dict( 17 | type="STDiT3-XL/2", 18 | from_pretrained="hpcai-tech/OpenSora-STDiT-v3", 19 | qk_norm=True, 20 | enable_flash_attn=True, 21 | enable_layernorm_kernel=True, 22 | ) 23 | vae = dict( 24 | type="OpenSoraVAE_V1_2", 25 | from_pretrained="hpcai-tech/OpenSora-VAE-v1.2", 26 | micro_frame_size=17, 27 | micro_batch_size=4, 28 | ) 29 | text_encoder = dict( 30 | type="t5", 31 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 32 | model_max_length=300, 33 | ) 34 | scheduler = dict( 35 | type="rflow", 36 | use_timestep_transform=True, 37 | num_sampling_steps=30, 38 | cfg_scale=7.0, 39 | ) 40 | 41 | aes = 6.5 42 | flow = None 43 | -------------------------------------------------------------------------------- /configs/opensora-v1-2/inference/sample_hf.py: -------------------------------------------------------------------------------- 1 | resolution = "240p" 2 | aspect_ratio = "9:16" 3 | num_frames = 51 4 | fps = 24 5 | frame_interval = 1 6 | save_fps = 24 7 | 8 | save_dir = "./samples/samples/" 9 | seed = 42 10 | batch_size = 1 11 | multi_resolution = "STDiT2" 12 | dtype = "bf16" 13 | condition_frame_length = 5 14 | align = 5 15 | 16 | model = dict( 17 | type="STDiT3-XL/2", 18 | from_pretrained="hpcai-tech/OpenSora-STDiT-v3", 19 | qk_norm=True, 20 | enable_flash_attn=True, 21 | enable_layernorm_kernel=True, 22 | force_huggingface=True, 23 | ) 24 | vae = dict( 25 | type="OpenSoraVAE_V1_2", 26 | from_pretrained="hpcai-tech/OpenSora-VAE-v1.2", 27 | micro_frame_size=17, 28 | micro_batch_size=4, 29 | force_huggingface=True, 30 | ) 31 | text_encoder = dict( 32 | type="t5", 33 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 34 | model_max_length=300, 35 | ) 36 | scheduler = dict( 37 | type="rflow", 38 | use_timestep_transform=True, 39 | num_sampling_steps=30, 40 | cfg_scale=7.0, 41 | ) 42 | 43 | aes = 6.5 44 | flow = None 45 | -------------------------------------------------------------------------------- /configs/opensora-v1-2/lambda/stage2.py: -------------------------------------------------------------------------------- 1 | # Dataset settings 2 | dataset = dict( 3 | type="VariableVideoTextDataset", 4 | transform_name="resize_crop", 5 | ) 6 | 7 | # webvid 8 | bucket_config = { # 12s/it 9 | "144p": {1: (1.0, 475), 51: (1.0, 51), 102: ((1.0, 0.33), 27), 204: ((1.0, 0.1), 13), 408: ((1.0, 0.1), 6)}, 10 | # --- 11 | "256": {1: (0.4, 297), 51: (0.5, 20), 102: ((0.5, 0.33), 10), 204: ((0.5, 1.0), 5), 408: ((0.5, 1.0), 2)}, 12 | "240p": {1: (0.3, 297), 51: (0.4, 20), 102: ((0.4, 0.33), 10), 204: ((0.4, 1.0), 5), 408: ((0.4, 1.0), 2)}, 13 | # --- 14 | "360p": {1: (0.5, 141), 51: (0.15, 8), 102: ((0.3, 0.5), 4), 204: ((0.3, 1.0), 2), 408: ((0.5, 0.5), 1)}, 15 | "512": {1: (0.4, 141), 51: (0.15, 8), 102: ((0.2, 0.4), 4), 204: ((0.2, 1.0), 2), 408: ((0.4, 0.5), 1)}, 16 | # --- 17 | "480p": {1: (0.5, 89), 51: (0.2, 5), 102: (0.2, 2), 204: (0.1, 1)}, 18 | # --- 19 | "720p": {1: (0.1, 36), 51: (0.03, 1)}, 20 | "1024": {1: (0.1, 36), 51: (0.02, 1)}, 21 | # --- 22 | "1080p": {1: (0.01, 5)}, 23 | # --- 24 | "2048": {1: (0.01, 5)}, 25 | } 26 | 27 | grad_checkpoint = True 28 | 29 | # Acceleration settings 30 | num_workers = 8 31 | num_bucket_build_workers = 16 32 | #prefetch_factor = 3 33 | dtype = "bf16" 34 | plugin = "zero2" 35 | 36 | # Model settings 37 | model = dict( 38 | type="STDiT3-XL/2", 39 | from_pretrained=None, 40 | qk_norm=True, 41 | enable_flash_attn=True, 42 | enable_layernorm_kernel=True, 43 | 
freeze_y_embedder=True, 44 | ) 45 | vae = dict( 46 | type="OpenSoraVAE_V1_2", 47 | from_pretrained="hpcai-tech/OpenSora-VAE-v1.2", 48 | micro_frame_size=17, 49 | micro_batch_size=4, 50 | ) 51 | text_encoder = dict( 52 | type="t5", 53 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 54 | model_max_length=300, 55 | shardformer=True, 56 | ) 57 | scheduler = dict( 58 | type="rflow", 59 | use_timestep_transform=True, 60 | sample_method="logit-normal", 61 | ) 62 | 63 | # Mask settings 64 | # 25% 65 | mask_ratios = { 66 | "random": 0.005, 67 | "intepolate": 0.002, 68 | "quarter_random": 0.007, 69 | "quarter_head": 0.002, 70 | "quarter_tail": 0.002, 71 | "quarter_head_tail": 0.002, 72 | "image_random": 0.0, 73 | "image_head": 0.22, 74 | "image_tail": 0.005, 75 | "image_head_tail": 0.005, 76 | } 77 | 78 | 79 | # Log settings 80 | seed = 42 81 | outputs = "outputs_speedrun" 82 | wandb = True 83 | epochs = 5 84 | log_every = 10 85 | ckpt_every = 100 86 | 87 | # optimization settings 88 | load = None 89 | grad_clip = 1.0 90 | lr = 0.00016 91 | ema_decay = 0.99 92 | adam_eps = 1e-15 93 | warmup_steps = 400 94 | weight_decay = 0.01 -------------------------------------------------------------------------------- /configs/opensora-v1-2/lambda/stage3.py: -------------------------------------------------------------------------------- 1 | # Dataset settings 2 | dataset = dict( 3 | type="VariableVideoTextDataset", 4 | transform_name="resize_crop", 5 | ) 6 | 7 | # webvid 8 | bucket_config = { # 20s/it 9 | "144p": {1: (1.0, 475), 51: (1.0, 51), 102: (1.0, 27), 204: (1.0, 13), 408: (1.0, 6)}, 10 | # --- 11 | "256": {1: (1.0, 297), 51: (0.5, 20), 102: (0.5, 10), 204: (0.5, 5), 408: ((0.5, 0.5), 2)}, 12 | "240p": {1: (1.0, 297), 51: (0.5, 20), 102: (0.5, 10), 204: (0.5, 5), 408: ((0.5, 0.4), 2)}, 13 | # --- 14 | "360p": {1: (1.0, 141), 51: (0.5, 8), 102: (0.5, 4), 204: (0.5, 2), 408: ((0.5, 0.3), 1)}, 15 | "512": {1: (1.0, 141), 51: (0.5, 8), 102: (0.5, 4), 204: (0.5, 2), 408: ((0.5, 0.2), 1)}, 16 | # --- 17 | "480p": {1: (1.0, 89), 51: (0.5, 5), 102: (0.5, 3), 204: ((0.5, 0.5), 1), 408: (0.0, None)}, 18 | # --- 19 | "720p": {1: (0.3, 36), 51: (0.2, 2), 102: (0.1, 1), 204: (0.0, None)}, 20 | "1024": {1: (0.3, 36), 51: (0.1, 2), 102: (0.1, 1), 204: (0.0, None)}, 21 | # --- 22 | "1080p": {1: (0.1, 5)}, 23 | # --- 24 | "2048": {1: (0.05, 5)}, 25 | } 26 | 27 | grad_checkpoint = True 28 | 29 | # Acceleration settings 30 | num_workers = 8 31 | num_bucket_build_workers = 16 32 | #prefetch_factor = 2 33 | dtype = "bf16" 34 | plugin = "zero2" 35 | 36 | # Model settings 37 | model = dict( 38 | type="STDiT3-XL/2", 39 | from_pretrained=None, 40 | qk_norm=True, 41 | enable_flash_attn=True, 42 | enable_layernorm_kernel=True, 43 | freeze_y_embedder=True, 44 | ) 45 | vae = dict( 46 | type="OpenSoraVAE_V1_2", 47 | from_pretrained="hpcai-tech/OpenSora-VAE-v1.2", 48 | micro_frame_size=17, 49 | micro_batch_size=4, 50 | ) 51 | text_encoder = dict( 52 | type="t5", 53 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 54 | model_max_length=300, 55 | shardformer=True, 56 | ) 57 | scheduler = dict( 58 | type="rflow", 59 | use_timestep_transform=True, 60 | sample_method="logit-normal", 61 | ) 62 | 63 | # Mask settings 64 | # 25% 65 | mask_ratios = { 66 | "random": 0.01, 67 | "intepolate": 0.002, 68 | "quarter_random": 0.002, 69 | "quarter_head": 0.002, 70 | "quarter_tail": 0.002, 71 | "quarter_head_tail": 0.002, 72 | "image_random": 0.0, 73 | "image_head": 0.22, 74 | "image_tail": 0.005, 75 | "image_head_tail": 0.005, 76 | } 77 | 78 | # Log settings 79 
| seed = 42 80 | outputs = "outputs_speedrun" 81 | wandb = True 82 | epochs = 5 83 | log_every = 10 84 | ckpt_every = 100 85 | 86 | # optimization settings 87 | load = None 88 | grad_clip = 1.0 89 | lr = 2e-4 90 | ema_decay = 0.99 91 | adam_eps = 1e-15 92 | warmup_steps = 1000 93 | weight_decay = 0.01 94 | -------------------------------------------------------------------------------- /configs/opensora-v1-2/lambda/stage4.py: -------------------------------------------------------------------------------- 1 | # Dataset settings 2 | dataset = dict( 3 | type="VariableVideoTextDataset", 4 | transform_name="resize_crop", 5 | ) 6 | 7 | # webvid 8 | bucket_config = { # 12s/it 9 | "144p": {1: (1.0, 475), 51: (1.0, 51), 102: ((1.0, 0.33), 27), 204: ((1.0, 0.1), 13), 408: ((1.0, 0.1), 6)}, 10 | # --- 11 | "256": {1: (0.4, 297), 51: (0.5, 20), 102: ((0.5, 0.33), 10), 204: ((0.5, 0.1), 5), 408: ((0.5, 0.1), 2)}, 12 | "240p": {1: (0.3, 297), 51: (0.4, 20), 102: ((0.4, 0.33), 10), 204: ((0.4, 0.1), 5), 408: ((0.4, 0.1), 2)}, 13 | # --- 14 | "360p": {1: (0.2, 141), 51: (0.15, 8), 102: ((0.15, 0.33), 4), 204: ((0.15, 0.1), 2), 408: ((0.15, 0.1), 1)}, 15 | "512": {1: (0.1, 141)}, 16 | # --- 17 | "480p": {1: (0.1, 89)}, 18 | # --- 19 | "720p": {1: (0.05, 36)}, 20 | "1024": {1: (0.05, 36)}, 21 | # --- 22 | "1080p": {1: (0.1, 5)}, 23 | # --- 24 | "2048": {1: (0.1, 5)}, 25 | } 26 | 27 | grad_checkpoint = True 28 | 29 | # Acceleration settings 30 | num_workers = 8 31 | num_bucket_build_workers = 16 32 | #prefetch_factor = 2 33 | dtype = "bf16" 34 | plugin = "zero2" 35 | 36 | # Model settings 37 | model = dict( 38 | type="STDiT3-XL/2", 39 | from_pretrained=None, 40 | qk_norm=True, 41 | enable_flash_attn=True, 42 | enable_layernorm_kernel=True, 43 | freeze_y_embedder=True, 44 | ) 45 | vae = dict( 46 | type="OpenSoraVAE_V1_2", 47 | from_pretrained="hpcai-tech/OpenSora-VAE-v1.2", 48 | micro_frame_size=17, 49 | micro_batch_size=4, 50 | ) 51 | text_encoder = dict( 52 | type="t5", 53 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 54 | model_max_length=300, 55 | shardformer=True, 56 | ) 57 | scheduler = dict( 58 | type="rflow", 59 | use_timestep_transform=True, 60 | sample_method="logit-normal", 61 | ) 62 | 63 | # Mask settings 64 | mask_ratios = { 65 | "random": 0.0 66 | } 67 | 68 | # Log settings 69 | seed = 42 70 | outputs = "outputs_speedrun" 71 | wandb = True 72 | epochs = 10 73 | log_every = 10 74 | ckpt_every = 100 75 | 76 | # optimization settings 77 | load = None 78 | grad_clip = 1.0 79 | ema_decay = 0.99 80 | adam_eps = 1e-15 81 | weight_decay = 0.01 82 | 83 | # lr scheduler 84 | lr_schedule = "1cycle" 85 | anneal_strategy = "cos" 86 | warmup_steps = 400 87 | cooldown_steps = 400 88 | lr = 1.6e-4 89 | min_lr = 1.6e-5 90 | max_lr=6.4e-4 -------------------------------------------------------------------------------- /configs/opensora-v1-2/lambda/stage5.py: -------------------------------------------------------------------------------- 1 | # Dataset settings 2 | dataset = dict( 3 | type="VariableVideoTextDataset", 4 | transform_name="resize_crop", 5 | ) 6 | 7 | # webvid 8 | bucket_config = { # 12s/it 9 | "144p": {1: (1.0, 475), 51: (1.0, 51), 102: ((1.0, 0.33), 27), 204: ((1.0, 0.1), 13), 408: ((1.0, 0.1), 6)}, 10 | # --- 11 | "256": {1: (0.4, 297), 51: (0.5, 20), 102: ((0.5, 0.33), 10), 204: ((0.5, 1.0), 5), 408: ((0.5, 1.0), 2)}, 12 | "240p": {1: (0.3, 297), 51: (0.4, 20), 102: ((0.4, 0.33), 10), 204: ((0.4, 1.0), 5), 408: ((0.4, 1.0), 2)}, 13 | # --- 14 | "360p": {1: (0.5, 141), 51: (0.15, 8), 102: ((0.3, 
0.5), 4), 204: ((0.3, 1.0), 2), 408: ((0.5, 0.5), 1)}, 15 | "512": {1: (0.4, 141), 51: (0.15, 8), 102: ((0.2, 0.4), 4), 204: ((0.2, 1.0), 2), 408: ((0.4, 0.5), 1)}, 16 | # --- 17 | "480p": {1: (0.5, 89), 51: (0.2, 5), 102: (0.2, 2), 204: (0.1, 1)}, 18 | # --- 19 | "720p": {1: (0.1, 36), 51: (0.03, 1)}, 20 | "1024": {1: (0.1, 36), 51: (0.02, 1)}, 21 | # --- 22 | "1080p": {1: (0.01, 5)}, 23 | # --- 24 | "2048": {1: (0.01, 5)}, 25 | } 26 | 27 | grad_checkpoint = True 28 | 29 | # Acceleration settings 30 | num_workers = 8 31 | num_bucket_build_workers = 16 32 | #prefetch_factor = 2 33 | dtype = "bf16" 34 | plugin = "zero2" 35 | 36 | # Model settings 37 | model = dict( 38 | type="STDiT3-XL/2", 39 | from_pretrained=None, 40 | qk_norm=True, 41 | enable_flash_attn=True, 42 | enable_layernorm_kernel=True, 43 | freeze_y_embedder=True, 44 | ) 45 | vae = dict( 46 | type="OpenSoraVAE_V1_2", 47 | from_pretrained="hpcai-tech/OpenSora-VAE-v1.2", 48 | micro_frame_size=17, 49 | micro_batch_size=4, 50 | ) 51 | text_encoder = dict( 52 | type="t5", 53 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 54 | model_max_length=300, 55 | shardformer=True, 56 | ) 57 | scheduler = dict( 58 | type="rflow", 59 | use_timestep_transform=True, 60 | sample_method="logit-normal", 61 | ) 62 | 63 | # Mask settings 64 | mask_ratios = { 65 | "random": 0.0 66 | } 67 | 68 | # Log settings 69 | seed = 42 70 | outputs = "outputs_speedrun" 71 | wandb = True 72 | epochs = 10 73 | log_every = 10 74 | ckpt_every = 100 75 | 76 | # optimization settings 77 | load = None 78 | grad_clip = 1.0 79 | ema_decay = 0.99 80 | adam_eps = 1e-15 81 | weight_decay = 0.01 82 | 83 | # lr scheduler 84 | lr_schedule = "1cycle" 85 | anneal_strategy = "cos" 86 | warmup_steps = 400 87 | cooldown_steps = 400 88 | lr = 1.6e-4 89 | min_lr = 1.6e-5 90 | max_lr=6.4e-4 -------------------------------------------------------------------------------- /configs/opensora-v1-2/lambda/stage6.py: -------------------------------------------------------------------------------- 1 | # Dataset settings 2 | dataset = dict( 3 | type="VariableVideoTextDataset", 4 | transform_name="resize_crop", 5 | ) 6 | 7 | # webvid 8 | bucket_config = { # 20s/it 9 | "144p": {1: (1.0, 475), 51: (1.0, 51), 102: (1.0, 27), 204: (1.0, 13), 408: (1.0, 6)}, 10 | # --- 11 | "256": {1: (1.0, 297), 51: (0.5, 20), 102: (0.5, 10), 204: (0.5, 5), 408: ((0.5, 0.5), 2)}, 12 | "240p": {1: (1.0, 297), 51: (0.5, 20), 102: (0.5, 10), 204: (0.5, 5), 408: ((0.5, 0.4), 2)}, 13 | # --- 14 | "360p": {1: (1.0, 141), 51: (0.5, 8), 102: (0.5, 4), 204: (0.5, 2), 408: ((0.5, 0.3), 1)}, 15 | "512": {1: (1.0, 141), 51: (0.5, 8), 102: (0.5, 4), 204: (0.5, 2), 408: ((0.5, 0.2), 1)}, 16 | # --- 17 | "480p": {1: (1.0, 89), 51: (0.5, 5), 102: (0.5, 3), 204: ((0.5, 0.5), 1), 408: (0.0, None)}, 18 | # --- 19 | "720p": {1: (0.3, 36), 51: (0.2, 2), 102: (0.1, 1), 204: (0.0, None)}, 20 | "1024": {1: (0.3, 36), 51: (0.1, 2), 102: (0.1, 1), 204: (0.0, None)}, 21 | # --- 22 | "1080p": {1: (0.1, 5)}, 23 | # --- 24 | "2048": {1: (0.05, 5)}, 25 | } 26 | 27 | grad_checkpoint = True 28 | 29 | # Acceleration settings 30 | num_workers = 8 31 | num_bucket_build_workers = 16 32 | #prefetch_factor = 2 33 | dtype = "bf16" 34 | plugin = "zero2" 35 | 36 | # Model settings 37 | model = dict( 38 | type="STDiT3-XL/2", 39 | from_pretrained=None, 40 | qk_norm=True, 41 | enable_flash_attn=True, 42 | enable_layernorm_kernel=True, 43 | freeze_y_embedder=True, 44 | ) 45 | vae = dict( 46 | type="OpenSoraVAE_V1_2", 47 | 
from_pretrained="hpcai-tech/OpenSora-VAE-v1.2", 48 | micro_frame_size=17, 49 | micro_batch_size=4, 50 | ) 51 | text_encoder = dict( 52 | type="t5", 53 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 54 | model_max_length=300, 55 | shardformer=True, 56 | ) 57 | scheduler = dict( 58 | type="rflow", 59 | use_timestep_transform=True, 60 | sample_method="logit-normal", 61 | ) 62 | 63 | # Mask settings 64 | mask_ratios = { 65 | "random": 0.0 66 | } 67 | 68 | # Log settings 69 | seed = 42 70 | outputs = "outputs_speedrun" 71 | wandb = True 72 | epochs = 10 73 | log_every = 10 74 | ckpt_every = 100 75 | 76 | # optimization settings 77 | load = None 78 | grad_clip = 1.0 79 | ema_decay = 0.99 80 | adam_eps = 1e-15 81 | weight_decay = 0.01 82 | 83 | # lr scheduler 84 | lr_schedule = "1cycle" 85 | anneal_strategy = "cos" 86 | warmup_steps = 400 87 | cooldown_steps = 400 88 | lr = 0.8e-4 89 | min_lr = 1.6e-5 90 | max_lr=3.2e-4 -------------------------------------------------------------------------------- /configs/opensora-v1-2/misc/eval_loss.py: -------------------------------------------------------------------------------- 1 | num_workers = 8 2 | dtype = "bf16" 3 | seed = 42 4 | num_eval_timesteps = 10 5 | 6 | # Dataset settings 7 | dataset = dict( 8 | type="VariableVideoTextDataset", 9 | transform_name="resize_crop", 10 | ) 11 | 12 | bucket_config = { 13 | "144p": {1: (None, 100), 51: (None, 30), 102: (None, 20), 204: (None, 8), 408: (None, 4)}, 14 | # --- 15 | "240p": {1: (None, 100), 51: (None, 24), 102: (None, 12), 204: (None, 4), 408: (None, 2)}, 16 | # --- 17 | "360p": {1: (None, 60), 51: (None, 12), 102: (None, 6), 204: (None, 2), 408: (None, 1)}, 18 | # --- 19 | "480p": {1: (None, 40), 51: (None, 6), 102: (None, 3), 204: (None, 1)}, 20 | # --- 21 | "720p": {1: (None, 20), 51: (None, 2), 102: (None, 1)}, 22 | # --- 23 | "1080p": {1: (None, 10)}, 24 | # --- 25 | "2048": {1: (None, 5)}, 26 | } 27 | 28 | # Model settings 29 | model = dict( 30 | type="STDiT3-XL/2", 31 | from_pretrained=None, 32 | qk_norm=True, 33 | enable_flash_attn=True, 34 | enable_layernorm_kernel=True, 35 | ) 36 | vae = dict( 37 | type="OpenSoraVAE_V1_2", 38 | from_pretrained="hpcai-tech/OpenSora-VAE-v1.2", 39 | micro_frame_size=17, 40 | micro_batch_size=4, 41 | local_files_only=True, 42 | ) 43 | text_encoder = dict( 44 | type="t5", 45 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 46 | model_max_length=300, 47 | local_files_only=True, 48 | ) 49 | scheduler = dict(type="rflow") 50 | -------------------------------------------------------------------------------- /configs/opensora-v1-2/misc/extract.py: -------------------------------------------------------------------------------- 1 | # Dataset settings 2 | dataset = dict( 3 | type="VariableVideoTextDataset", 4 | transform_name="resize_crop", 5 | ) 6 | 7 | # webvid 8 | bucket_config = { # 12s/it 9 | "144p": {1: (1.0, 475), 51: (1.0, 51), 102: ((1.0, 0.33), 27), 204: ((1.0, 0.1), 13), 408: ((1.0, 0.1), 6)}, 10 | # --- 11 | "256": {1: (0.4, 297), 51: (0.5, 20), 102: ((0.5, 0.33), 10), 204: ((0.5, 0.1), 5), 408: ((0.5, 0.1), 2)}, 12 | "240p": {1: (0.3, 297), 51: (0.4, 20), 102: ((0.4, 0.33), 10), 204: ((0.4, 0.1), 5), 408: ((0.4, 0.1), 2)}, 13 | # --- 14 | "360p": {1: (0.2, 141), 51: (0.15, 8), 102: ((0.15, 0.33), 4), 204: ((0.15, 0.1), 2), 408: ((0.15, 0.1), 1)}, 15 | "512": {1: (0.1, 141)}, 16 | # --- 17 | "480p": {1: (0.1, 89)}, 18 | # --- 19 | "720p": {1: (0.05, 36)}, 20 | "1024": {1: (0.05, 36)}, 21 | # --- 22 | "1080p": {1: (0.1, 5)}, 23 | # --- 24 | "2048": {1: (0.1, 5)}, 25 | } 
26 | 27 | # Acceleration settings 28 | num_workers = 8 29 | num_bucket_build_workers = 16 30 | dtype = "bf16" 31 | seed = 42 32 | outputs = "outputs" 33 | wandb = False 34 | 35 | 36 | # Model settings 37 | model = dict( 38 | type="STDiT3-XL/2", 39 | from_pretrained="hpcai-tech/OpenSora-STDiT-v3", 40 | qk_norm=True, 41 | enable_flash_attn=True, 42 | enable_layernorm_kernel=True, 43 | ) 44 | vae = dict( 45 | type="OpenSoraVAE_V1_2", 46 | from_pretrained="hpcai-tech/OpenSora-VAE-v1.2", 47 | micro_frame_size=17, 48 | micro_batch_size=32, 49 | ) 50 | text_encoder = dict( 51 | type="t5", 52 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 53 | model_max_length=300, 54 | shardformer=True, 55 | local_files_only=True, 56 | ) 57 | 58 | # feature extraction settings 59 | save_text_features = True 60 | save_compressed_text_features = True 61 | bin_size = 250 # 1GB, 4195 bins 62 | log_time = False 63 | -------------------------------------------------------------------------------- /configs/opensora-v1-2/misc/feat.py: -------------------------------------------------------------------------------- 1 | # Dataset settings 2 | dataset = dict( 3 | type="VariableVideoTextDataset", 4 | transform_name="resize_crop", 5 | dummy_text_feature=True, 6 | ) 7 | 8 | # webvid 9 | bucket_config = { # 12s/it 10 | "144p": {1: (1.0, 475), 51: (1.0, 51), 102: ((1.0, 0.33), 27), 204: ((1.0, 0.1), 13), 408: ((1.0, 0.1), 6)}, 11 | # --- 12 | "256": {1: (0.4, 297), 51: (0.5, 20), 102: ((0.5, 0.33), 10), 204: ((0.5, 0.1), 5), 408: ((0.5, 0.1), 2)}, 13 | "240p": {1: (0.3, 297), 51: (0.4, 20), 102: ((0.4, 0.33), 10), 204: ((0.4, 0.1), 5), 408: ((0.4, 0.1), 2)}, 14 | # --- 15 | "360p": {1: (0.2, 141), 51: (0.15, 8), 102: ((0.15, 0.33), 4), 204: ((0.15, 0.1), 2), 408: ((0.15, 0.1), 1)}, 16 | "512": {1: (0.1, 141)}, 17 | # --- 18 | "480p": {1: (0.1, 89)}, 19 | # --- 20 | "720p": {1: (0.05, 36)}, 21 | "1024": {1: (0.05, 36)}, 22 | # --- 23 | "1080p": {1: (0.1, 5)}, 24 | # --- 25 | "2048": {1: (0.1, 5)}, 26 | } 27 | 28 | grad_checkpoint = True 29 | 30 | load_text_features = True 31 | 32 | # Acceleration settings 33 | num_workers = 0 34 | num_bucket_build_workers = 16 35 | dtype = "bf16" 36 | plugin = "zero2" 37 | 38 | # Model settings 39 | model = dict( 40 | type="STDiT3-XL/2", 41 | from_pretrained=None, 42 | qk_norm=True, 43 | enable_flash_attn=True, 44 | enable_layernorm_kernel=True, 45 | freeze_y_embedder=True, 46 | skip_y_embedder=True, 47 | ) 48 | vae = dict( 49 | type="OpenSoraVAE_V1_2", 50 | from_pretrained="hpcai-tech/OpenSora-VAE-v1.2", 51 | micro_frame_size=17, 52 | micro_batch_size=4, 53 | ) 54 | text_encoder = dict( 55 | type="t5", 56 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 57 | model_max_length=300, 58 | shardformer=True, 59 | local_files_only=True, 60 | ) 61 | scheduler = dict( 62 | type="rflow", 63 | use_timestep_transform=True, 64 | sample_method="logit-normal", 65 | ) 66 | 67 | # Mask settings 68 | mask_ratios = { 69 | "random": 0.2, 70 | "intepolate": 0.01, 71 | "quarter_random": 0.01, 72 | "quarter_head": 0.01, 73 | "quarter_tail": 0.01, 74 | "quarter_head_tail": 0.01, 75 | "image_random": 0.05, 76 | "image_head": 0.1, 77 | "image_tail": 0.05, 78 | "image_head_tail": 0.05, 79 | } 80 | 81 | # Log settings 82 | seed = 42 83 | outputs = "outputs" 84 | wandb = False 85 | epochs = 1000 86 | log_every = 10 87 | ckpt_every = 1 88 | 89 | # optimization settings 90 | load = None 91 | grad_clip = 1.0 92 | lr = 2e-4 93 | ema_decay = 0.99 94 | adam_eps = 1e-15 95 | 
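The opensora-v1-2 configs above are ordinary Python modules: bucket_config maps a resolution name to {num_frames: (probability, batch_size)} entries (a probability of None or 0.0 with a None batch size effectively disables a bucket, and probabilities written as tuples carry an extra weight whose exact meaning is defined by the training code), while mask_ratios assigns a sampling probability to each masking strategy. Because they are plain Python, they can be sanity-checked offline. The sketch below is an illustration only and is not part of the repository: it loads a config with the standard-library runpy module (the training entry points use their own config loader) and verifies that the mask probabilities sum to at most 1 and that every bucket probability lies in [0, 1].

# check_config.py -- minimal sketch, not part of the repository.
# Assumes the dict-style config format shown above,
# e.g. configs/opensora-v1-2/train/stage2.py.
import runpy
import sys


def check(path: str) -> None:
    cfg = runpy.run_path(path)  # execute the config file and collect its globals

    # mask_ratios are per-strategy sampling probabilities; they should not exceed 1.
    mask_total = sum(cfg.get("mask_ratios", {}).values())
    assert 0.0 <= mask_total <= 1.0, f"mask_ratios sum to {mask_total}"

    # bucket_config: resolution -> {num_frames: (probability, batch_size)};
    # the probability may be None, a float, or a tuple of floats in these configs.
    for res, frames in cfg.get("bucket_config", {}).items():
        for num_frames, (prob, batch_size) in frames.items():
            probs = prob if isinstance(prob, tuple) else (prob,)
            assert all(p is None or 0.0 <= p <= 1.0 for p in probs), (res, num_frames, prob)
            assert batch_size is None or batch_size >= 1, (res, num_frames, batch_size)

    print(f"{path}: mask_ratios total {mask_total:.3f}, "
          f"{len(cfg.get('bucket_config', {}))} resolution buckets")


if __name__ == "__main__":
    check(sys.argv[1])

Example usage (the script name is hypothetical): python check_config.py configs/opensora-v1-2/train/stage2.py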
-------------------------------------------------------------------------------- /configs/opensora-v1-2/train/adapt.py: -------------------------------------------------------------------------------- 1 | # Dataset settings 2 | dataset = dict( 3 | type="VariableVideoTextDataset", 4 | transform_name="resize_crop", 5 | ) 6 | bucket_config = { # 2s/it 7 | "144p": {1: (0.5, 48), 34: (1.0, 2), 51: (1.0, 4), 102: (1.0, 2), 204: (1.0, 1)}, 8 | # --- 9 | "256": {1: (0.6, 20), 34: (0.5, 2), 51: (0.5, 1), 68: (0.5, 1), 136: (0.0, None)}, 10 | "240p": {1: (0.6, 20), 34: (0.5, 2), 51: (0.5, 1), 68: (0.5, 1), 136: (0.0, None)}, 11 | # --- 12 | "360p": {1: (0.5, 8), 34: (0.2, 1), 102: (0.0, None)}, 13 | "512": {1: (0.5, 8), 34: (0.2, 1), 102: (0.0, None)}, 14 | # --- 15 | "480p": {1: (0.2, 4), 17: (0.3, 1), 68: (0.0, None)}, 16 | # --- 17 | "720p": {1: (0.1, 2)}, 18 | "1024": {1: (0.1, 2)}, 19 | # --- 20 | "1080p": {1: (0.1, 1)}, 21 | } 22 | grad_checkpoint = False 23 | 24 | # Acceleration settings 25 | num_workers = 8 26 | num_bucket_build_workers = 16 27 | dtype = "bf16" 28 | plugin = "zero2" 29 | 30 | # Model settings 31 | model = dict( 32 | type="STDiT3-XL/2", 33 | from_pretrained=None, 34 | qk_norm=True, 35 | enable_flash_attn=True, 36 | enable_layernorm_kernel=True, 37 | ) 38 | vae = dict( 39 | type="OpenSoraVAE_V1_2", 40 | from_pretrained="hpcai-tech/OpenSora-VAE-v1.2", 41 | micro_frame_size=17, 42 | micro_batch_size=4, 43 | ) 44 | text_encoder = dict( 45 | type="t5", 46 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 47 | model_max_length=300, 48 | shardformer=True, 49 | ) 50 | scheduler = dict( 51 | type="rflow", 52 | use_timestep_transform=True, 53 | sample_method="logit-normal", 54 | ) 55 | 56 | # Mask settings 57 | mask_ratios = { 58 | "random": 0.2, 59 | "intepolate": 0.01, 60 | "quarter_random": 0.01, 61 | "quarter_head": 0.01, 62 | "quarter_tail": 0.01, 63 | "quarter_head_tail": 0.01, 64 | "image_random": 0.05, 65 | "image_head": 0.1, 66 | "image_tail": 0.05, 67 | "image_head_tail": 0.05, 68 | } 69 | 70 | # Log settings 71 | seed = 42 72 | outputs = "outputs" 73 | wandb = False 74 | epochs = 1000 75 | log_every = 10 76 | ckpt_every = 500 77 | 78 | # optimization settings 79 | load = None 80 | grad_clip = 1.0 81 | lr = 1e-4 82 | ema_decay = 0.99 83 | adam_eps = 1e-15 84 | -------------------------------------------------------------------------------- /configs/opensora-v1-2/train/demo_360p.py: -------------------------------------------------------------------------------- 1 | # Dataset settings 2 | dataset = dict( 3 | type="VariableVideoTextDataset", 4 | transform_name="resize_crop", 5 | ) 6 | 7 | # webvid 8 | bucket_config = {"360p": {102: (1.0, 1)}} 9 | grad_checkpoint = True 10 | 11 | # Acceleration settings 12 | num_workers = 8 13 | num_bucket_build_workers = 16 14 | dtype = "bf16" 15 | plugin = "zero2" 16 | 17 | # Model settings 18 | model = dict( 19 | type="STDiT3-XL/2", 20 | from_pretrained=None, 21 | qk_norm=True, 22 | enable_flash_attn=True, 23 | enable_layernorm_kernel=True, 24 | freeze_y_embedder=True, 25 | ) 26 | vae = dict( 27 | type="OpenSoraVAE_V1_2", 28 | from_pretrained="hpcai-tech/OpenSora-VAE-v1.2", 29 | micro_frame_size=17, 30 | micro_batch_size=4, 31 | ) 32 | text_encoder = dict( 33 | type="t5", 34 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 35 | model_max_length=300, 36 | shardformer=True, 37 | ) 38 | scheduler = dict( 39 | type="rflow", 40 | use_timestep_transform=True, 41 | sample_method="logit-normal", 42 | ) 43 | 44 | # Log settings 45 | seed = 42 46 | outputs = 
"outputs" 47 | wandb = False 48 | epochs = 1000 49 | log_every = 10 50 | ckpt_every = 200 51 | 52 | # optimization settings 53 | load = None 54 | grad_clip = 1.0 55 | lr = 1e-4 56 | ema_decay = 0.99 57 | adam_eps = 1e-15 58 | warmup_steps = 1000 59 | -------------------------------------------------------------------------------- /configs/opensora-v1-2/train/demo_480p.py: -------------------------------------------------------------------------------- 1 | # Dataset settings 2 | dataset = dict( 3 | type="VariableVideoTextDataset", 4 | transform_name="resize_crop", 5 | ) 6 | 7 | # webvid 8 | bucket_config = {"480p": {51: (0.5, 5)}} 9 | grad_checkpoint = True 10 | 11 | # Acceleration settings 12 | num_workers = 8 13 | num_bucket_build_workers = 16 14 | dtype = "bf16" 15 | plugin = "zero2" 16 | 17 | # Model settings 18 | model = dict( 19 | type="STDiT3-XL/2", 20 | from_pretrained=None, 21 | qk_norm=True, 22 | enable_flash_attn=True, 23 | enable_layernorm_kernel=True, 24 | freeze_y_embedder=True, 25 | ) 26 | vae = dict( 27 | type="OpenSoraVAE_V1_2", 28 | from_pretrained="hpcai-tech/OpenSora-VAE-v1.2", 29 | micro_frame_size=17, 30 | micro_batch_size=4, 31 | ) 32 | text_encoder = dict( 33 | type="t5", 34 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 35 | model_max_length=300, 36 | shardformer=True, 37 | ) 38 | scheduler = dict( 39 | type="rflow", 40 | use_timestep_transform=True, 41 | sample_method="logit-normal", 42 | ) 43 | 44 | # Log settings 45 | seed = 42 46 | outputs = "outputs" 47 | wandb = False 48 | epochs = 1000 49 | log_every = 10 50 | ckpt_every = 200 51 | 52 | # optimization settings 53 | load = None 54 | grad_clip = 1.0 55 | lr = 1e-4 56 | ema_decay = 0.99 57 | adam_eps = 1e-15 58 | warmup_steps = 1000 59 | -------------------------------------------------------------------------------- /configs/opensora-v1-2/train/stage1_feat.py: -------------------------------------------------------------------------------- 1 | # Dataset settings 2 | dataset = dict(type="BatchFeatureDataset") 3 | grad_checkpoint = True 4 | num_workers = 4 5 | 6 | # Acceleration settings 7 | dtype = "bf16" 8 | plugin = "zero2" 9 | 10 | # Model settings 11 | model = dict( 12 | type="STDiT3-XL/2", 13 | from_pretrained=None, 14 | qk_norm=True, 15 | enable_flash_attn=True, 16 | enable_layernorm_kernel=True, 17 | freeze_y_embedder=True, 18 | skip_y_embedder=True, 19 | ) 20 | scheduler = dict( 21 | type="rflow", 22 | use_timestep_transform=True, 23 | sample_method="logit-normal", 24 | ) 25 | 26 | vae_out_channels = 4 27 | model_max_length = 300 28 | text_encoder_output_dim = 4096 29 | load_video_features = True 30 | load_text_features = True 31 | 32 | # Mask settings 33 | mask_ratios = { 34 | "random": 0.2, 35 | "intepolate": 0.01, 36 | "quarter_random": 0.01, 37 | "quarter_head": 0.01, 38 | "quarter_tail": 0.01, 39 | "quarter_head_tail": 0.01, 40 | "image_random": 0.05, 41 | "image_head": 0.1, 42 | "image_tail": 0.05, 43 | "image_head_tail": 0.05, 44 | } 45 | 46 | # Log settings 47 | seed = 42 48 | outputs = "outputs" 49 | wandb = False 50 | epochs = 1000 51 | log_every = 10 52 | ckpt_every = 500 53 | 54 | # optimization settings 55 | load = None 56 | grad_clip = 1.0 57 | lr = 2e-4 58 | ema_decay = 0.99 59 | adam_eps = 1e-15 60 | -------------------------------------------------------------------------------- /configs/opensora-v1-2/train/stage2.py: -------------------------------------------------------------------------------- 1 | # Dataset settings 2 | dataset = dict( 3 | type="VariableVideoTextDataset", 4 | 
transform_name="resize_crop", 5 | ) 6 | 7 | # webvid 8 | bucket_config = { # 12s/it 9 | "144p": {1: (1.0, 475), 51: (1.0, 51), 102: ((1.0, 0.33), 27), 204: ((1.0, 0.1), 13), 408: ((1.0, 0.1), 6)}, 10 | # --- 11 | "256": {1: (0.4, 297), 51: (0.5, 20), 102: ((0.5, 0.33), 10), 204: ((0.5, 1.0), 5), 408: ((0.5, 1.0), 2)}, 12 | "240p": {1: (0.3, 297), 51: (0.4, 20), 102: ((0.4, 0.33), 10), 204: ((0.4, 1.0), 5), 408: ((0.4, 1.0), 2)}, 13 | # --- 14 | "360p": {1: (0.5, 141), 51: (0.15, 8), 102: ((0.3, 0.5), 4), 204: ((0.3, 1.0), 2), 408: ((0.5, 0.5), 1)}, 15 | "512": {1: (0.4, 141), 51: (0.15, 8), 102: ((0.2, 0.4), 4), 204: ((0.2, 1.0), 2), 408: ((0.4, 0.5), 1)}, 16 | # --- 17 | "480p": {1: (0.5, 89), 51: (0.2, 5), 102: (0.2, 2), 204: (0.1, 1)}, 18 | # --- 19 | "720p": {1: (0.1, 36), 51: (0.03, 1)}, 20 | "1024": {1: (0.1, 36), 51: (0.02, 1)}, 21 | # --- 22 | "1080p": {1: (0.01, 5)}, 23 | # --- 24 | "2048": {1: (0.01, 5)}, 25 | } 26 | 27 | grad_checkpoint = True 28 | 29 | # Acceleration settings 30 | num_workers = 8 31 | num_bucket_build_workers = 16 32 | dtype = "bf16" 33 | plugin = "zero2" 34 | 35 | # Model settings 36 | model = dict( 37 | type="STDiT3-XL/2", 38 | from_pretrained=None, 39 | qk_norm=True, 40 | enable_flash_attn=True, 41 | enable_layernorm_kernel=True, 42 | freeze_y_embedder=True, 43 | ) 44 | vae = dict( 45 | type="OpenSoraVAE_V1_2", 46 | from_pretrained="hpcai-tech/OpenSora-VAE-v1.2", 47 | micro_frame_size=17, 48 | micro_batch_size=4, 49 | ) 50 | text_encoder = dict( 51 | type="t5", 52 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 53 | model_max_length=300, 54 | shardformer=True, 55 | ) 56 | scheduler = dict( 57 | type="rflow", 58 | use_timestep_transform=True, 59 | sample_method="logit-normal", 60 | ) 61 | 62 | # Mask settings 63 | # 25% 64 | mask_ratios = { 65 | "random": 0.005, 66 | "intepolate": 0.002, 67 | "quarter_random": 0.007, 68 | "quarter_head": 0.002, 69 | "quarter_tail": 0.002, 70 | "quarter_head_tail": 0.002, 71 | "image_random": 0.0, 72 | "image_head": 0.22, 73 | "image_tail": 0.005, 74 | "image_head_tail": 0.005, 75 | } 76 | 77 | 78 | # Log settings 79 | seed = 42 80 | outputs = "outputs" 81 | wandb = False 82 | epochs = 1000 83 | log_every = 10 84 | ckpt_every = 200 85 | 86 | # optimization settings 87 | load = None 88 | grad_clip = 1.0 89 | lr = 1e-4 90 | ema_decay = 0.99 91 | adam_eps = 1e-15 92 | -------------------------------------------------------------------------------- /configs/opensora-v1-2/train/stage3.py: -------------------------------------------------------------------------------- 1 | # Dataset settings 2 | dataset = dict( 3 | type="VariableVideoTextDataset", 4 | transform_name="resize_crop", 5 | ) 6 | 7 | # webvid 8 | bucket_config = { # 20s/it 9 | "144p": {1: (1.0, 475), 51: (1.0, 51), 102: (1.0, 27), 204: (1.0, 13), 408: (1.0, 6)}, 10 | # --- 11 | "256": {1: (1.0, 297), 51: (0.5, 20), 102: (0.5, 10), 204: (0.5, 5), 408: ((0.5, 0.5), 2)}, 12 | "240p": {1: (1.0, 297), 51: (0.5, 20), 102: (0.5, 10), 204: (0.5, 5), 408: ((0.5, 0.4), 2)}, 13 | # --- 14 | "360p": {1: (1.0, 141), 51: (0.5, 8), 102: (0.5, 4), 204: (0.5, 2), 408: ((0.5, 0.3), 1)}, 15 | "512": {1: (1.0, 141), 51: (0.5, 8), 102: (0.5, 4), 204: (0.5, 2), 408: ((0.5, 0.2), 1)}, 16 | # --- 17 | "480p": {1: (1.0, 89), 51: (0.5, 5), 102: (0.5, 3), 204: ((0.5, 0.5), 1), 408: (0.0, None)}, 18 | # --- 19 | "720p": {1: (0.3, 36), 51: (0.2, 2), 102: (0.1, 1), 204: (0.0, None)}, 20 | "1024": {1: (0.3, 36), 51: (0.1, 2), 102: (0.1, 1), 204: (0.0, None)}, 21 | # --- 22 | "1080p": {1: (0.1, 5)}, 23 | # 
--- 24 | "2048": {1: (0.05, 5)}, 25 | } 26 | 27 | grad_checkpoint = True 28 | 29 | # Acceleration settings 30 | num_workers = 8 31 | num_bucket_build_workers = 16 32 | dtype = "bf16" 33 | plugin = "zero2" 34 | 35 | # Model settings 36 | model = dict( 37 | type="STDiT3-XL/2", 38 | from_pretrained=None, 39 | qk_norm=True, 40 | enable_flash_attn=True, 41 | enable_layernorm_kernel=True, 42 | freeze_y_embedder=True, 43 | ) 44 | vae = dict( 45 | type="OpenSoraVAE_V1_2", 46 | from_pretrained="hpcai-tech/OpenSora-VAE-v1.2", 47 | micro_frame_size=17, 48 | micro_batch_size=4, 49 | ) 50 | text_encoder = dict( 51 | type="t5", 52 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 53 | model_max_length=300, 54 | shardformer=True, 55 | ) 56 | scheduler = dict( 57 | type="rflow", 58 | use_timestep_transform=True, 59 | sample_method="logit-normal", 60 | ) 61 | 62 | # Mask settings 63 | # 25% 64 | mask_ratios = { 65 | "random": 0.01, 66 | "intepolate": 0.002, 67 | "quarter_random": 0.002, 68 | "quarter_head": 0.002, 69 | "quarter_tail": 0.002, 70 | "quarter_head_tail": 0.002, 71 | "image_random": 0.0, 72 | "image_head": 0.22, 73 | "image_tail": 0.005, 74 | "image_head_tail": 0.005, 75 | } 76 | 77 | # Log settings 78 | seed = 42 79 | outputs = "outputs" 80 | wandb = False 81 | epochs = 1000 82 | log_every = 10 83 | ckpt_every = 200 84 | 85 | # optimization settings 86 | load = None 87 | grad_clip = 1.0 88 | lr = 1e-4 89 | ema_decay = 0.99 90 | adam_eps = 1e-15 91 | warmup_steps = 1000 92 | -------------------------------------------------------------------------------- /configs/opensora/inference/16x256x256.py: -------------------------------------------------------------------------------- 1 | num_frames = 16 2 | fps = 24 // 3 3 | image_size = (256, 256) 4 | 5 | # Define model 6 | model = dict( 7 | type="STDiT-XL/2", 8 | space_scale=0.5, 9 | time_scale=1.0, 10 | enable_flash_attn=True, 11 | enable_layernorm_kernel=True, 12 | from_pretrained="PRETRAINED_MODEL", 13 | ) 14 | vae = dict( 15 | type="VideoAutoencoderKL", 16 | from_pretrained="stabilityai/sd-vae-ft-ema", 17 | micro_batch_size=4, 18 | ) 19 | text_encoder = dict( 20 | type="t5", 21 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 22 | model_max_length=120, 23 | ) 24 | scheduler = dict( 25 | type="iddpm", 26 | num_sampling_steps=100, 27 | cfg_scale=7.0, 28 | cfg_channel=3, # or None 29 | ) 30 | dtype = "bf16" 31 | 32 | # Condition 33 | prompt_path = "./assets/texts/t2v_samples.txt" 34 | prompt = None # prompt has higher priority than prompt_path 35 | 36 | # Others 37 | batch_size = 1 38 | seed = 42 39 | save_dir = "./samples/samples/" 40 | -------------------------------------------------------------------------------- /configs/opensora/inference/16x512x512-rflow.py: -------------------------------------------------------------------------------- 1 | num_frames = 16 2 | fps = 24 // 3 3 | image_size = (512, 512) 4 | 5 | # Define model 6 | model = dict( 7 | type="STDiT-XL/2", 8 | space_scale=1.0, 9 | time_scale=1.0, 10 | enable_flash_attn=True, 11 | enable_layernorm_kernel=True, 12 | from_pretrained="PRETRAINED_MODEL", 13 | ) 14 | vae = dict( 15 | type="VideoAutoencoderKL", 16 | from_pretrained="stabilityai/sd-vae-ft-ema", 17 | micro_batch_size=2, 18 | ) 19 | text_encoder = dict( 20 | type="t5", 21 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 22 | model_max_length=120, 23 | ) 24 | scheduler = dict( 25 | type="rflow", 26 | num_sampling_steps=10, 27 | cfg_scale=7.0, 28 | ) 29 | dtype = "bf16" 30 | 31 | # Others 32 | batch_size = 2 33 | seed = 42 34 | prompt_path = 
"./assets/texts/t2v_samples.txt" 35 | save_dir = "./outputs/samples/" 36 | -------------------------------------------------------------------------------- /configs/opensora/inference/16x512x512.py: -------------------------------------------------------------------------------- 1 | num_frames = 16 2 | fps = 24 // 3 3 | image_size = (512, 512) 4 | 5 | # Define model 6 | model = dict( 7 | type="STDiT-XL/2", 8 | space_scale=1.0, 9 | time_scale=1.0, 10 | enable_flash_attn=True, 11 | enable_layernorm_kernel=True, 12 | from_pretrained="PRETRAINED_MODEL", 13 | ) 14 | vae = dict( 15 | type="VideoAutoencoderKL", 16 | from_pretrained="stabilityai/sd-vae-ft-ema", 17 | micro_batch_size=2, 18 | ) 19 | text_encoder = dict( 20 | type="t5", 21 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 22 | model_max_length=120, 23 | ) 24 | scheduler = dict( 25 | type="iddpm", 26 | num_sampling_steps=100, 27 | cfg_scale=7.0, 28 | ) 29 | dtype = "bf16" 30 | 31 | # Others 32 | batch_size = 2 33 | seed = 42 34 | prompt_path = "./assets/texts/t2v_samples.txt" 35 | save_dir = "./samples/samples/" 36 | -------------------------------------------------------------------------------- /configs/opensora/inference/64x512x512.py: -------------------------------------------------------------------------------- 1 | num_frames = 64 2 | fps = 24 // 2 3 | image_size = (512, 512) 4 | 5 | # Define model 6 | model = dict( 7 | type="STDiT-XL/2", 8 | space_scale=1.0, 9 | time_scale=2 / 3, 10 | enable_flash_attn=True, 11 | enable_layernorm_kernel=True, 12 | from_pretrained="PRETRAINED_MODEL", 13 | ) 14 | vae = dict( 15 | type="VideoAutoencoderKL", 16 | from_pretrained="stabilityai/sd-vae-ft-ema", 17 | micro_batch_size=128, 18 | ) 19 | text_encoder = dict( 20 | type="t5", 21 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 22 | model_max_length=120, 23 | ) 24 | scheduler = dict( 25 | type="iddpm", 26 | num_sampling_steps=100, 27 | cfg_scale=7.0, 28 | ) 29 | dtype = "bf16" 30 | 31 | # Others 32 | batch_size = 1 33 | seed = 42 34 | prompt_path = "./assets/texts/t2v_samples.txt" 35 | save_dir = "./samples/samples/" 36 | -------------------------------------------------------------------------------- /configs/opensora/train/16x256x256-mask.py: -------------------------------------------------------------------------------- 1 | # Define dataset 2 | dataset = dict( 3 | type="VideoTextDataset", 4 | data_path=None, 5 | num_frames=16, 6 | frame_interval=3, 7 | image_size=(256, 256), 8 | ) 9 | 10 | # Define acceleration 11 | num_workers = 4 12 | dtype = "bf16" 13 | grad_checkpoint = True 14 | plugin = "zero2" 15 | sp_size = 1 16 | 17 | # Define model 18 | model = dict( 19 | type="STDiT-XL/2", 20 | space_scale=0.5, 21 | time_scale=1.0, 22 | from_pretrained="PixArt-XL-2-512x512.pth", 23 | enable_flash_attn=True, 24 | enable_layernorm_kernel=True, 25 | ) 26 | mask_ratios = { 27 | "identity": 0.7, 28 | "random": 0.15, 29 | "mask_head": 0.05, 30 | "mask_tail": 0.05, 31 | "mask_head_tail": 0.05, 32 | } 33 | vae = dict( 34 | type="VideoAutoencoderKL", 35 | from_pretrained="stabilityai/sd-vae-ft-ema", 36 | ) 37 | text_encoder = dict( 38 | type="t5", 39 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 40 | model_max_length=120, 41 | shardformer=True, 42 | ) 43 | scheduler = dict( 44 | type="iddpm", 45 | timestep_respacing="", 46 | ) 47 | 48 | # Others 49 | seed = 42 50 | outputs = "outputs" 51 | wandb = False 52 | 53 | epochs = 1000 54 | log_every = 10 55 | ckpt_every = 1000 56 | load = None 57 | 58 | batch_size = 8 59 | lr = 2e-5 60 | grad_clip = 1.0 61 | 
-------------------------------------------------------------------------------- /configs/opensora/train/16x256x256-spee-rflow.py: -------------------------------------------------------------------------------- 1 | # Define dataset 2 | dataset = dict( 3 | type="VideoTextDataset", 4 | data_path=None, 5 | num_frames=16, 6 | frame_interval=3, 7 | image_size=(256, 256), 8 | ) 9 | 10 | # Define acceleration 11 | num_workers = 4 12 | dtype = "bf16" 13 | grad_checkpoint = True 14 | plugin = "zero2" 15 | sp_size = 1 16 | 17 | # Define model 18 | model = dict( 19 | type="STDiT-XL/2", 20 | space_scale=0.5, 21 | time_scale=1.0, 22 | # from_pretrained="PixArt-XL-2-512x512.pth", 23 | # from_pretrained = "/home/zhaowangbo/wangbo/PixArt-alpha/pretrained_models/OpenSora-v1-HQ-16x512x512.pth", 24 | # from_pretrained = "OpenSora-v1-HQ-16x512x512.pth", 25 | from_pretrained="PRETRAINED_MODEL", 26 | enable_flash_attn=True, 27 | enable_layernorm_kernel=True, 28 | ) 29 | # mask_ratios = [0.5, 0.29, 0.07, 0.07, 0.07] 30 | # mask_ratios = { 31 | # "identity": 0.9, 32 | # "random": 0.06, 33 | # "mask_head": 0.01, 34 | # "mask_tail": 0.01, 35 | # "mask_head_tail": 0.02, 36 | # } 37 | vae = dict( 38 | type="VideoAutoencoderKL", 39 | from_pretrained="stabilityai/sd-vae-ft-ema", 40 | ) 41 | text_encoder = dict( 42 | type="t5", 43 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 44 | model_max_length=120, 45 | shardformer=True, 46 | ) 47 | scheduler = dict( 48 | type="rflow", 49 | # timestep_respacing="", 50 | ) 51 | 52 | # Others 53 | seed = 42 54 | outputs = "outputs" 55 | wandb = True 56 | 57 | epochs = 1 58 | log_every = 10 59 | ckpt_every = 1000 60 | load = None 61 | 62 | batch_size = 16 63 | lr = 2e-5 64 | grad_clip = 1.0 65 | -------------------------------------------------------------------------------- /configs/opensora/train/16x256x256-spee.py: -------------------------------------------------------------------------------- 1 | # Define dataset 2 | dataset = dict( 3 | type="VideoTextDataset", 4 | data_path=None, 5 | num_frames=16, 6 | frame_interval=3, 7 | image_size=(256, 256), 8 | ) 9 | 10 | # Define acceleration 11 | num_workers = 4 12 | dtype = "bf16" 13 | grad_checkpoint = True 14 | plugin = "zero2" 15 | sp_size = 1 16 | 17 | # Define model 18 | model = dict( 19 | type="STDiT-XL/2", 20 | space_scale=0.5, 21 | time_scale=1.0, 22 | from_pretrained="PixArt-XL-2-512x512.pth", 23 | enable_flash_attn=True, 24 | enable_layernorm_kernel=True, 25 | ) 26 | mask_ratios = { 27 | "identity": 0.5, 28 | "random": 0.29, 29 | "mask_head": 0.07, 30 | "mask_tail": 0.07, 31 | "mask_head_tail": 0.07, 32 | } 33 | vae = dict( 34 | type="VideoAutoencoderKL", 35 | from_pretrained="stabilityai/sd-vae-ft-ema", 36 | ) 37 | text_encoder = dict( 38 | type="t5", 39 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 40 | model_max_length=120, 41 | shardformer=True, 42 | ) 43 | scheduler = dict( 44 | type="iddpm-speed", 45 | timestep_respacing="", 46 | ) 47 | 48 | # Others 49 | seed = 42 50 | outputs = "outputs" 51 | wandb = False 52 | 53 | epochs = 1000 54 | log_every = 10 55 | ckpt_every = 1000 56 | load = None 57 | 58 | batch_size = 8 59 | lr = 2e-5 60 | grad_clip = 1.0 61 | -------------------------------------------------------------------------------- /configs/opensora/train/16x256x256.py: -------------------------------------------------------------------------------- 1 | # Define dataset 2 | dataset = dict( 3 | type="VideoTextDataset", 4 | data_path=None, 5 | num_frames=16, 6 | frame_interval=3, 7 | image_size=(256, 256), 8 | ) 9 | 10 | # 
Define acceleration 11 | num_workers = 0 12 | dtype = "bf16" 13 | grad_checkpoint = True 14 | plugin = "zero2" 15 | sp_size = 1 16 | 17 | # Define model 18 | model = dict( 19 | type="STDiT-XL/2", 20 | space_scale=0.5, 21 | time_scale=1.0, 22 | from_pretrained="PixArt-XL-2-512x512.pth", 23 | enable_flash_attn=True, 24 | enable_layernorm_kernel=True, 25 | ) 26 | vae = dict( 27 | type="VideoAutoencoderKL", 28 | from_pretrained="stabilityai/sd-vae-ft-ema", 29 | ) 30 | text_encoder = dict( 31 | type="t5", 32 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 33 | model_max_length=120, 34 | shardformer=True, 35 | ) 36 | scheduler = dict( 37 | type="iddpm", 38 | timestep_respacing="", 39 | ) 40 | 41 | # Others 42 | seed = 42 43 | outputs = "outputs" 44 | wandb = False 45 | 46 | epochs = 1000 47 | log_every = 10 48 | ckpt_every = 1000 49 | load = None 50 | 51 | batch_size = 8 52 | lr = 2e-5 53 | grad_clip = 1.0 54 | -------------------------------------------------------------------------------- /configs/opensora/train/16x512x512.py: -------------------------------------------------------------------------------- 1 | # Define dataset 2 | dataset = dict( 3 | type="VideoTextDataset", 4 | data_path=None, 5 | num_frames=16, 6 | frame_interval=3, 7 | image_size=(512, 512), 8 | ) 9 | 10 | # Define acceleration 11 | num_workers = 4 12 | dtype = "bf16" 13 | grad_checkpoint = True 14 | plugin = "zero2" 15 | sp_size = 1 16 | 17 | # Define model 18 | model = dict( 19 | type="STDiT-XL/2", 20 | space_scale=1.0, 21 | time_scale=1.0, 22 | from_pretrained=None, 23 | enable_flash_attn=True, 24 | enable_layernorm_kernel=True, 25 | ) 26 | vae = dict( 27 | type="VideoAutoencoderKL", 28 | from_pretrained="stabilityai/sd-vae-ft-ema", 29 | micro_batch_size=128, 30 | ) 31 | text_encoder = dict( 32 | type="t5", 33 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 34 | model_max_length=120, 35 | shardformer=True, 36 | ) 37 | scheduler = dict( 38 | type="iddpm", 39 | timestep_respacing="", 40 | ) 41 | 42 | # Others 43 | seed = 42 44 | outputs = "outputs" 45 | wandb = False 46 | 47 | epochs = 1000 48 | log_every = 10 49 | ckpt_every = 500 50 | load = None 51 | 52 | batch_size = 8 53 | lr = 2e-5 54 | grad_clip = 1.0 55 | -------------------------------------------------------------------------------- /configs/opensora/train/360x512x512.py: -------------------------------------------------------------------------------- 1 | # Define dataset 2 | dataset = dict( 3 | type="VideoTextDataset", 4 | data_path=None, 5 | num_frames=360, 6 | frame_interval=3, 7 | image_size=(512, 512), 8 | ) 9 | 10 | # Define acceleration 11 | num_workers = 4 12 | dtype = "bf16" 13 | grad_checkpoint = True 14 | plugin = "zero2" 15 | sp_size = 1 16 | 17 | # Define acceleration 18 | dtype = "bf16" 19 | grad_checkpoint = True 20 | plugin = "zero2-seq" 21 | sp_size = 2 22 | 23 | # Define model 24 | model = dict( 25 | type="STDiT-XL/2", 26 | space_scale=1.0, 27 | time_scale=2 / 3, 28 | from_pretrained=None, 29 | enable_flash_attn=True, 30 | enable_layernorm_kernel=True, 31 | enable_sequence_parallelism=True, # enable sq here 32 | ) 33 | vae = dict( 34 | type="VideoAutoencoderKL", 35 | from_pretrained="stabilityai/sd-vae-ft-ema", 36 | micro_batch_size=128, 37 | ) 38 | text_encoder = dict( 39 | type="t5", 40 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 41 | model_max_length=120, 42 | shardformer=True, 43 | ) 44 | scheduler = dict( 45 | type="iddpm", 46 | timestep_respacing="", 47 | ) 48 | 49 | # Others 50 | seed = 42 51 | outputs = "outputs" 52 | wandb = False 53 | 54 | epochs 
= 1000 55 | log_every = 10 56 | ckpt_every = 250 57 | load = None 58 | 59 | batch_size = 1 60 | lr = 2e-5 61 | grad_clip = 1.0 62 | -------------------------------------------------------------------------------- /configs/opensora/train/64x512x512-sp.py: -------------------------------------------------------------------------------- 1 | # Define dataset 2 | dataset = dict( 3 | type="VideoTextDataset", 4 | data_path=None, 5 | num_frames=16, 6 | frame_interval=3, 7 | image_size=(512, 512), 8 | ) 9 | 10 | # Define acceleration 11 | num_workers = 4 12 | dtype = "bf16" 13 | grad_checkpoint = True 14 | plugin = "zero2" 15 | sp_size = 2 16 | 17 | # Define model 18 | model = dict( 19 | type="STDiT-XL/2", 20 | space_scale=1.0, 21 | time_scale=2 / 3, 22 | from_pretrained=None, 23 | enable_flash_attn=True, 24 | enable_layernorm_kernel=True, 25 | enable_sequence_parallelism=True, # enable sq here 26 | ) 27 | vae = dict( 28 | type="VideoAutoencoderKL", 29 | from_pretrained="stabilityai/sd-vae-ft-ema", 30 | ) 31 | text_encoder = dict( 32 | type="t5", 33 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 34 | model_max_length=120, 35 | shardformer=True, 36 | ) 37 | scheduler = dict( 38 | type="iddpm", 39 | timestep_respacing="", 40 | ) 41 | 42 | # Others 43 | seed = 42 44 | outputs = "outputs" 45 | wandb = False 46 | 47 | epochs = 1000 48 | log_every = 10 49 | ckpt_every = 1000 50 | load = None 51 | 52 | batch_size = 1 53 | lr = 2e-5 54 | grad_clip = 1.0 55 | -------------------------------------------------------------------------------- /configs/opensora/train/64x512x512.py: -------------------------------------------------------------------------------- 1 | # Define dataset 2 | dataset = dict( 3 | type="VideoTextDataset", 4 | data_path=None, 5 | num_frames=64, 6 | frame_interval=3, 7 | image_size=(512, 512), 8 | ) 9 | 10 | # Define acceleration 11 | num_workers = 4 12 | dtype = "bf16" 13 | grad_checkpoint = True 14 | plugin = "zero2" 15 | sp_size = 1 16 | 17 | # Define model 18 | model = dict( 19 | type="STDiT-XL/2", 20 | space_scale=1.0, 21 | time_scale=2 / 3, 22 | from_pretrained=None, 23 | enable_flash_attn=True, 24 | enable_layernorm_kernel=True, 25 | ) 26 | vae = dict( 27 | type="VideoAutoencoderKL", 28 | from_pretrained="stabilityai/sd-vae-ft-ema", 29 | micro_batch_size=64, 30 | ) 31 | text_encoder = dict( 32 | type="t5", 33 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 34 | model_max_length=120, 35 | shardformer=True, 36 | ) 37 | scheduler = dict( 38 | type="iddpm", 39 | timestep_respacing="", 40 | ) 41 | 42 | # Others 43 | seed = 42 44 | outputs = "outputs" 45 | wandb = False 46 | 47 | epochs = 1000 48 | log_every = 10 49 | ckpt_every = 250 50 | load = None 51 | 52 | batch_size = 4 53 | lr = 2e-5 54 | grad_clip = 1.0 55 | -------------------------------------------------------------------------------- /configs/pixart/inference/16x256x256.py: -------------------------------------------------------------------------------- 1 | num_frames = 16 2 | fps = 8 3 | image_size = (256, 256) 4 | 5 | # Define model 6 | model = dict( 7 | type="PixArt-XL/2", 8 | space_scale=0.5, 9 | time_scale=1.0, 10 | from_pretrained="outputs/098-F16S3-PixArt-XL-2/epoch7-global_step30000/model_ckpt.pt", 11 | ) 12 | vae = dict( 13 | type="VideoAutoencoderKL", 14 | from_pretrained="stabilityai/sd-vae-ft-ema", 15 | ) 16 | text_encoder = dict( 17 | type="t5", 18 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 19 | model_max_length=120, 20 | ) 21 | scheduler = dict( 22 | type="dpm-solver", 23 | num_sampling_steps=20, 24 | cfg_scale=7.0, 25 | 
) 26 | dtype = "bf16" 27 | 28 | # Others 29 | batch_size = 2 30 | seed = 42 31 | prompt_path = "./assets/texts/t2v_samples.txt" 32 | save_dir = "./samples/samples/" 33 | -------------------------------------------------------------------------------- /configs/pixart/inference/1x1024MS.py: -------------------------------------------------------------------------------- 1 | num_frames = 1 2 | fps = 1 3 | image_size = (1920, 512) 4 | multi_resolution = "PixArtMS" 5 | 6 | # Define model 7 | model = dict( 8 | type="PixArtMS-XL/2", 9 | space_scale=2.0, 10 | time_scale=1.0, 11 | no_temporal_pos_emb=True, 12 | from_pretrained="PixArt-XL-2-1024-MS.pth", 13 | ) 14 | vae = dict( 15 | type="VideoAutoencoderKL", 16 | from_pretrained="stabilityai/sd-vae-ft-ema", 17 | ) 18 | text_encoder = dict( 19 | type="t5", 20 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 21 | model_max_length=120, 22 | ) 23 | scheduler = dict( 24 | type="dpm-solver", 25 | num_sampling_steps=20, 26 | cfg_scale=7.0, 27 | ) 28 | dtype = "bf16" 29 | 30 | # Others 31 | batch_size = 2 32 | seed = 42 33 | prompt_path = "./assets/texts/t2i_samples.txt" 34 | save_dir = "./samples/samples/" 35 | -------------------------------------------------------------------------------- /configs/pixart/inference/1x20481B.py: -------------------------------------------------------------------------------- 1 | num_frames = 1 2 | fps = 1 3 | image_size = (2560, 1536) 4 | # image_size = (2048, 2048) 5 | 6 | model = dict( 7 | type="PixArt-1B/2", 8 | from_pretrained="PixArt-1B-2.pth", 9 | space_scale=4, 10 | no_temporal_pos_emb=True, 11 | enable_flash_attn=True, 12 | enable_layernorm_kernel=True, 13 | base_size=2048 // 8, 14 | ) 15 | vae = dict( 16 | type="VideoAutoencoderKL", 17 | from_pretrained="PixArt-alpha/pixart_sigma_sdxlvae_T5_diffusers", 18 | subfolder="vae", 19 | ) 20 | text_encoder = dict( 21 | type="t5", 22 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 23 | model_max_length=300, 24 | ) 25 | scheduler = dict( 26 | type="dpm-solver", 27 | num_sampling_steps=14, 28 | cfg_scale=4.5, 29 | ) 30 | dtype = "bf16" 31 | 32 | # Others 33 | batch_size = 1 34 | seed = 42 35 | prompt_path = "./assets/texts/t2i_sigma.txt" 36 | save_dir = "./samples/samples/" 37 | -------------------------------------------------------------------------------- /configs/pixart/inference/1x2048MS.py: -------------------------------------------------------------------------------- 1 | num_frames = 1 2 | fps = 1 3 | # image_size = (2560, 1536) 4 | # image_size = (2048, 2048) 5 | 6 | model = dict( 7 | type="PixArt-XL/2", 8 | from_pretrained="PixArt-Sigma-XL-2-2K-MS.pth", 9 | space_scale=4, 10 | no_temporal_pos_emb=True, 11 | enable_flash_attn=True, 12 | enable_layernorm_kernel=True, 13 | base_size=2048 // 8, 14 | ) 15 | vae = dict( 16 | type="VideoAutoencoderKL", 17 | from_pretrained="PixArt-alpha/pixart_sigma_sdxlvae_T5_diffusers", 18 | subfolder="vae", 19 | scaling_factor=0.13025, 20 | ) 21 | text_encoder = dict( 22 | type="t5", 23 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 24 | model_max_length=300, 25 | ) 26 | scheduler = dict( 27 | type="dpm-solver", 28 | num_sampling_steps=14, 29 | cfg_scale=4.5, 30 | ) 31 | dtype = "bf16" 32 | 33 | # Others 34 | batch_size = 1 35 | seed = 42 36 | prompt_path = "./assets/texts/t2i_sigma.txt" 37 | save_dir = "./samples/samples/" 38 | -------------------------------------------------------------------------------- /configs/pixart/inference/1x256x256.py: -------------------------------------------------------------------------------- 1 | num_frames = 1 2 
| fps = 1 3 | image_size = (256, 256) 4 | 5 | # Define model 6 | model = dict( 7 | type="PixArt-XL/2", 8 | space_scale=1.0, 9 | time_scale=1.0, 10 | no_temporal_pos_emb=True, 11 | from_pretrained="PixArt-XL-2-256x256.pth", 12 | ) 13 | vae = dict( 14 | type="VideoAutoencoderKL", 15 | from_pretrained="stabilityai/sd-vae-ft-ema", 16 | ) 17 | text_encoder = dict( 18 | type="t5", 19 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 20 | model_max_length=120, 21 | ) 22 | scheduler = dict( 23 | type="dpm-solver", 24 | num_sampling_steps=20, 25 | cfg_scale=7.0, 26 | ) 27 | dtype = "bf16" 28 | 29 | # Others 30 | batch_size = 2 31 | seed = 42 32 | prompt_path = "./assets/texts/t2i_samples.txt" 33 | save_dir = "./samples/samples/" 34 | -------------------------------------------------------------------------------- /configs/pixart/inference/1x512x512-rflow.py: -------------------------------------------------------------------------------- 1 | num_frames = 1 2 | fps = 1 3 | image_size = (512, 512) 4 | 5 | # Define model 6 | model = dict( 7 | type="PixArt-XL/2", 8 | space_scale=1.0, 9 | time_scale=1.0, 10 | no_temporal_pos_emb=True, 11 | from_pretrained="PRETRAINED_MODEL", 12 | ) 13 | vae = dict( 14 | type="VideoAutoencoderKL", 15 | from_pretrained="stabilityai/sd-vae-ft-ema", 16 | ) 17 | text_encoder = dict( 18 | type="t5", 19 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 20 | model_max_length=120, 21 | ) 22 | scheduler = dict( 23 | type="rflow", 24 | num_sampling_steps=20, 25 | cfg_scale=7.0, 26 | ) 27 | dtype = "bf16" 28 | 29 | # prompt_path = "./assets/texts/t2i_samples.txt" 30 | prompt = [ 31 | "Pirate ship trapped in a cosmic maelstrom nebula.", 32 | "A small cactus with a happy face in the Sahara desert.", 33 | "A small cactus with a sad face in the Sahara desert.", 34 | ] 35 | 36 | # Others 37 | batch_size = 2 38 | seed = 42 39 | save_dir = "./outputs/samples2/" 40 | -------------------------------------------------------------------------------- /configs/pixart/inference/1x512x512.py: -------------------------------------------------------------------------------- 1 | num_frames = 1 2 | fps = 1 3 | image_size = (512, 512) 4 | 5 | # Define model 6 | model = dict( 7 | type="PixArt-XL/2", 8 | space_scale=1.0, 9 | time_scale=1.0, 10 | no_temporal_pos_emb=True, 11 | from_pretrained="PixArt-XL-2-512x512.pth", 12 | ) 13 | vae = dict( 14 | type="VideoAutoencoderKL", 15 | from_pretrained="stabilityai/sd-vae-ft-ema", 16 | ) 17 | text_encoder = dict( 18 | type="t5", 19 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 20 | model_max_length=120, 21 | ) 22 | scheduler = dict( 23 | type="dpm-solver", 24 | num_sampling_steps=20, 25 | cfg_scale=7.0, 26 | ) 27 | dtype = "bf16" 28 | 29 | # prompt_path = "./assets/texts/t2i_samples.txt" 30 | prompt = [ 31 | "Pirate ship trapped in a cosmic maelstrom nebula.", 32 | "A small cactus with a happy face in the Sahara desert.", 33 | "A small cactus with a sad face in the Sahara desert.", 34 | ] 35 | 36 | # Others 37 | batch_size = 2 38 | seed = 42 39 | save_dir = "./samples/samples/" 40 | -------------------------------------------------------------------------------- /configs/pixart/train/16x256x256.py: -------------------------------------------------------------------------------- 1 | # Define dataset 2 | dataset = dict( 3 | type="VideoTextDataset", 4 | data_path=None, 5 | num_frames=16, 6 | frame_interval=3, 7 | image_size=(256, 256), 8 | ) 9 | 10 | # Define acceleration 11 | num_workers = 4 12 | dtype = "bf16" 13 | grad_checkpoint = True 14 | plugin = "zero2" 15 | sp_size = 1 16 
| 17 | # Define model 18 | model = dict( 19 | type="PixArt-XL/2", 20 | space_scale=0.5, 21 | time_scale=1.0, 22 | from_pretrained="PixArt-XL-2-512x512.pth", 23 | enable_flash_attn=True, 24 | enable_layernorm_kernel=True, 25 | ) 26 | vae = dict( 27 | type="VideoAutoencoderKL", 28 | from_pretrained="stabilityai/sd-vae-ft-ema", 29 | ) 30 | text_encoder = dict( 31 | type="t5", 32 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 33 | model_max_length=120, 34 | shardformer=True, 35 | ) 36 | scheduler = dict( 37 | type="iddpm", 38 | timestep_respacing="", 39 | ) 40 | 41 | # Others 42 | seed = 42 43 | outputs = "outputs" 44 | wandb = False 45 | 46 | epochs = 1000 47 | log_every = 10 48 | ckpt_every = 1000 49 | load = None 50 | 51 | batch_size = 8 52 | lr = 2e-5 53 | grad_clip = 1.0 54 | -------------------------------------------------------------------------------- /configs/pixart/train/1x2048x2048.py: -------------------------------------------------------------------------------- 1 | # Define dataset 2 | dataset = dict( 3 | type="VideoTextDataset", 4 | data_path="/home/zhaowangbo/data/csv/image-v1_1_ext_noempty_rcp_clean_info.csv", 5 | num_frames=1, 6 | frame_interval=3, 7 | image_size=(2048, 2048), 8 | ) 9 | 10 | # Define acceleration 11 | num_workers = 4 12 | dtype = "bf16" 13 | grad_checkpoint = True 14 | plugin = "zero2" 15 | sp_size = 1 16 | 17 | # Define model 18 | model = dict( 19 | type="PixArt-1B/2", 20 | space_scale=4.0, 21 | no_temporal_pos_emb=True, 22 | from_pretrained="PixArt-1B-2.pth", 23 | enable_flash_attn=True, 24 | enable_layernorm_kernel=True, 25 | ) 26 | 27 | vae = dict( 28 | type="VideoAutoencoderKL", 29 | from_pretrained="PixArt-alpha/pixart_sigma_sdxlvae_T5_diffusers", 30 | subfolder="vae", 31 | ) 32 | text_encoder = dict( 33 | type="t5", 34 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 35 | model_max_length=300, 36 | ) 37 | scheduler = dict( 38 | type="iddpm", 39 | timestep_respacing="", 40 | ) 41 | 42 | # Others 43 | seed = 42 44 | outputs = "outputs" 45 | wandb = False 46 | 47 | epochs = 1000 48 | log_every = 10 49 | ckpt_every = 1000 50 | load = None 51 | 52 | batch_size = 4 53 | lr = 2e-5 54 | grad_clip = 1.0 55 | -------------------------------------------------------------------------------- /configs/pixart/train/1x512x512-rflow.py: -------------------------------------------------------------------------------- 1 | # Define dataset 2 | dataset = dict( 3 | type="VideoTextDataset", 4 | data_path=None, 5 | num_frames=1, 6 | frame_interval=3, 7 | image_size=(512, 512), 8 | ) 9 | 10 | # Define acceleration 11 | num_workers = 4 12 | dtype = "bf16" 13 | grad_checkpoint = True 14 | plugin = "zero2" 15 | sp_size = 1 16 | 17 | # Define model 18 | model = dict( 19 | type="PixArt-XL/2", 20 | space_scale=1.0, 21 | time_scale=1.0, 22 | no_temporal_pos_emb=True, 23 | # from_pretrained="PixArt-XL-2-512x512.pth", 24 | from_pretrained="PRETRAINED_MODEL", 25 | enable_flash_attn=True, 26 | enable_layernorm_kernel=True, 27 | ) 28 | vae = dict( 29 | type="VideoAutoencoderKL", 30 | from_pretrained="stabilityai/sd-vae-ft-ema", 31 | ) 32 | text_encoder = dict( 33 | type="t5", 34 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 35 | model_max_length=120, 36 | shardformer=True, 37 | ) 38 | scheduler = dict( 39 | type="rflow", 40 | # timestep_respacing="", 41 | ) 42 | 43 | # Others 44 | seed = 42 45 | outputs = "outputs" 46 | wandb = True 47 | 48 | epochs = 2 49 | log_every = 10 50 | ckpt_every = 1000 51 | load = None 52 | 53 | batch_size = 64 54 | lr = 2e-5 55 | grad_clip = 1.0 56 | 
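The `rflow` scheduler selected in `1x512x512-rflow.py` above (and in `16x256x256-spee-rflow.py` earlier) swaps the iDDPM objective for rectified flow. Below is a minimal sketch of the usual rectified-flow training loss, assuming the common straight-line parameterisation `x_t = (1 - t) * x0 + t * noise` with a velocity target; the repository's scheduler may differ in timestep sampling, scaling, and loss weighting, so treat this as illustrative only.

```python
import torch
import torch.nn.functional as F


def rflow_loss(model, x0: torch.Tensor, cond: dict) -> torch.Tensor:
    """One rectified-flow step: regress the constant velocity along the data-noise line."""
    noise = torch.randn_like(x0)
    t = torch.rand(x0.shape[0], device=x0.device)   # t ~ U(0, 1), one per sample
    t_ = t.view(-1, *([1] * (x0.dim() - 1)))        # broadcast over C, T, H, W
    x_t = (1 - t_) * x0 + t_ * noise                # straight-line interpolation
    v_target = noise - x0                           # velocity is constant along that line
    v_pred = model(x_t, t, **cond)                  # conditioning kwargs are assumed
    return F.mse_loss(v_pred, v_target)
```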
-------------------------------------------------------------------------------- /configs/pixart/train/1x512x512.py: -------------------------------------------------------------------------------- 1 | # Define dataset 2 | dataset = dict( 3 | type="VideoTextDataset", 4 | data_path=None, 5 | num_frames=1, 6 | frame_interval=3, 7 | image_size=(512, 512), 8 | ) 9 | 10 | # Define acceleration 11 | num_workers = 4 12 | dtype = "bf16" 13 | grad_checkpoint = True 14 | plugin = "zero2" 15 | sp_size = 1 16 | 17 | # Define model 18 | model = dict( 19 | type="PixArt-XL/2", 20 | space_scale=1.0, 21 | time_scale=1.0, 22 | no_temporal_pos_emb=True, 23 | from_pretrained="PixArt-XL-2-512x512.pth", 24 | enable_flash_attn=True, 25 | enable_layernorm_kernel=True, 26 | ) 27 | vae = dict( 28 | type="VideoAutoencoderKL", 29 | from_pretrained="stabilityai/sd-vae-ft-ema", 30 | ) 31 | text_encoder = dict( 32 | type="t5", 33 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 34 | model_max_length=120, 35 | shardformer=True, 36 | ) 37 | scheduler = dict( 38 | type="iddpm", 39 | timestep_respacing="", 40 | ) 41 | 42 | # Others 43 | seed = 42 44 | outputs = "outputs" 45 | wandb = False 46 | 47 | epochs = 1000 48 | log_every = 10 49 | ckpt_every = 1000 50 | load = None 51 | 52 | batch_size = 32 53 | lr = 2e-5 54 | grad_clip = 1.0 55 | -------------------------------------------------------------------------------- /configs/pixart/train/64x512x512.py: -------------------------------------------------------------------------------- 1 | # Define dataset 2 | dataset = dict( 3 | type="VideoTextDataset", 4 | data_path=None, 5 | num_frames=64, 6 | frame_interval=3, 7 | image_size=(256, 256), 8 | ) 9 | 10 | # Define acceleration 11 | num_workers = 4 12 | dtype = "bf16" 13 | grad_checkpoint = True 14 | plugin = "zero2" 15 | sp_size = 1 16 | 17 | 18 | # Define model 19 | model = dict( 20 | type="PixArt-XL/2", 21 | space_scale=1.0, 22 | time_scale=2 / 3, 23 | from_pretrained=None, 24 | enable_flash_attn=True, 25 | enable_layernorm_kernel=True, 26 | ) 27 | vae = dict( 28 | type="VideoAutoencoderKL", 29 | from_pretrained="stabilityai/sd-vae-ft-ema", 30 | micro_batch_size=128, 31 | ) 32 | text_encoder = dict( 33 | type="t5", 34 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 35 | model_max_length=120, 36 | shardformer=True, 37 | ) 38 | scheduler = dict( 39 | type="iddpm", 40 | timestep_respacing="", 41 | ) 42 | 43 | # Others 44 | seed = 42 45 | outputs = "outputs" 46 | wandb = False 47 | 48 | epochs = 1000 49 | log_every = 10 50 | ckpt_every = 250 51 | load = None 52 | 53 | batch_size = 4 54 | lr = 2e-5 55 | grad_clip = 1.0 56 | -------------------------------------------------------------------------------- /configs/vae/inference/image.py: -------------------------------------------------------------------------------- 1 | image_size = (256, 256) 2 | num_frames = 1 3 | 4 | dtype = "bf16" 5 | batch_size = 1 6 | seed = 42 7 | save_dir = "samples/vae_video" 8 | cal_stats = True 9 | log_stats_every = 100 10 | 11 | # Define dataset 12 | dataset = dict( 13 | type="VideoTextDataset", 14 | data_path=None, 15 | num_frames=num_frames, 16 | image_size=image_size, 17 | ) 18 | num_samples = 100 19 | num_workers = 4 20 | 21 | # Define model 22 | model = dict( 23 | type="OpenSoraVAE_V1_2", 24 | from_pretrained="hpcai-tech/OpenSora-VAE-v1.2", 25 | micro_frame_size=None, 26 | micro_batch_size=4, 27 | cal_loss=True, 28 | ) 29 | 30 | # loss weights 31 | perceptual_loss_weight = 0.1 # use vgg is not None and more than 0 32 | kl_loss_weight = 1e-6 33 | 
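The `micro_batch_size` / `micro_frame_size` options in the VAE inference config above exist to bound peak memory: the autoencoder processes a large batch (or a long clip) in chunks instead of all at once. A rough sketch of that pattern follows, assuming a `[B, C, T, H, W]` tensor layout and a `vae.encode` call that preserves the temporal axis; the actual `OpenSoraVAE_V1_2` interface and chunking axes may differ.

```python
from typing import Optional

import torch


def encode_in_chunks(
    vae,
    video: torch.Tensor,                  # [B, C, T, H, W]
    micro_batch_size: int = 4,
    micro_frame_size: Optional[int] = None,
) -> torch.Tensor:
    """Encode `video` micro-batch by micro-batch (and optionally frame-chunk by frame-chunk)."""
    outs = []
    for b in range(0, video.shape[0], micro_batch_size):
        clip = video[b : b + micro_batch_size]
        if micro_frame_size is None:
            outs.append(vae.encode(clip))
        else:
            frame_chunks = [
                vae.encode(clip[:, :, f : f + micro_frame_size])
                for f in range(0, clip.shape[2], micro_frame_size)
            ]
            outs.append(torch.cat(frame_chunks, dim=2))  # re-join along the (latent) time axis
    return torch.cat(outs, dim=0)
```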
-------------------------------------------------------------------------------- /configs/vae/inference/video.py: -------------------------------------------------------------------------------- 1 | image_size = (256, 256) 2 | num_frames = 51 3 | 4 | dtype = "bf16" 5 | batch_size = 1 6 | seed = 42 7 | save_dir = "samples/vae_video" 8 | cal_stats = True 9 | log_stats_every = 100 10 | 11 | # Define dataset 12 | dataset = dict( 13 | type="VideoTextDataset", 14 | data_path=None, 15 | num_frames=num_frames, 16 | image_size=image_size, 17 | ) 18 | num_samples = 100 19 | num_workers = 4 20 | 21 | # Define model 22 | model = dict( 23 | type="OpenSoraVAE_V1_2", 24 | from_pretrained="hpcai-tech/OpenSora-VAE-v1.2", 25 | micro_frame_size=None, 26 | micro_batch_size=4, 27 | cal_loss=True, 28 | ) 29 | 30 | # loss weights 31 | perceptual_loss_weight = 0.1 # use vgg is not None and more than 0 32 | kl_loss_weight = 1e-6 33 | -------------------------------------------------------------------------------- /configs/vae/train/stage1.py: -------------------------------------------------------------------------------- 1 | num_frames = 17 2 | image_size = (256, 256) 3 | 4 | # Define dataset 5 | dataset = dict( 6 | type="VideoTextDataset", 7 | data_path=None, 8 | num_frames=num_frames, 9 | frame_interval=1, 10 | image_size=image_size, 11 | ) 12 | 13 | # Define acceleration 14 | num_workers = 16 15 | dtype = "bf16" 16 | grad_checkpoint = True 17 | plugin = "zero2" 18 | 19 | # Define model 20 | model = dict( 21 | type="OpenSoraVAE_V1_2", 22 | freeze_vae_2d=True, 23 | from_pretrained=None, 24 | cal_loss=True, 25 | ) 26 | 27 | # loss weights 28 | perceptual_loss_weight = 0.1 # use vgg is not None and more than 0 29 | kl_loss_weight = 1e-6 30 | 31 | mixed_strategy = "mixed_video_image" 32 | mixed_image_ratio = 0.2 33 | use_real_rec_loss = False 34 | use_z_rec_loss = True 35 | use_image_identity_loss = True 36 | 37 | # Others 38 | seed = 42 39 | outputs = "outputs/vae_stage1" 40 | wandb = False 41 | 42 | epochs = 100 # NOTE: adjust accordingly w.r.t dataset size 43 | log_every = 1 44 | ckpt_every = 1000 45 | load = None 46 | 47 | batch_size = 1 48 | lr = 1e-5 49 | grad_clip = 1.0 50 | -------------------------------------------------------------------------------- /configs/vae/train/stage2.py: -------------------------------------------------------------------------------- 1 | num_frames = 17 2 | image_size = (256, 256) 3 | 4 | # Define dataset 5 | dataset = dict( 6 | type="VideoTextDataset", 7 | data_path=None, 8 | num_frames=num_frames, 9 | frame_interval=1, 10 | image_size=image_size, 11 | ) 12 | 13 | # Define acceleration 14 | num_workers = 16 15 | dtype = "bf16" 16 | grad_checkpoint = True 17 | plugin = "zero2" 18 | 19 | # Define model 20 | model = dict( 21 | type="OpenSoraVAE_V1_2", 22 | freeze_vae_2d=False, 23 | from_pretrained="outputs/vae_stage1", 24 | cal_loss=True, 25 | ) 26 | 27 | # loss weights 28 | perceptual_loss_weight = 0.1 # use vgg is not None and more than 0 29 | kl_loss_weight = 1e-6 30 | 31 | mixed_strategy = "mixed_video_image" 32 | mixed_image_ratio = 0.2 33 | use_real_rec_loss = False 34 | use_z_rec_loss = True 35 | use_image_identity_loss = False 36 | 37 | # Others 38 | seed = 42 39 | outputs = "outputs/vae_stage2" 40 | wandb = False 41 | 42 | epochs = 100 # NOTE: adjust accordingly w.r.t dataset size 43 | log_every = 1 44 | ckpt_every = 1000 45 | load = None 46 | 47 | batch_size = 1 48 | lr = 1e-5 49 | grad_clip = 1.0 50 | 
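The three VAE training stages (stage 1 and 2 above, stage 3 next) differ mainly in `freeze_vae_2d`, `from_pretrained`, and which auxiliary losses are enabled. The sketch below shows one way the configured weights and flags could combine into a single objective; the loss-term names and the unit weights on the auxiliary terms are assumptions for illustration, not the repository's exact implementation.

```python
def vae_total_loss(losses: dict, cfg) -> float:
    """Combine reconstruction, perceptual, KL, and optional auxiliary terms per the config."""
    total = losses["recon"]                                # base reconstruction term
    total += cfg.perceptual_loss_weight * losses["lpips"]  # perceptual term (0.1 in the configs)
    total += cfg.kl_loss_weight * losses["kl"]             # KL term (1e-6 in the configs)
    if getattr(cfg, "use_real_rec_loss", False):
        total += losses["real_rec"]                        # stage 3: reconstruct the raw video
    if getattr(cfg, "use_z_rec_loss", False):
        total += losses["z_rec"]                           # stages 1-2: match the 2D VAE latents
    if getattr(cfg, "use_image_identity_loss", False):
        total += losses["image_identity"]                  # stage 1 only: preserve image behaviour
    return total
```

Read together, the curriculum appears to be: stage 1 trains only the temporal half against the frozen 2D VAE, stage 2 unfreezes the 2D VAE while still anchoring to its latents, and stage 3 drops that anchor and optimises end-to-end video reconstruction.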
-------------------------------------------------------------------------------- /configs/vae/train/stage3.py: -------------------------------------------------------------------------------- 1 | num_frames = 33 2 | image_size = (256, 256) 3 | 4 | # Define dataset 5 | dataset = dict( 6 | type="VideoTextDataset", 7 | data_path=None, 8 | num_frames=num_frames, 9 | frame_interval=1, 10 | image_size=image_size, 11 | ) 12 | 13 | # Define acceleration 14 | num_workers = 16 15 | dtype = "bf16" 16 | grad_checkpoint = True 17 | plugin = "zero2" 18 | 19 | # Define model 20 | model = dict( 21 | type="OpenSoraVAE_V1_2", 22 | freeze_vae_2d=False, 23 | from_pretrained="outputs/vae_stage2", 24 | cal_loss=True, 25 | ) 26 | 27 | # loss weights 28 | perceptual_loss_weight = 0.1 # use vgg is not None and more than 0 29 | kl_loss_weight = 1e-6 30 | 31 | mixed_strategy = "mixed_video_random" 32 | use_real_rec_loss = True 33 | use_z_rec_loss = False 34 | use_image_identity_loss = False 35 | 36 | # Others 37 | seed = 42 38 | outputs = "outputs/vae_stage3" 39 | wandb = False 40 | 41 | epochs = 100 # NOTE: adjust accordingly w.r.t dataset size 42 | log_every = 1 43 | ckpt_every = 1000 44 | load = None 45 | 46 | batch_size = 1 47 | lr = 1e-5 48 | grad_clip = 1.0 49 | -------------------------------------------------------------------------------- /docs/tutorial/.nojekyll: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /docs/tutorial/Gemfile: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | source "https://rubygems.org" 4 | 5 | gem "jekyll-theme-chirpy", "~> 7.1", ">= 7.1.1" 6 | 7 | gem "html-proofer", "~> 5.0", group: :test 8 | 9 | platforms :mingw, :x64_mingw, :mswin, :jruby do 10 | gem "tzinfo", ">= 1", "< 3" 11 | gem "tzinfo-data" 12 | end 13 | 14 | gem "wdm", "~> 0.1.1", :platforms => [:mingw, :x64_mingw, :mswin] 15 | -------------------------------------------------------------------------------- /docs/tutorial/_data/contact.yml: -------------------------------------------------------------------------------- 1 | # The contact options. 
2 | 3 | - type: github 4 | icon: "fab fa-github" 5 | 6 | #- type: twitter 7 | # icon: "fa-brands fa-x-twitter" 8 | 9 | #- type: email 10 | # icon: "fas fa-envelope" 11 | # noblank: true # open link in current tab 12 | 13 | #- type: rss 14 | # icon: "fas fa-rss" 15 | # noblank: true 16 | # Uncomment and complete the url below to enable more contact options 17 | # 18 | # - type: mastodon 19 | # icon: 'fab fa-mastodon' # icons powered by 20 | # url: '' # Fill with your Mastodon account page, rel="me" will be applied for verification 21 | # 22 | # - type: linkedin 23 | # icon: 'fab fa-linkedin' # icons powered by 24 | # url: '' # Fill with your Linkedin homepage 25 | # 26 | # - type: stack-overflow 27 | # icon: 'fab fa-stack-overflow' 28 | # url: '' # Fill with your stackoverflow homepage 29 | # 30 | # - type: bluesky 31 | # icon: 'fa-brands fa-bluesky' 32 | # url: '' # Fill with your Bluesky profile link 33 | # 34 | # - type: reddit 35 | # icon: 'fa-brands fa-reddit' 36 | # url: '' # Fill with your Reddit profile link 37 | # 38 | # - type: threads 39 | # icon: 'fa-brands fa-threads' 40 | # url: '' # Fill with your Threads profile link 41 | -------------------------------------------------------------------------------- /docs/tutorial/_data/locales/en-customized.yml: -------------------------------------------------------------------------------- 1 | # The layout text of site 2 | 3 | # ----- Commons label ----- 4 | 5 | layout: 6 | post: Post 7 | category: Category 8 | tag: Tag 9 | 10 | # The tabs of sidebar 11 | tabs: 12 | # format: : 13 | home: Tutorial 14 | categories: Categories 15 | tags: Tags 16 | archives: Archives 17 | about: About 18 | 19 | # the text displayed in the search bar & search results 20 | search: 21 | hint: search 22 | cancel: Cancel 23 | no_results: Oops! No results found. 24 | 25 | panel: 26 | lastmod: Recently Updated 27 | trending_tags: Trending Tags 28 | toc: Contents 29 | 30 | copyright: 31 | # Shown at the bottom of the post 32 | license: 33 | template: #This post is licensed under :LICENSE_NAME by the author. 34 | name: #CC BY 4.0 35 | link: #https://creativecommons.org/licenses/by/4.0/ 36 | 37 | # Displayed in the footer 38 | brief: #Some rights reserved. 39 | verbose: #>- 40 | #Except where otherwise noted, the blog posts on this site are licensed 41 | #under the Creative Commons Attribution 4.0 International (CC BY 4.0) License by the author. 42 | 43 | meta: #Using the :PLATFORM theme :THEME 44 | 45 | not_found: 46 | statment: Sorry, we've misplaced that URL or it's pointing to something that doesn't exist. 47 | 48 | notification: 49 | update_found: A new version of content is available. 50 | update: Update 51 | 52 | # ----- Posts related labels ----- 53 | 54 | post: 55 | written_by: By 56 | posted: Posted 57 | updated: Updated 58 | words: words 59 | pageview_measure: views 60 | read_time: 61 | unit: min 62 | prompt: read 63 | relate_posts: Further Reading 64 | share: Share 65 | button: 66 | next: Newer 67 | previous: Older 68 | copy_code: 69 | succeed: Copied! 70 | share_link: 71 | title: Copy link 72 | succeed: Link copied successfully! 73 | 74 | # Date time format. 
75 | # See: , 76 | df: 77 | post: 78 | strftime: "%b %e, %Y" 79 | dayjs: "ll" 80 | archives: 81 | strftime: "%b" 82 | dayjs: "MMM" 83 | 84 | # categories page 85 | categories: 86 | category_measure: 87 | singular: category 88 | plural: categories 89 | post_measure: 90 | singular: post 91 | plural: posts -------------------------------------------------------------------------------- /docs/tutorial/_data/share.yml: -------------------------------------------------------------------------------- 1 | # Sharing options at the bottom of the post. 2 | # Icons from 3 | 4 | platforms: 5 | - type: Twitter 6 | icon: "fa-brands fa-square-x-twitter" 7 | link: "https://twitter.com/intent/tweet?text=TITLE&url=URL" 8 | 9 | - type: Facebook 10 | icon: "fab fa-facebook-square" 11 | link: "https://www.facebook.com/sharer/sharer.php?title=TITLE&u=URL" 12 | 13 | - type: Telegram 14 | icon: "fab fa-telegram" 15 | link: "https://t.me/share/url?url=URL&text=TITLE" 16 | 17 | # Uncomment below if you need to. 18 | # 19 | # - type: Linkedin 20 | # icon: "fab fa-linkedin" 21 | # link: "https://www.linkedin.com/sharing/share-offsite/?url=URL" 22 | # 23 | # - type: Weibo 24 | # icon: "fab fa-weibo" 25 | # link: "https://service.weibo.com/share/share.php?title=TITLE&url=URL" 26 | # 27 | # - type: Mastodon 28 | # icon: "fa-brands fa-mastodon" 29 | # # See: https://github.com/justinribeiro/share-to-mastodon#properties 30 | # instances: 31 | # - label: mastodon.social 32 | # link: "https://mastodon.social/" 33 | # - label: mastodon.online 34 | # link: "https://mastodon.online/" 35 | # - label: fosstodon.org 36 | # link: "https://fosstodon.org/" 37 | # - label: photog.social 38 | # link: "https://photog.social/" 39 | # 40 | # - type: Bluesky 41 | # icon: "fa-brands fa-bluesky" 42 | # link: "https://bsky.app/intent/compose?text=TITLE%20URL" 43 | # 44 | # - type: Reddit 45 | # icon: "fa-brands fa-square-reddit" 46 | # link: "https://www.reddit.com/submit?url=URL&title=TITLE" 47 | # 48 | # - type: Threads 49 | # icon: "fa-brands fa-square-threads" 50 | # link: "https://www.threads.net/intent/post?text=TITLE%20URL" 51 | -------------------------------------------------------------------------------- /docs/tutorial/_includes/favicons.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /docs/tutorial/_includes/topbar.html: -------------------------------------------------------------------------------- 1 | 2 | 3 |
4 |
8 | 41 | 42 | 43 | 46 | 47 |
48 | {% if page.layout == 'home' %} 49 | {{- site.data.locales[include.lang].title | default: site.title -}} 50 | {% elsif page.collection == 'tabs' or page.layout == 'page' %} 51 | {%- capture tab_key -%}{{ page.url | split: '/' }}{%- endcapture -%} 52 | {{- site.data.locales[include.lang].tabs[tab_key] | default: page.title -}} 53 | {% else %} 54 | {{- site.data.locales[include.lang].layout[page.layout] | default: page.layout | capitalize -}} 55 | {% endif %} 56 |
57 |
58 |
-------------------------------------------------------------------------------- /docs/tutorial/_plugins/details_tag.rb: -------------------------------------------------------------------------------- 1 | module Jekyll 2 | module Tags 3 | class DetailsTag < Liquid::Block 4 | 5 | def initialize(tag_name, markup, tokens) 6 | super 7 | @caption = markup 8 | end 9 | 10 | def render(context) 11 | site = context.registers[:site] 12 | converter = site.find_converter_instance(::Jekyll::Converters::Markdown) 13 | # below Jekyll 3.x use this: 14 | # converter = site.getConverterImpl(::Jekyll::Converters::Markdown) 15 | caption = converter.convert(@caption).gsub(/<\/?p[^>]*>/, '').chomp 16 | body = converter.convert(super(context)) 17 | "
<details><summary>#{caption}</summary>#{body}</details>
" 18 | end 19 | 20 | end 21 | end 22 | end 23 | 24 | Liquid::Template.register_tag('details', Jekyll::Tags::DetailsTag) -------------------------------------------------------------------------------- /docs/tutorial/_plugins/posts-lastmod-hook.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | # 3 | # Check for changed posts 4 | 5 | Jekyll::Hooks.register :posts, :post_init do |post| 6 | 7 | commit_num = `git rev-list --count HEAD "#{ post.path }"` 8 | 9 | if commit_num.to_i > 1 10 | lastmod_date = `git log -1 --pretty="%ad" --date=iso "#{ post.path }"` 11 | post.data['last_modified_at'] = lastmod_date 12 | end 13 | 14 | end 15 | -------------------------------------------------------------------------------- /docs/tutorial/_posts/.placeholder: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /docs/tutorial/_tabs/repository.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: post 3 | icon: fas fa-link 4 | title: Github Repository 5 | date: 2024-10-16 6 | toc: true 7 | order: 6 8 | --- 9 | 10 | 13 | 14 | -------------------------------------------------------------------------------- /docs/tutorial/assets/css/jekyll-theme-chirpy.scss: -------------------------------------------------------------------------------- 1 | --- 2 | --- 3 | 4 | @import 'main 5 | {%- if jekyll.environment == 'production' -%} 6 | .bundle 7 | {%- endif -%} 8 | '; 9 | 10 | @import 'colors/typography-dark.scss'; 11 | @import 'colors/typography-light.scss'; 12 | 13 | /* append your custom style below */ 14 | .todo { background: red ;}; 15 | 16 | .iframe-button { 17 | width: 100%; 18 | height: 150px; 19 | display: flex; 20 | justify-content: center; 21 | align-items: center; 22 | cursor: pointer; 23 | 24 | } 25 | 26 | html { 27 | font-size: 16px; 28 | 29 | @media (prefers-color-scheme: light) { 30 | &:not([data-mode]), 31 | &[data-mode='light'] { 32 | @include light-scheme; 33 | } 34 | 35 | &[data-mode='dark'] { 36 | @include dark-scheme; 37 | } 38 | } 39 | 40 | @media (prefers-color-scheme: dark) { 41 | &:not([data-mode]), 42 | &[data-mode='dark'] { 43 | @include dark-scheme; 44 | } 45 | 46 | &[data-mode='light'] { 47 | @include light-scheme; 48 | } 49 | } 50 | } -------------------------------------------------------------------------------- /docs/tutorial/assets/fails_loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/docs/tutorial/assets/fails_loss.png -------------------------------------------------------------------------------- /docs/tutorial/assets/fails_weight_norm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/docs/tutorial/assets/fails_weight_norm.png -------------------------------------------------------------------------------- /docs/tutorial/assets/img/lambda-logo.svg: -------------------------------------------------------------------------------- 1 | 2 | 10 | 12 | 18 | 22 | 23 | -------------------------------------------------------------------------------- /docs/tutorial/assets/monitoring_tool.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/docs/tutorial/assets/monitoring_tool.png -------------------------------------------------------------------------------- /docs/tutorial/assets/pyspy_dump.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/docs/tutorial/assets/pyspy_dump.png -------------------------------------------------------------------------------- /docs/tutorial/index.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: post 3 | icon: fas fa-info-circle 4 | permalink: '/' 5 | title: "Tutorial - Let's reproduce a T2V model." 6 | date: 2024-10-02 7 | toc: true 8 | --- 9 | 10 | 13 | 14 | -------------------------------------------------------------------------------- /docs/zh_CN/datasets.md: -------------------------------------------------------------------------------- 1 | # 数据集 2 | 3 | ## 正在使用的数据集 4 | 5 | ### HD-VG-130M 6 | 7 | [HD-VG-130M](https://github.com/daooshee/HD-VG-130M?tab=readme-ov-file) 包括 130M 个文本视频对。标题是 8 | 由 BLIP-2 生成。我们发现剪切和文本质量相对较差。它包含 20 个拆分。对于 OpenSora 1.0,我们使用第一个拆分。我们计划使用整个数据集并对其进行重新处理。 9 | 10 | ### Inter4k 11 | 12 | [Inter4k](https://github.com/alexandrosstergiou/Inter4K) 是一个包含分辨率为 4K 的 1k 视频剪辑的数据集。这个 13 | 数据集被提议用于超分辨率任务。我们使用数据集进行 HQ 训练。处理过的视频可以从这里找到 [这里](README.md#数据处理) 。 14 | 15 | ### Pexels.com 16 | 17 | [Pexels.com](https://www.pexels.com/) 是一个提供免费库存照片和视频的网站。我们收集的 19K 视频 18 | 来自本网站的剪辑,用于高质量训练。处理过的视频可以从这里找到 [这里](README.md#数据处理) 。 19 | 20 | ## 数据集监视列表 21 | 22 | 我们也在关注以下数据集,并考虑在未来使用它们,这取决于我们的存储空间以及数据集的质量。 23 | 24 | | 名称 | 大小 | 描述 | 25 | |-------------------|--------------|-------------------------------| 26 | | Panda-70M | 70M videos | High quality video-text pairs | 27 | | WebVid-10M | 10M videos | Low quality | 28 | | InternVid-10M-FLT | 10M videos | | 29 | | EGO4D | 3670 hours | | 30 | | OpenDV-YouTube | 1700 hours | | 31 | | VidProM | 6.69M videos | | 32 | -------------------------------------------------------------------------------- /docs/zh_CN/report_v1.md: -------------------------------------------------------------------------------- 1 | # Open-Sora v1 技术报告 2 | 3 | OpenAI的Sora在生成一分钟高质量视频方面非常出色。然而,它几乎没有透露任何关于其细节的信息。为了使人工智能更加“开放”,我们致力于构建一个开源版本的Sora。这份报告描述了我们第一次尝试训练一个基于Transformer的视频扩散模型。 4 | 5 | ## 选择高效的架构 6 | 7 | 为了降低计算成本,我们希望利用现有的VAE模型。Sora使用时空VAE来减少时间维度。然而,我们发现没有开源的高质量时空VAE模型。[MAGVIT](https://github.com/google-research/magvit)的4x4x4 VAE并未开源,而[VideoGPT](https://wilson1yan.github.io/videogpt/index.html)的2x4x4 VAE在我们的实验中质量较低。因此,我们决定在我们第一个版本中使用2D VAE(来自[Stability-AI](https://huggingface.co/stabilityai/sd-vae-ft-mse-original))。 8 | 9 | 视频训练涉及大量的token。考虑到24fps的1分钟视频,我们有1440帧。通过VAE下采样4倍和patch大小下采样2倍,我们得到了1440x1024≈150万个token。在150万个token上进行全注意力计算将带来巨大的计算成本。因此,我们使用时空注意力来降低成本,这是遵循[Latte](https://github.com/Vchitect/Latte)的方法。 10 | 11 | 如图中所示,在STDiT(ST代表时空)中,我们在每个空间注意力之后立即插入一个时间注意力。这类似于Latte论文中的变种3。然而,我们并没有控制这些变体的相似数量的参数。虽然Latte的论文声称他们的变体比变种3更好,但我们在16x256x256视频上的实验表明,相同数量的迭代次数下,性能排名为:DiT(完整)> STDiT(顺序)> STDiT(并行)≈ Latte。因此,我们出于效率考虑选择了STDiT(顺序)。[这里](/docs/acceleration.md#efficient-stdit)提供了速度基准测试。 12 | 13 | 14 | ![Architecture Comparison](/assets/readme/report_arch_comp.png) 15 | 16 | 为了专注于视频生成,我们希望基于一个强大的图像生成模型来训练我们的模型。PixArt-α是一个经过高效训练的高质量图像生成模型,具有T5条件化的DiT结构。我们使用[PixArt-α](https://github.com/PixArt-alpha/PixArt-alpha)初始化我们的模型,并将插入的时间注意力的投影层初始化为零。这种初始化在开始时保留了模型的图像生成能力,而Latte的架构则不能。插入的注意力将参数数量从5.8亿增加到7.24亿。 17 | 18 
| ![Architecture](/assets/readme/report_arch.jpg) 19 | 20 | 借鉴PixArt-α和Stable Video Diffusion的成功,我们还采用了渐进式训练策略:在366K预训练数据集上进行16x256x256的训练,然后在20K数据集上进行16x256x256、16x512x512和64x512x512的训练。通过扩展位置嵌入,这一策略极大地降低了计算成本。 21 | 22 | 我们还尝试在DiT中使用3D patch嵌入器。然而,在时间维度上2倍下采样后,生成的视频质量较低。因此,我们将在下一版本中将下采样留给时间VAE。目前,我们在每3帧采样一次进行16帧训练,以及在每2帧采样一次进行64帧训练。 23 | 24 | 25 | ## 数据是训练高质量模型的核心 26 | 27 | 我们发现数据的数量和质量对生成视频的质量有很大的影响,甚至比模型架构和训练策略的影响还要大。目前,我们只从[HD-VG-130M](https://github.com/daooshee/HD-VG-130M)准备了第一批分割(366K个视频片段)。这些视频的质量参差不齐,而且字幕也不够准确。因此,我们进一步从提供免费许可视频的[Pexels](https://www.pexels.com/)收集了20k相对高质量的视频。我们使用LLaVA,一个图像字幕模型,通过三个帧和一个设计好的提示来标记视频。有了设计好的提示,LLaVA能够生成高质量的字幕。 28 | 29 | ![Caption](/assets/readme/report_caption.png) 30 | 31 | 由于我们更加注重数据质量,我们准备收集更多数据,并在下一版本中构建一个视频预处理流程。 32 | 33 | ## 训练细节 34 | 35 | 在有限的训练预算下,我们只进行了一些探索。我们发现学习率1e-4过大,因此将其降低到2e-5。在进行大批量训练时,我们发现`fp16`比`bf16`不太稳定,可能会导致生成失败。因此,我们在64x512x512的训练中切换到`bf16`。对于其他超参数,我们遵循了之前的研究工作。 36 | 37 | ## 损失曲线 38 | 39 | 16x256x256 预训练损失曲线 40 | 41 | ![16x256x256 Pretraining Loss Curve](/assets/readme/report_loss_curve_1.png) 42 | 43 | 16x256x256 高质量训练损失曲线 44 | 45 | ![16x256x256 HQ Training Loss Curve](/assets/readme/report_loss_curve_2.png) 46 | 47 | 16x512x512 高质量训练损失曲线 48 | 49 | ![16x512x512 HQ Training Loss Curve](/assets/readme/report_loss_curve_3.png) 50 | -------------------------------------------------------------------------------- /docs/zh_CN/vae.md: -------------------------------------------------------------------------------- 1 | # VAE 技术报告 2 | 3 | 由于 [Pixart-Sigma](https://arxiv.org/abs/2403.04692) 论文中指出适应新的VAE很简单,因此我们开发了一个额外的时间VAE。 4 | 具体而言, 我们的VAE由一个[空间 VAE](https://huggingface.co/PixArt-alpha/pixart_sigma_sdxlvae_T5_diffusers)和一个时间VA相接的形式组成. 5 | 对于时间VAE,我们遵循 [MAGVIT-v2](https://arxiv.org/abs/2310.05737)的实现, 并做了以下修改: 6 | 7 | * 我们删除了码本特有的架构。 8 | * 我们不使用鉴别​​器(discriminator),而是使用VAE重建损失、kl损失和感知损失进行训练。 9 | * 在编码器的最后一个线性层中,我们缩小到 4 通道的对角高斯分布,遵循我们之前训练的接受 4 通道输入的 STDiT。 10 | * 我们的解码器与编码器架构对称。 11 | 12 | ## 训练 13 | 我们分不同阶段训练模型。 14 | 15 | 我们首先通过在单台机器(8 个 GPU)上冻结空间 VAE 380k 步来训练时间 VAE。我们使用额外的身份损失使 3D VAE 的特征与 2D VAE 的特征相似。我们使用 20% 的图像和 80% 的视频(17 帧)来训练 VAE。 16 | 17 | ```bash 18 | torchrun --nnodes=1 --nproc_per_node=8 scripts/train_vae.py configs/vae/train/stage1.py --data-path YOUR_CSV_PATH 19 | ``` 20 | 21 | 接下来,我们移除身份损失并训练 3D VAE 管道以重建 260k 步的 2D 压缩视频。 22 | 23 | ```bash 24 | torchrun --nnodes=1 --nproc_per_node=8 scripts/train_vae.py configs/vae/train/stage2.py --data-path YOUR_CSV_PATH 25 | ``` 26 | 27 | 最后,我们移除了 2D 压缩视频的重建损失,并训练 VAE 管道以构建 540k 步的 3D 视频。我们在 34 帧内使用随机数训练 VAE,使其对不同长度的视频更具鲁棒性。此阶段在 24 个 GPU 上进行训练。 28 | 29 | ```bash 30 | torchrun --nnodes=3 --nproc_per_node=8 scripts/train_vae.py configs/vae/train/stage3.py --data-path YOUR_CSV_PATH 31 | ``` 32 | 33 | 请注意,您需要根据自己的 csv 数据大小相应地调整配置文件中的 `epochs` 。 34 | 35 | ## 推理 36 | 37 | 为了直观地检查 VAE 的性能,您可以运行以下推理。它使用 `_ori` 后缀(即 `"YOUR_VIDEO_DIR"_ori`)将原始视频保存到您指定的视频目录中,使用`_rec`后缀(即`"YOUR_VIDEO_DIR"_rec`)将来自完整管道的重建视频保存到指定的视频目录中,并使用 `_spatial`后缀(即`"YOUR_VIDEO_DIR"_spatial`)将来自 2D 压缩和解压缩的重建视频保存到指定的视频目录中。 38 | 39 | ```bash 40 | torchrun --standalone --nnodes=1 --nproc_per_node=1 scripts/inference_vae.py configs/vae/inference/video.py --ckpt-path YOUR_VAE_CKPT_PATH --data-path YOUR_CSV_PATH --save-dir YOUR_VIDEO_DIR 41 | ``` 42 | ## 评估 43 | 然后,我们可以计算 VAE 在 SSIM、PSNR、LPIPS 和 FLOLPIPS 指标上的表现得分。 44 | 45 | * SSIM: 结构相似性指数度量,越高越好 46 | * PSNR: 峰值信噪比,越高越好 47 | * LPIPS: 学习感知图像质量下降,越低越好 48 | * [FloLPIPS](https://arxiv.org/pdf/2207.08119): 带有视频插值的LPIPS,越低越好。 49 | 50 | ```bash 51 | python eval/vae/eval_common_metric.py 
--batch_size 2 --real_video_dir YOUR_VIDEO_DIR_ori --generated_video_dir YOUR_VIDEO_DIR_rec --device cuda --sample_fps 24 --crop_size 256 --resolution 256 --num_frames 17 --sample_rate 1 --metric ssim psnr lpips flolpips 52 | ``` 53 | 54 | ## 致谢 55 | 我们非常感谢以下工作: 56 | * [MAGVIT-v2](https://arxiv.org/abs/2310.05737): Language Model Beats Diffusion -- Tokenizer is Key to Visual Generation 57 | * [Taming Transformers](https://github.com/CompVis/taming-transformers): Taming Transformers for High-Resolution Image Synthesis 58 | * [3D blur pooling](https://github.com/adobe/antialiased-cnns/pull/39/commits/3d6f02b6943c58b68c19c07bc26fad57492ff3bc) 59 | * [Open-Sora-Plan](https://github.com/PKU-YuanGroup/Open-Sora-Plan) 60 | -------------------------------------------------------------------------------- /eval/human_eval/generate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -x 4 | set -e 5 | 6 | TEXT_PATH=/home/data/sora_data/pixart-sigma-generated/text.txt 7 | OUTPUT_PATH=/home/data/sora_data/pixart-sigma-generated/raw 8 | CMD="python scripts/inference.py configs/pixart/inference/1x2048MS.py" 9 | # LOG_BASE=logs/sample/generate 10 | LOG_BASE=$(dirname $CKPT)/eval/generate 11 | mkdir -p ${LOG_BASE} 12 | NUM_PER_GPU=10000 13 | N_LAUNCH=2 14 | NUM_START=$(($N_LAUNCH * $NUM_PER_GPU * 8)) 15 | 16 | CUDA_VISIBLE_DEVICES=0 $CMD --prompt-path $TEXT_PATH --save-dir $OUTPUT_PATH --start-index $(($NUM_START + $NUM_PER_GPU * 0)) --end-index $(($NUM_START + $NUM_PER_GPU * 1)) --image-size 2048 2048 --verbose 1 --batch-size 2 >${LOG_BASE}/${N_LAUNCH}_1.log 2>&1 & 17 | CUDA_VISIBLE_DEVICES=1 $CMD --prompt-path $TEXT_PATH --save-dir $OUTPUT_PATH --start-index $(($NUM_START + $NUM_PER_GPU * 1)) --end-index $(($NUM_START + $NUM_PER_GPU * 2)) --image-size 1408 2816 --verbose 1 --batch-size 2 >${LOG_BASE}/${N_LAUNCH}_2.log 2>&1 & 18 | CUDA_VISIBLE_DEVICES=2 $CMD --prompt-path $TEXT_PATH --save-dir $OUTPUT_PATH --start-index $(($NUM_START + $NUM_PER_GPU * 2)) --end-index $(($NUM_START + $NUM_PER_GPU * 3)) --image-size 2816 1408 --verbose 1 --batch-size 2 >${LOG_BASE}/${N_LAUNCH}_3.log 2>&1 & 19 | CUDA_VISIBLE_DEVICES=3 $CMD --prompt-path $TEXT_PATH --save-dir $OUTPUT_PATH --start-index $(($NUM_START + $NUM_PER_GPU * 3)) --end-index $(($NUM_START + $NUM_PER_GPU * 4)) --image-size 1664 2304 --verbose 1 --batch-size 2 >${LOG_BASE}/${N_LAUNCH}_4.log 2>&1 & 20 | CUDA_VISIBLE_DEVICES=4 $CMD --prompt-path $TEXT_PATH --save-dir $OUTPUT_PATH --start-index $(($NUM_START + $NUM_PER_GPU * 4)) --end-index $(($NUM_START + $NUM_PER_GPU * 5)) --image-size 2304 1664 --verbose 1 --batch-size 2 >${LOG_BASE}/${N_LAUNCH}_5.log 2>&1 & 21 | CUDA_VISIBLE_DEVICES=5 $CMD --prompt-path $TEXT_PATH --save-dir $OUTPUT_PATH --start-index $(($NUM_START + $NUM_PER_GPU * 5)) --end-index $(($NUM_START + $NUM_PER_GPU * 6)) --image-size 1536 2560 --verbose 1 --batch-size 2 >${LOG_BASE}/${N_LAUNCH}_6.log 2>&1 & 22 | CUDA_VISIBLE_DEVICES=6 $CMD --prompt-path $TEXT_PATH --save-dir $OUTPUT_PATH --start-index $(($NUM_START + $NUM_PER_GPU * 6)) --end-index $(($NUM_START + $NUM_PER_GPU * 7)) --image-size 2560 1536 --verbose 1 --batch-size 2 >${LOG_BASE}/${N_LAUNCH}_7.log 2>&1 & 23 | CUDA_VISIBLE_DEVICES=7 $CMD --prompt-path $TEXT_PATH --save-dir $OUTPUT_PATH --start-index $(($NUM_START + $NUM_PER_GPU * 7)) --end-index $(($NUM_START + $NUM_PER_GPU * 8)) --image-size 2048 2048 --verbose 1 --batch-size 2 >${LOG_BASE}/${N_LAUNCH}_8.log 2>&1 & 24 | 
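`generate.sh` above fans the prompt file out across 8 GPUs (and across repeated launches) purely by index arithmetic on `N_LAUNCH`, `NUM_PER_GPU`, and the GPU slot. The helper below reproduces that arithmetic so a given (launch, GPU) pair can be mapped back to its prompt range when checking or resuming runs; the variable names mirror the shell script, but the function itself is illustrative and not part of the repository.

```python
def shard_range(n_launch: int, gpu: int, num_per_gpu: int = 10_000, gpus_per_node: int = 8):
    """Return the [start, end) prompt indices handled by `gpu` during launch `n_launch`."""
    num_start = n_launch * num_per_gpu * gpus_per_node   # NUM_START in the script
    start = num_start + num_per_gpu * gpu
    end = num_start + num_per_gpu * (gpu + 1)
    return start, end


# e.g. shard_range(2, 0) == (160000, 170000), matching N_LAUNCH=2 and the first GPU above.
```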
-------------------------------------------------------------------------------- /eval/human_eval/launch.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CKPT=$1 4 | NUM_FRAMES=$2 5 | MODEL_NAME=$3 6 | 7 | if [[ $CKPT == *"ema"* ]]; then 8 | parentdir=$(dirname $CKPT) 9 | CKPT_BASE=$(basename $parentdir)_ema 10 | else 11 | CKPT_BASE=$(basename $CKPT) 12 | fi 13 | LOG_BASE=$(dirname $CKPT)/eval 14 | mkdir -p ${LOG_BASE} 15 | echo "Logging to $LOG_BASE" 16 | 17 | GPUS=(0 1 2 3 4 5 6 7) 18 | # TASK_ID_LIST=(1 2a 2b 2c 2d 2e 2f 2g) # move image to video task 19 | TASK_ID_LIST=(2a 2b 2c 2d 2e 2f 2g 2h) 20 | # FRAME_LIST=(1 $NUM_FRAMES $NUM_FRAMES $NUM_FRAMES $NUM_FRAMES $NUM_FRAMES $NUM_FRAMES $NUM_FRAMES) 21 | 22 | for i in "${!GPUS[@]}"; do 23 | CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT $NUM_FRAMES $MODEL_NAME -${TASK_ID_LIST[i]} >${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 & 24 | done 25 | 26 | # kill all by: pkill -f "inference" 27 | -------------------------------------------------------------------------------- /eval/loss/launch.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CMD="torchrun --standalone --nproc_per_node 1 eval/loss/eval_loss.py configs/opensora-v1-2/misc/eval_loss.py" 4 | CKPT_PATH=$1 5 | MODEL_NAME=$2 6 | IMG_PATH=$3 7 | VID_PATH=$4 8 | 9 | if [ -z $IMG_PATH ]; then 10 | IMG_PATH="/mnt/jfs-hdd/sora/meta/validation/img_1k.csv" 11 | fi 12 | 13 | if [ -z $VID_PATH ]; then 14 | VID_PATH="/mnt/jfs-hdd/sora/meta/validation/vid_100.csv" 15 | fi 16 | 17 | if [[ $CKPT_PATH == *"ema"* ]]; then 18 | parentdir=$(dirname $CKPT_PATH) 19 | CKPT_BASE=$(basename $parentdir)_ema 20 | else 21 | CKPT_BASE=$(basename $CKPT_PATH) 22 | fi 23 | LOG_BASE=$(dirname $CKPT_PATH)/eval 24 | mkdir -p $LOG_BASE 25 | echo "Logging to $LOG_BASE" 26 | 27 | 28 | GPUS=(3 4 5 6 7) 29 | RESOLUTION=(144p 240p 360p 480p 720p) 30 | 31 | CUDA_VISIBLE_DEVICES=0 $CMD --data-path $IMG_PATH --ckpt-path $CKPT_PATH --start-index 0 --end-index 5 >${LOG_BASE}/img_0.log 2>&1 & 32 | CUDA_VISIBLE_DEVICES=1 $CMD --data-path $IMG_PATH --ckpt-path $CKPT_PATH --start-index 5 --end-index 6 >${LOG_BASE}/img_1.log 2>&1 & 33 | CUDA_VISIBLE_DEVICES=2 $CMD --data-path $IMG_PATH --ckpt-path $CKPT_PATH --start-index 6 >${LOG_BASE}/img_2.log 2>&1 & 34 | 35 | 36 | for i in "${!GPUS[@]}"; do 37 | CUDA_VISIBLE_DEVICES=${GPUS[i]} $CMD --data-path $VID_PATH --ckpt-path $CKPT_PATH --resolution ${RESOLUTION[i]} >${LOG_BASE}/${RESOLUTION[i]}_vid.log 2>&1 & 38 | done 39 | -------------------------------------------------------------------------------- /eval/loss/tabulate_rl_loss.py: -------------------------------------------------------------------------------- 1 | """ 2 | usage: 3 | python tabulate_rl_loss.py --log_dir /home/zhengzangwei/projs/Open-Sora-dev/logs/loss --ckpt_name epoch0-global_step9000 4 | 5 | save the processed json to: 6 | Open-Sora-dev/evaluation_results/rectified_flow/_loss.json 7 | """ 8 | 9 | import argparse 10 | import json 11 | import os 12 | from ast import literal_eval 13 | 14 | 15 | def parse_args(): 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument("--log_dir", type=str) 18 | args = parser.parse_args() 19 | return args 20 | 21 | 22 | if __name__ == "__main__": 23 | args = parse_args() 24 | 25 | files = os.listdir(args.log_dir) 26 | files = [ 27 | "img_0.log", 28 | "img_1.log", 29 | "img_2.log", 30 | "144p_vid.log", 31 | "240p_vid.log", 32 | "360p_vid.log", 33 | 
"480p_vid.log", 34 | "720p_vid.log", 35 | ] 36 | 37 | loss_info = {} 38 | 39 | for fname in files: 40 | path = os.path.join(args.log_dir, fname) 41 | with open(path, "r", encoding="utf-8") as f: 42 | content = f.readlines() 43 | eval_line = content[-1].split("losses:")[-1].strip() 44 | loss_dict = literal_eval(eval_line) 45 | for key, loss in loss_dict.items(): 46 | resolution, frame = key 47 | if resolution not in loss_info: 48 | loss_info[resolution] = {} 49 | loss_info[resolution][frame] = format(loss, ".4f") 50 | 51 | # Convert and write JSON object to file 52 | output_file_path = os.path.join(args.log_dir, "loss.json") 53 | with open(output_file_path, "w") as outfile: 54 | json.dump(loss_info, outfile, indent=4, sort_keys=True) 55 | print(f"results saved to: {output_file_path}") 56 | -------------------------------------------------------------------------------- /eval/vae/cal_psnr.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import numpy as np 4 | import torch 5 | from tqdm import tqdm 6 | 7 | 8 | def img_psnr(img1, img2): 9 | # [0,1] 10 | # compute mse 11 | # mse = np.mean((img1-img2)**2) 12 | mse = np.mean((img1 / 1.0 - img2 / 1.0) ** 2) 13 | # compute psnr 14 | if mse < 1e-10: 15 | return 100 16 | psnr = 20 * math.log10(1 / math.sqrt(mse)) 17 | return psnr 18 | 19 | 20 | def trans(x): 21 | return x 22 | 23 | 24 | def calculate_psnr(videos1, videos2): 25 | print("calculate_psnr...") 26 | 27 | # videos [batch_size, timestamps, channel, h, w] 28 | 29 | assert videos1.shape == videos2.shape 30 | 31 | videos1 = trans(videos1) 32 | videos2 = trans(videos2) 33 | 34 | psnr_results = [] 35 | 36 | for video_num in tqdm(range(videos1.shape[0])): 37 | # get a video 38 | # video [timestamps, channel, h, w] 39 | video1 = videos1[video_num] 40 | video2 = videos2[video_num] 41 | 42 | psnr_results_of_a_video = [] 43 | for clip_timestamp in range(len(video1)): 44 | # get a img 45 | # img [timestamps[x], channel, h, w] 46 | # img [channel, h, w] numpy 47 | 48 | img1 = video1[clip_timestamp].numpy() 49 | img2 = video2[clip_timestamp].numpy() 50 | 51 | # calculate psnr of a video 52 | psnr_results_of_a_video.append(img_psnr(img1, img2)) 53 | 54 | psnr_results.append(psnr_results_of_a_video) 55 | 56 | psnr_results = np.array(psnr_results) # [batch_size, num_frames] 57 | psnr = {} 58 | psnr_std = {} 59 | 60 | for clip_timestamp in range(len(video1)): 61 | psnr[clip_timestamp] = np.mean(psnr_results[:, clip_timestamp]) 62 | psnr_std[clip_timestamp] = np.std(psnr_results[:, clip_timestamp]) 63 | 64 | result = { 65 | "value": psnr, 66 | "value_std": psnr_std, 67 | "video_setting": video1.shape, 68 | "video_setting_name": "time, channel, heigth, width", 69 | } 70 | 71 | return result 72 | 73 | 74 | # test code / using example 75 | 76 | 77 | def main(): 78 | NUMBER_OF_VIDEOS = 8 79 | VIDEO_LENGTH = 50 80 | CHANNEL = 3 81 | SIZE = 64 82 | videos1 = torch.zeros(NUMBER_OF_VIDEOS, VIDEO_LENGTH, CHANNEL, SIZE, SIZE, requires_grad=False) 83 | videos2 = torch.zeros(NUMBER_OF_VIDEOS, VIDEO_LENGTH, CHANNEL, SIZE, SIZE, requires_grad=False) 84 | 85 | import json 86 | 87 | result = calculate_psnr(videos1, videos2) 88 | print(json.dumps(result, indent=4)) 89 | 90 | 91 | if __name__ == "__main__": 92 | main() 93 | -------------------------------------------------------------------------------- /eval/vae/script/eval.sh: -------------------------------------------------------------------------------- 1 | python eval/eval_common_metric.py \ 2 | --batch_size 2 \ 
3 | --real_video_dir ../test_eval/release/origin \ 4 | --generated_video_dir ../test_eval/release \ 5 | --device cuda \ 6 | --sample_fps 10 \ 7 | --crop_size 256 \ 8 | --resolution 256 \ 9 | --num_frames 17 \ 10 | --sample_rate 1 \ 11 | --subset_size 100 \ 12 | --metric ssim psnr lpips flolpips 13 | -------------------------------------------------------------------------------- /eval/vbench/calc_vbench.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import time 4 | 5 | import torch 6 | 7 | from vbench import VBench 8 | 9 | full_info_path = "eval/vbench/VBench_full_info.json" 10 | dimensions = [ 11 | # a: 10min 12 | "subject_consistency", # 4min 13 | "imaging_quality", # 6min 14 | # b: 12min 15 | "background_consistency", # 2min 16 | "motion_smoothness", # 5min 17 | "overall_consistency", # 2min 18 | "human_action", # 3min 19 | # c: 14min 20 | "multiple_objects", # 14min 21 | # d: 14min 22 | "spatial_relationship", # 14min 23 | # e: 12min 24 | "object_class", # 12min 25 | # f: 12min 26 | "color", # 12min 27 | # g: 10.5min 28 | "aesthetic_quality", # 2.5min 29 | "appearance_style", # 6min 30 | "temporal_flickering", # 2min 31 | # h: 9min 32 | "scene", # 3min 33 | "temporal_style", # 2min 34 | "dynamic_degree", # 4min 35 | ] 36 | 37 | 38 | def parse_args(): 39 | parser = argparse.ArgumentParser() 40 | parser.add_argument("video_folder", type=str) # samples/samples..._vbench/eval 41 | parser.add_argument("model_ckpt", type=str) 42 | parser.add_argument("--start", type=int, default=0) # start index of dimension to be evaluated 43 | parser.add_argument("--end", type=int, default=-1) # start index of dimension to be evaluated 44 | 45 | args = parser.parse_args() 46 | return args 47 | 48 | 49 | if __name__ == "__main__": 50 | args = parse_args() 51 | output_dir = os.path.join(args.model_ckpt, "vbench") 52 | os.makedirs(output_dir, exist_ok=True) 53 | video_path = args.video_folder 54 | 55 | kwargs = {} 56 | kwargs["imaging_quality_preprocessing_mode"] = "longer" # use VBench/evaluate.py default 57 | 58 | start_time = time.time() 59 | 60 | # NOTE: important to use torch.device("cuda"), else will have issue with object_class third_party module 61 | my_VBench = VBench(torch.device("cuda"), full_info_path, output_dir) 62 | if args.end == -1: # adjust end accordingly 63 | args.end = len(dimensions) 64 | for dim in dimensions[args.start : args.end]: 65 | my_VBench.evaluate( 66 | videos_path=video_path, 67 | name=dim, 68 | local=False, 69 | read_frame=False, 70 | dimension_list=[dim], 71 | mode="vbench_standard", 72 | **kwargs, 73 | ) 74 | 75 | print("Runtime: %s seconds " % (time.time() - start_time)) 76 | -------------------------------------------------------------------------------- /eval/vbench/launch.sh: -------------------------------------------------------------------------------- 1 | # !/bin/bash 2 | 3 | CKPT=$1 4 | NUM_FRAMES=$2 5 | MODEL_NAME=$3 6 | RES=$4 7 | ASP_RATIO=$5 8 | 9 | NUM_SAMPLING_STEPS=$6 10 | FLOW=$7 11 | LLM_REFINE=$8 12 | 13 | if [[ $CKPT == *"ema"* ]]; then 14 | parentdir=$(dirname $CKPT) 15 | CKPT_BASE=$(basename $parentdir)_ema 16 | else 17 | CKPT_BASE=$(basename $CKPT) 18 | fi 19 | LOG_BASE=$(dirname $CKPT)/eval 20 | echo "Logging to $LOG_BASE" 21 | 22 | GPUS=(0 1 2 3 4 5 6 7) 23 | TASK_ID_LIST=(4a 4b 4c 4d 4e 4f 4g 4h) # for log records only 24 | START_INDEX_LIST=(0 120 240 360 480 600 720 840) 25 | END_INDEX_LIST=(120 240 360 480 600 720 840 2000) 26 | 27 | ## Modify the following to run on 
multiple machines for faster results 28 | ## 720p will take quite long on a single machine 29 | # START_INDEX_LIST=(60 180 300 420 540 660 780 900) 30 | # END_INDEX_LIST=(120 240 360 480 600 720 840 2000) 31 | # LOG_BASE=$(dirname $CKPT)/eval/last_60 32 | # mkdir -p ${LOG_BASE} 33 | # echo "Logging to $LOG_BASE" 34 | 35 | 36 | 37 | for i in "${!GPUS[@]}"; do 38 | if [ -z ${RES} ] || [ -z ${ASP_RATIO} ] ; 39 | then 40 | CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -4 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} > ${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 & 41 | else 42 | if [ -z ${NUM_SAMPLING_STEPS} ]; 43 | then 44 | CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -4 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO} > ${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 & 45 | else 46 | if [ -z ${FLOW} ]; 47 | then 48 | CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -4 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO} ${NUM_SAMPLING_STEPS} > ${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 & 49 | else 50 | if [ -z ${LLM_REFINE} ]; 51 | then 52 | CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -4 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO} ${NUM_SAMPLING_STEPS} ${FLOW} > ${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 & 53 | else 54 | CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -4 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO} ${NUM_SAMPLING_STEPS} ${FLOW} ${LLM_REFINE} > ${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 & 55 | fi 56 | fi 57 | fi 58 | fi 59 | done 60 | -------------------------------------------------------------------------------- /eval/vbench/launch_calc.sh: -------------------------------------------------------------------------------- 1 | # !/bin/bash 2 | 3 | VIDEO_DIR=$1 4 | CKPT_DIR=$2 5 | LOG_BASE=$CKPT_DIR 6 | mkdir -p $LOG_BASE 7 | echo "Logging to $LOG_BASE" 8 | 9 | GPUS=(0 1 2 3 4 5 6 7) 10 | START_INDEX_LIST=(0 2 6 7 8 9 10 13) 11 | END_INDEX_LIST=(2 6 7 8 9 10 13 16) 12 | TASK_ID_LIST=(calc_vbench_a calc_vbench_b calc_vbench_c calc_vbench_d calc_vbench_e calc_vbench_f calc_vbench_g calc_vbench_h) # for log records only 13 | 14 | for i in "${!GPUS[@]}"; do 15 | CUDA_VISIBLE_DEVICES=${GPUS[i]} python eval/vbench/calc_vbench.py $VIDEO_DIR $CKPT_DIR --start ${START_INDEX_LIST[i]} --end ${END_INDEX_LIST[i]} > ${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 & 16 | done 17 | -------------------------------------------------------------------------------- /eval/vbench_i2v/json_to_txt.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | RESOLUTIONS = ["1-1", "16-9", "7-4", "8-5"] 5 | 6 | cache_root = "/mnt/jfs-hdd/sora/data/vbench-i2v/crop" 7 | resolution = RESOLUTIONS[0] 8 | json_file = "vbench2_i2v_full_info.json" 9 | save_path = "all_i2v.txt" 10 | 11 | data = json.load(open(json_file)) 12 | txt = [ 13 | f'{x["prompt_en"]}{{"reference_path": "{os.path.join(cache_root, resolution, x["image_name"])}", "mask_strategy": "0"}}' 14 | for x in data 15 | ] 16 | with open(save_path, "w") as f: 17 | f.write("\n".join(txt)) 18 | -------------------------------------------------------------------------------- /eval/vbench_i2v/launch.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CKPT=$1 4 | NUM_FRAMES=$2 5 | MODEL_NAME=$3 6 | RES=$4 7 | 
ASP_RATIO=$5 8 | 9 | NUM_SAMPLING_STEPS=$6 10 | FLOW=$7 11 | LLM_REFINE=$8 12 | 13 | if [[ $CKPT == *"ema"* ]]; then 14 | parentdir=$(dirname $CKPT) 15 | CKPT_BASE=$(basename $parentdir)_ema 16 | else 17 | CKPT_BASE=$(basename $CKPT) 18 | fi 19 | LOG_BASE=$(dirname $CKPT)/eval 20 | echo "Logging to $LOG_BASE" 21 | 22 | GPUS=(0 1 2 3 4 5 6 7) 23 | TASK_ID_LIST=(5a 5b 5c 5d 5e 5f 5g 5h) # for log records only 24 | START_INDEX_LIST=(0 140 280 420 560 700 840 980) 25 | END_INDEX_LIST=(140 280 420 560 700 840 980 2000) 26 | 27 | 28 | for i in "${!GPUS[@]}"; do 29 | if [ -z ${RES} ] || [ -z ${ASP_RATIO} ] ; 30 | then 31 | CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT $NUM_FRAMES $MODEL_NAME -5 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} > ${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 & 32 | else 33 | if [ -z ${NUM_SAMPLING_STEPS} ]; 34 | then 35 | CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -5 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO} > ${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 & 36 | else 37 | if [ -z ${FLOW} ]; 38 | then 39 | CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -5 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO} ${NUM_SAMPLING_STEPS} > ${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 & 40 | else 41 | if [ -z ${LLM_REFINE} ]; 42 | then 43 | CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -5 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO} ${NUM_SAMPLING_STEPS} ${FLOW} > ${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 & 44 | else 45 | CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -5 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO} ${NUM_SAMPLING_STEPS} ${FLOW} ${LLM_REFINE} > ${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 & 46 | fi 47 | fi 48 | fi 49 | fi 50 | done 51 | -------------------------------------------------------------------------------- /eval/vbench_i2v/launch_calc.sh: -------------------------------------------------------------------------------- 1 | # !/bin/bash 2 | 3 | VIDEO_DIR=$1 4 | CKPT_DIR=$2 5 | LOG_BASE=$CKPT_DIR 6 | mkdir -p $LOG_BASE 7 | echo "Logging to $LOG_BASE" 8 | 9 | GPUS=(0 1 2 3 4 5 6 7) 10 | CALC_I2V_LIST=(True True False False False False False False) 11 | CALC_QUALITY_LIST=(False False True True True True True True) 12 | START_INDEX_LIST=(0 2 0 2 3 4 5 6) 13 | END_INDEX_LIST=(2 -1 2 3 4 5 6 -1) 14 | TASK_ID_LIST=(calc_vbench_i2v_a calc_vbench_i2v_b calc_vbench_i2v_c calc_vbench_i2v_d calc_vbench_i2v_e calc_vbench_i2v_f calc_vbench_i2v_g calc_vbench_i2v_h) # for log records only 15 | 16 | 17 | for i in "${!GPUS[@]}"; do 18 | CUDA_VISIBLE_DEVICES=${GPUS[i]} python eval/vbench_i2v/calc_vbench_i2v.py $VIDEO_DIR $CKPT_DIR --calc_i2v ${CALC_I2V_LIST[i]} --calc_quality ${CALC_QUALITY_LIST[i]} --start ${START_INDEX_LIST[i]} --end ${END_INDEX_LIST[i]} > ${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 & 19 | done 20 | -------------------------------------------------------------------------------- /gradio/requirements.txt: -------------------------------------------------------------------------------- 1 | xformers 2 | transformers 3 | git+https://github.com/hpcaitech/Open-Sora.git 4 | -------------------------------------------------------------------------------- /kill_process.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the input file is provided 4 | if [ $# -ne 1 ]; then 5 | echo 
"Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | 10 | input_file="$1" 11 | 12 | # Check if the input file exists 13 | if [ ! -f "$input_file" ]; then 14 | echo "Error: Input file '$input_file' does not exist." 15 | exit 1 16 | fi 17 | 18 | while IFS= read -r hostname || [ -n "$hostname" ]; do 19 | if [ -n "$hostname" ]; then 20 | echo "Sending 'sudo pkill -f python.*train\.py' to $hostname" 21 | ssh "$hostname" "sudo pkill -f python.*train\.py" & 22 | fi 23 | done < "$input_file" 24 | -------------------------------------------------------------------------------- /nvtop_all.py: -------------------------------------------------------------------------------- 1 | import concurrent.futures 2 | import subprocess 3 | import pandas as pd 4 | import sys 5 | 6 | def get_gpu_info(node_name): 7 | try: 8 | result_processes = subprocess.run( 9 | ["ssh", node_name, "nvidia-smi --query-compute-apps=pid --format=csv,noheader | wc -l"], 10 | capture_output=True, 11 | text=True, 12 | check=True 13 | ) 14 | num_processes = int(result_processes.stdout.strip()) 15 | 16 | result_power = subprocess.run( 17 | ["ssh", node_name, "nvidia-smi --query-gpu=power.draw --format=csv,noheader,nounits"], 18 | capture_output=True, 19 | text=True, 20 | check=True 21 | ) 22 | power_draws = [float(p.strip()) for p in result_power.stdout.splitlines()] 23 | mean_power = sum(power_draws) / len(power_draws) if power_draws else 0.0 24 | 25 | return node_name, num_processes, mean_power 26 | except subprocess.CalledProcessError as e: 27 | return node_name, "Failed", "Failed" 28 | 29 | def main(hostfile): 30 | with open(hostfile, 'r') as file: 31 | nodes = [line.strip() for line in file if line.strip()] 32 | 33 | with concurrent.futures.ThreadPoolExecutor() as executor: 34 | futures = [executor.submit(get_gpu_info, node) for node in nodes] 35 | results = [future.result() for future in concurrent.futures.as_completed(futures)] 36 | 37 | df = pd.DataFrame(results, columns=["Node", "GPU Processes", "Mean Power Consumption (W)"]) 38 | 39 | # Calculate mean values for GPU Processes and Mean Power Consumption 40 | mean_gpu_processes = df["GPU Processes"].replace("Failed", float('nan')).astype(float).mean() 41 | mean_power_consumption = df["Mean Power Consumption (W)"].replace("Failed", float('nan')).astype(float).mean() 42 | #df.loc["Mean"] = ["", mean_gpu_processes, mean_power_consumption] 43 | 44 | # Set pandas options to display the entire DataFrame 45 | pd.set_option('display.max_rows', None) 46 | pd.set_option('display.max_columns', None) 47 | pd.set_option('display.width', None) 48 | pd.set_option('display.max_colwidth', None) 49 | 50 | print(df) 51 | 52 | if __name__ == "__main__": 53 | if len(sys.argv) != 2: 54 | print("Usage: python script.py ") 55 | sys.exit(1) 56 | 57 | hostfile = sys.argv[1] 58 | main(hostfile) 59 | 60 | -------------------------------------------------------------------------------- /opensora/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/opensora/__init__.py -------------------------------------------------------------------------------- /opensora/acceleration/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/opensora/acceleration/__init__.py -------------------------------------------------------------------------------- 
/opensora/acceleration/checkpoint.py: -------------------------------------------------------------------------------- 1 | from collections.abc import Iterable 2 | 3 | import torch.nn as nn 4 | from torch.utils.checkpoint import checkpoint, checkpoint_sequential 5 | 6 | 7 | def set_grad_checkpoint(model, use_fp32_attention=False, gc_step=1): 8 | assert isinstance(model, nn.Module) 9 | 10 | def set_attr(module): 11 | module.grad_checkpointing = True 12 | module.fp32_attention = use_fp32_attention 13 | module.grad_checkpointing_step = gc_step 14 | 15 | model.apply(set_attr) 16 | 17 | 18 | def auto_grad_checkpoint(module, *args, **kwargs): 19 | if getattr(module, "grad_checkpointing", False): 20 | if not isinstance(module, Iterable): 21 | return checkpoint(module, *args, use_reentrant=False, **kwargs) 22 | gc_step = module[0].grad_checkpointing_step 23 | return checkpoint_sequential(module, gc_step, *args, use_reentrant=False, **kwargs) 24 | return module(*args, **kwargs) 25 | -------------------------------------------------------------------------------- /opensora/acceleration/parallel_states.py: -------------------------------------------------------------------------------- 1 | import torch.distributed as dist 2 | 3 | _GLOBAL_PARALLEL_GROUPS = dict() 4 | 5 | 6 | def set_data_parallel_group(group: dist.ProcessGroup): 7 | _GLOBAL_PARALLEL_GROUPS["data"] = group 8 | 9 | 10 | def get_data_parallel_group(): 11 | return _GLOBAL_PARALLEL_GROUPS.get("data", dist.group.WORLD) 12 | 13 | 14 | def set_sequence_parallel_group(group: dist.ProcessGroup): 15 | _GLOBAL_PARALLEL_GROUPS["sequence"] = group 16 | 17 | 18 | def get_sequence_parallel_group(): 19 | return _GLOBAL_PARALLEL_GROUPS.get("sequence", None) 20 | -------------------------------------------------------------------------------- /opensora/acceleration/shardformer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/opensora/acceleration/shardformer/__init__.py -------------------------------------------------------------------------------- /opensora/acceleration/shardformer/modeling/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/opensora/acceleration/shardformer/modeling/__init__.py -------------------------------------------------------------------------------- /opensora/acceleration/shardformer/modeling/t5.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class T5LayerNorm(nn.Module): 6 | def __init__(self, hidden_size, eps=1e-6): 7 | """ 8 | Construct a layernorm module in the T5 style. No bias and no subtraction of mean. 9 | """ 10 | super().__init__() 11 | self.weight = nn.Parameter(torch.ones(hidden_size)) 12 | self.variance_epsilon = eps 13 | 14 | def forward(self, hidden_states): 15 | # T5 uses a layer_norm which only scales and doesn't shift, which is also known as Root Mean 16 | # Square Layer Normalization https://arxiv.org/abs/1910.07467 thus varience is calculated 17 | # w/o mean and there is no bias. 
Additionally we want to make sure that the accumulation for 18 | # half-precision inputs is done in fp32 19 | 20 | variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True) 21 | hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) 22 | 23 | # convert into half-precision if necessary 24 | if self.weight.dtype in [torch.float16, torch.bfloat16]: 25 | hidden_states = hidden_states.to(self.weight.dtype) 26 | 27 | return self.weight * hidden_states 28 | 29 | @staticmethod 30 | def from_native_module(module, *args, **kwargs): 31 | assert module.__class__.__name__ == "FusedRMSNorm", ( 32 | "Recovering T5LayerNorm requires the original layer to be apex's Fused RMS Norm." 33 | "Apex's fused norm is automatically used by Hugging Face Transformers https://github.com/huggingface/transformers/blob/main/src/transformers/models/t5/modeling_t5.py#L265C5-L265C48" 34 | ) 35 | 36 | layer_norm = T5LayerNorm(module.normalized_shape, eps=module.eps) 37 | layer_norm.weight.data.copy_(module.weight.data) 38 | layer_norm = layer_norm.to(module.weight.device) 39 | return layer_norm 40 | -------------------------------------------------------------------------------- /opensora/acceleration/shardformer/policy/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/opensora/acceleration/shardformer/policy/__init__.py -------------------------------------------------------------------------------- /opensora/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from .datasets import IMG_FPS, BatchFeatureDataset, VariableVideoTextDataset, VideoTextDataset 2 | from .utils import get_transforms_image, get_transforms_video, is_img, is_vid, save_sample 3 | -------------------------------------------------------------------------------- /opensora/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .dit import * 2 | from .latte import * 3 | from .pixart import * 4 | from .stdit import * 5 | from .text_encoder import * 6 | from .vae import * 7 | -------------------------------------------------------------------------------- /opensora/models/dit/__init__.py: -------------------------------------------------------------------------------- 1 | from .dit import DiT, DiT_XL_2, DiT_XL_2x2 2 | -------------------------------------------------------------------------------- /opensora/models/latte/__init__.py: -------------------------------------------------------------------------------- 1 | from .latte import Latte, Latte_XL_2, Latte_XL_2x2 2 | -------------------------------------------------------------------------------- /opensora/models/layers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/opensora/models/layers/__init__.py -------------------------------------------------------------------------------- /opensora/models/pixart/__init__.py: -------------------------------------------------------------------------------- 1 | from .pixart import PixArt, PixArt_1B_2, PixArt_XL_2 2 | from .pixart_sigma import PixArt_Sigma_XL_2 3 | -------------------------------------------------------------------------------- /opensora/models/stdit/__init__.py: 
-------------------------------------------------------------------------------- 1 | from .stdit import STDiT 2 | from .stdit2 import STDiT2 3 | from .stdit3 import STDiT3 4 | -------------------------------------------------------------------------------- /opensora/models/text_encoder/__init__.py: -------------------------------------------------------------------------------- 1 | from .classes import ClassEncoder 2 | from .clip import ClipEncoder 3 | from .t5 import T5Encoder 4 | -------------------------------------------------------------------------------- /opensora/models/text_encoder/classes.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from opensora.registry import MODELS 4 | 5 | 6 | @MODELS.register_module("classes") 7 | class ClassEncoder: 8 | def __init__(self, num_classes, model_max_length=None, device="cuda", dtype=torch.float): 9 | self.num_classes = num_classes 10 | self.y_embedder = None 11 | 12 | self.model_max_length = model_max_length 13 | self.output_dim = None 14 | self.device = device 15 | 16 | def encode(self, text): 17 | return dict(y=torch.tensor([int(t) for t in text]).to(self.device)) 18 | 19 | def null(self, n): 20 | return torch.tensor([self.num_classes] * n).to(self.device) 21 | -------------------------------------------------------------------------------- /opensora/models/vae/__init__.py: -------------------------------------------------------------------------------- 1 | from .discriminator import DISCRIMINATOR_3D 2 | from .vae import VideoAutoencoderKL, VideoAutoencoderKLTemporalDecoder 3 | from .vae_temporal import VAE_Temporal 4 | -------------------------------------------------------------------------------- /opensora/models/vae/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | """Stripped version of https://github.com/richzhang/PerceptualSimilarity/tree/master/models""" 5 | 6 | 7 | class DiagonalGaussianDistribution(object): 8 | def __init__( 9 | self, 10 | parameters, 11 | deterministic=False, 12 | ): 13 | self.parameters = parameters 14 | self.mean, self.logvar = torch.chunk(parameters, 2, dim=1) 15 | self.logvar = torch.clamp(self.logvar, -30.0, 20.0) 16 | self.deterministic = deterministic 17 | self.std = torch.exp(0.5 * self.logvar) 18 | self.var = torch.exp(self.logvar) 19 | if self.deterministic: 20 | self.var = self.std = torch.zeros_like(self.mean).to(device=self.parameters.device, dtype=self.mean.dtype) 21 | 22 | def sample(self): 23 | # torch.randn: standard normal distribution 24 | x = self.mean + self.std * torch.randn(self.mean.shape).to(device=self.parameters.device, dtype=self.mean.dtype) 25 | return x 26 | 27 | def kl(self, other=None): 28 | if self.deterministic: 29 | return torch.Tensor([0.0]) 30 | else: 31 | if other is None: # SCH: assumes other is a standard normal distribution 32 | return 0.5 * torch.sum(torch.pow(self.mean, 2) + self.var - 1.0 - self.logvar, dim=[1, 2, 3, 4]) 33 | else: 34 | return 0.5 * torch.sum( 35 | torch.pow(self.mean - other.mean, 2) / other.var 36 | + self.var / other.var 37 | - 1.0 38 | - self.logvar 39 | + other.logvar, 40 | dim=[1, 2, 3, 4], 41 | ) 42 | 43 | def nll(self, sample, dims=[1, 2, 3, 4]): 44 | if self.deterministic: 45 | return torch.Tensor([0.0]) 46 | logtwopi = np.log(2.0 * np.pi) 47 | return 0.5 * torch.sum(logtwopi + self.logvar + torch.pow(sample - self.mean, 2) / self.var, dim=dims) 48 | 49 | def mode(self): 50 | return self.mean 51 
| -------------------------------------------------------------------------------- /opensora/registry.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | 3 | import torch.nn as nn 4 | from mmengine.registry import Registry 5 | 6 | 7 | def build_module(module, builder, **kwargs): 8 | """Build module from config or return the module itself. 9 | 10 | Args: 11 | module (Union[dict, nn.Module]): The module to build. 12 | builder (Registry): The registry to build module. 13 | *args, **kwargs: Arguments passed to build function. 14 | 15 | Returns: 16 | Any: The built module. 17 | """ 18 | if module is None: 19 | return None 20 | if isinstance(module, dict): 21 | cfg = deepcopy(module) 22 | for k, v in kwargs.items(): 23 | cfg[k] = v 24 | return builder.build(cfg) 25 | elif isinstance(module, nn.Module): 26 | return module 27 | elif module is None: 28 | return None 29 | else: 30 | raise TypeError(f"Only support dict and nn.Module, but got {type(module)}.") 31 | 32 | 33 | MODELS = Registry( 34 | "model", 35 | locations=["opensora.models"], 36 | ) 37 | 38 | SCHEDULERS = Registry( 39 | "scheduler", 40 | locations=["opensora.schedulers"], 41 | ) 42 | 43 | DATASETS = Registry( 44 | "dataset", 45 | locations=["opensora.datasets"], 46 | ) 47 | -------------------------------------------------------------------------------- /opensora/schedulers/__init__.py: -------------------------------------------------------------------------------- 1 | from .dpms import DPMS 2 | from .iddpm import IDDPM 3 | from .rf import RFLOW 4 | -------------------------------------------------------------------------------- /opensora/schedulers/dpms/__init__.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | 3 | import torch 4 | 5 | from opensora.registry import SCHEDULERS 6 | 7 | from .dpm_solver import DPMS 8 | 9 | 10 | @SCHEDULERS.register_module("dpm-solver") 11 | class DPM_SOLVER: 12 | def __init__(self, num_sampling_steps=None, cfg_scale=4.0): 13 | self.num_sampling_steps = num_sampling_steps 14 | self.cfg_scale = cfg_scale 15 | 16 | def sample( 17 | self, 18 | model, 19 | text_encoder, 20 | z, 21 | prompts, 22 | device, 23 | additional_args=None, 24 | mask=None, 25 | progress=True, 26 | ): 27 | if mask is not None: 28 | print("[WARNING] mask is not supported in dpm-solver, it will be ignored") 29 | n = len(prompts) 30 | model_args = text_encoder.encode(prompts) 31 | y = model_args.pop("y") 32 | null_y = text_encoder.null(n) 33 | if additional_args is not None: 34 | model_args.update(additional_args) 35 | 36 | dpms = DPMS( 37 | partial(forward_with_dpmsolver, model), 38 | condition=y, 39 | uncondition=null_y, 40 | cfg_scale=self.cfg_scale, 41 | model_kwargs=model_args, 42 | ) 43 | samples = dpms.sample( 44 | z, 45 | steps=self.num_sampling_steps, 46 | order=2, 47 | skip_type="time_uniform", 48 | method="multistep", 49 | progress=progress, 50 | ) 51 | return samples 52 | 53 | 54 | def forward_with_dpmsolver(self, x, timestep, y, **kwargs): 55 | """ 56 | dpm solver donnot need variance prediction 57 | """ 58 | # https://github.com/openai/glide-text2im/blob/main/notebooks/text2im.ipynb 59 | model_out = self.forward(x, timestep, y, **kwargs) 60 | return model_out.chunk(2, dim=1)[0] 61 | -------------------------------------------------------------------------------- /opensora/utils/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/opensora/utils/__init__.py -------------------------------------------------------------------------------- /requirements/requirements-cu121.txt: -------------------------------------------------------------------------------- 1 | torch==2.2.2 --index-url https://download.pytorch.org/whl/cu121 2 | torchvision==0.17.2 --index-url https://download.pytorch.org/whl/cu121 3 | xformers==0.0.25.post1 --index-url https://download.pytorch.org/whl/cu121 4 | -------------------------------------------------------------------------------- /requirements/requirements-data.txt: -------------------------------------------------------------------------------- 1 | gdown>=5.2.0 2 | 3 | # [caption llava] 4 | ninja>=1.11.1.1 5 | shortuuid>=1.0.13 6 | markdown2[all] 7 | scikit-learn>=1.4.2 8 | einops-exts>=0.0.4 9 | 10 | # [camera_motion] 11 | decord==0.6.0 12 | ptvsd==4.3.2 13 | imageio-ffmpeg>=0.4.9 14 | 15 | # [datasets] 16 | ffmpeg-python==0.2.0 17 | lingua-language-detector==2.0.2 18 | 19 | # [frame interpolation] 20 | imageio>=2.34.1 21 | 22 | # [aesthetic] 23 | setuptools==68.2.2 24 | clip @ git+https://github.com/openai/CLIP.git 25 | 26 | # [ocr] 27 | mmcv==2.1.0 28 | mmdet==3.1.0 29 | mmocr==1.0.1 30 | detectron2 @ git+https://github.com/facebookresearch/detectron2.git@ff53992 31 | -------------------------------------------------------------------------------- /requirements/requirements-eval.txt: -------------------------------------------------------------------------------- 1 | # [vbench] 2 | detectron2 @ git+https://github.com/facebookresearch/detectron2.git@ff53992 3 | imageio>=2.34.1 4 | pyiqa==0.1.10 5 | scikit-learn>=1.4.2 6 | scikit-image>=0.20.0 7 | lvis==0.5.3 8 | boto3>=1.34.113 9 | easydict>=1.9 10 | fairscale>=0.4.13 11 | 12 | # [vae] 13 | decord==0.6.0 14 | pytorchvideo==0.1.5 15 | lpips==0.1.4 16 | -------------------------------------------------------------------------------- /requirements/requirements-vae.txt: -------------------------------------------------------------------------------- 1 | beartype==0.18.5 2 | einops==0.8.0 3 | einops-exts==0.0.4 4 | opencv-python==4.9.0.80 5 | pillow==10.3.0 6 | -------------------------------------------------------------------------------- /requirements/requirements.txt: -------------------------------------------------------------------------------- 1 | colossalai>=0.4.0 2 | mmengine>=0.10.3 3 | pandas>=2.0.3 4 | timm==0.9.16 5 | rotary_embedding_torch==0.5.3 6 | ftfy>=6.2.0 # for t5 7 | diffusers==0.27.2 # for vae 8 | accelerate==0.29.2 # for t5 9 | av>=12.0.0 # for video loading 10 | numpy<2.0.0 11 | 12 | # [gradio] 13 | gradio>=4.26.0 14 | spaces>=0.28.3 15 | 16 | # [notebook] 17 | ipykernel>=6.29.4 18 | ipywidgets>=8.1.2 19 | 20 | # [training] 21 | wandb>=0.17.0 22 | tensorboard>=2.14.0 23 | pandarallel>=1.6.5 24 | pyarrow>=16.1.0 # for parquet 25 | 26 | # [dev] 27 | pre-commit>=3.5.0 28 | openai 29 | -------------------------------------------------------------------------------- /scripts/clear_cache.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the input file is provided 4 | if [ $# -ne 1 ]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | 10 | input_file="$1" 11 | 12 | # Check if the input file exists 13 | if [ ! -f "$input_file" ]; then 14 | echo "Error: Input file '$input_file' does not exist." 
15 | exit 1 16 | fi 17 | 18 | while IFS= read -r hostname || [ -n "$hostname" ]; do 19 | if [ -n "$hostname" ]; then 20 | ssh "$hostname" "rm -rf /home/ubuntu/.cache/colossalai/" & 21 | fi 22 | done < "$input_file" 23 | 24 | -------------------------------------------------------------------------------- /scripts/misc/launch_extract_feat.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -x 4 | set -e 5 | 6 | START_SPLIT=0 7 | NUM_SPLIT=10 8 | 9 | DATA_PATH=$1 10 | SAVE_PATH=$2 11 | DATA_ARG="--data-path $DATA_PATH" 12 | SAVE_ARG="--save-dir $SAVE_PATH" 13 | 14 | CMD="torchrun --standalone --nproc_per_node 1 scripts/misc/extract_feat.py configs/opensora-v1-2/misc/extract.py $DATA_ARG $SAVE_ARG" 15 | declare -a GPUS=(0 1 2 3 4 5 6 7) 16 | 17 | mkdir -p logs/extract_feat 18 | 19 | for i in "${GPUS[@]}"; do 20 | CUDA_VISIBLE_DEVICES=$i $CMD --start-index $(($START_SPLIT + i * $NUM_SPLIT)) --end-index $(($START_SPLIT + (i + 1) * $NUM_SPLIT)) >logs/extract_feat/$i.log 2>&1 & 21 | done 22 | -------------------------------------------------------------------------------- /scripts/misc/launch_search_bs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -x 4 | set -e 5 | 6 | CMD="torchrun --standalone --nproc_per_node 1 scripts/misc/search_bs.py configs/opensora-v1-2/misc/bs.py" 7 | DATA_PATH="/mnt/nfs-207/sora_data/meta/searchbs.csv" 8 | 9 | LOG_BASE=logs/search_bs 10 | mkdir -p logs/search_bs 11 | echo "Logging to $LOG_BASE" 12 | 13 | CUDA_VISIBLE_DEVICES=0 $CMD --data-path $DATA_PATH --resolution 144p >${LOG_BASE}/144p.log 2>&1 & 14 | CUDA_VISIBLE_DEVICES=1 $CMD --data-path $DATA_PATH --resolution 240p >${LOG_BASE}/240p.log 2>&1 & 15 | CUDA_VISIBLE_DEVICES=2 $CMD --data-path $DATA_PATH --resolution 512 >${LOG_BASE}/512.log 2>&1 & 16 | CUDA_VISIBLE_DEVICES=3 $CMD --data-path $DATA_PATH --resolution 480p >${LOG_BASE}/480p.log 2>&1 & 17 | CUDA_VISIBLE_DEVICES=4 $CMD --data-path $DATA_PATH --resolution 1024 >${LOG_BASE}/1024.log 2>&1 & 18 | CUDA_VISIBLE_DEVICES=5 $CMD --data-path $DATA_PATH --resolution 1080p >${LOG_BASE}/1080p.log 2>&1 & 19 | CUDA_VISIBLE_DEVICES=6 $CMD --data-path $DATA_PATH --resolution 2048 >${LOG_BASE}/2048.log 2>&1 & 20 | -------------------------------------------------------------------------------- /tests/test_attn.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from colossalai.accelerator import get_accelerator 3 | from colossalai.utils import get_current_device 4 | from rotary_embedding_torch import RotaryEmbedding 5 | 6 | from opensora.models.layers.blocks import Attention 7 | 8 | # B, S, H = 7488, 1, 1152 9 | # B, S, H = 32, 234, 1152 10 | B, S, H = 128, 32, 1152 11 | N, D = 16, 72 12 | 13 | 14 | def run_attn(enable_flash_attn: bool): 15 | get_accelerator().reset_peak_memory_stats() 16 | rope = RotaryEmbedding(D).to(device=get_current_device(), dtype=torch.bfloat16) 17 | attn = Attention( 18 | H, 19 | N, 20 | qkv_bias=True, 21 | rope=rope.rotate_queries_or_keys, 22 | enable_flash_attn=enable_flash_attn, 23 | ).to(device=get_current_device(), dtype=torch.bfloat16) 24 | x = torch.randn(B, S, H, device=get_current_device(), dtype=torch.bfloat16).requires_grad_() 25 | y = attn(x) 26 | y.mean().backward() 27 | print(f"Peak memory: {get_accelerator().max_memory_allocated() / 1024**2:.2f} MB") 28 | 29 | 30 | if __name__ == "__main__": 31 | print("Use flashattn") 32 | run_attn(True) 33 | print("No 
flashattn") 34 | run_attn(False) 35 | -------------------------------------------------------------------------------- /tests/test_lr_scheduler.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.optim import Adam 3 | from torchvision.models import resnet50 4 | from tqdm import tqdm 5 | 6 | from opensora.utils.lr_scheduler import LinearWarmupLR 7 | 8 | 9 | def test_lr_scheduler(): 10 | warmup_steps = 200 11 | model = resnet50().cuda() 12 | optimizer = Adam(model.parameters(), lr=0.01) 13 | scheduler = LinearWarmupLR(optimizer, warmup_steps=warmup_steps) 14 | current_lr = scheduler.get_lr()[0] 15 | data = torch.rand(1, 3, 224, 224).cuda() 16 | 17 | for i in tqdm(range(warmup_steps * 2)): 18 | out = model(data) 19 | out.mean().backward() 20 | optimizer.step() 21 | scheduler.step() 22 | 23 | if i >= warmup_steps: 24 | assert scheduler.get_lr()[0] == 0.01 25 | else: 26 | assert scheduler.get_lr()[0] > current_lr, f"{scheduler.get_lr()[0]} <= {current_lr}" 27 | current_lr = scheduler.get_lr()[0] 28 | 29 | 30 | if __name__ == "__main__": 31 | test_lr_scheduler() 32 | -------------------------------------------------------------------------------- /tests/test_pos_emb.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | 4 | from opensora.models.layers.blocks import PositionEmbedding2D, get_2d_sincos_pos_embed 5 | 6 | D = 8 7 | SCALE = 2.0 8 | from torch.testing import assert_close 9 | 10 | 11 | def get_spatial_pos_embed(x, hidden_size, h, w, scale, base_size=None): 12 | pos_embed = get_2d_sincos_pos_embed( 13 | hidden_size, 14 | (h, w), 15 | scale=scale, 16 | base_size=base_size, 17 | ) 18 | pos_embed = torch.from_numpy(pos_embed).float().unsqueeze(0).requires_grad_(False) 19 | return pos_embed.to(device=x.device, dtype=x.dtype) 20 | 21 | 22 | @pytest.mark.parametrize("dtype", [torch.float, torch.float16]) 23 | @pytest.mark.parametrize("device", ["cpu", "cuda"]) 24 | def test_pos_emb(dtype, device): 25 | # just a placeholder to get the device and dtype 26 | x = torch.empty(1, dtype=dtype, device=device) 27 | pos_embedder = PositionEmbedding2D( 28 | D, 29 | max_position_embeddings=8, 30 | scale=SCALE, 31 | ).to(device=device, dtype=dtype) 32 | output = pos_embedder(x, 8, 7) 33 | target = get_spatial_pos_embed(x, D, 8, 7, SCALE) 34 | assert_close(output, target) 35 | output = pos_embedder(x, 15, 16) 36 | target = get_spatial_pos_embed(x, D, 15, 16, SCALE) 37 | assert_close(output, target) 38 | output = pos_embedder(x, 30, 20, base_size=2) 39 | target = get_spatial_pos_embed(x, D, 30, 20, SCALE, base_size=2) 40 | assert_close(output, target) 41 | # test cache 42 | output = pos_embedder(x, 30, 20, base_size=2) 43 | target = get_spatial_pos_embed(x, D, 30, 20, SCALE, base_size=2) 44 | assert_close(output, target) 45 | assert pos_embedder._get_cached_emb.cache_info().hits >= 1 46 | -------------------------------------------------------------------------------- /tests/test_t5_shardformer.py: -------------------------------------------------------------------------------- 1 | import time 2 | from copy import deepcopy 3 | 4 | import colossalai 5 | import torch 6 | from colossalai.shardformer import ShardConfig, ShardFormer 7 | from colossalai.testing import spawn 8 | 9 | from opensora.acceleration.shardformer.policy.t5_encoder import T5EncoderPolicy 10 | from opensora.models.text_encoder.t5 import T5Embedder 11 | 12 | 13 | def run_t5_encoder(rank, world_size, port): 14 | 
colossalai.launch({}, rank=rank, world_size=world_size, port=port, host="localhost") 15 | 16 | # t5 embedder 17 | t5_path = "./pretrained_models/t5_ckpts" 18 | hf_t5 = T5Embedder(device="cuda", local_cache=True, cache_dir=t5_path, torch_dtype=torch.float) 19 | sf_t5 = deepcopy(hf_t5) 20 | 21 | # create huggingface model as normal 22 | shard_config = ShardConfig( 23 | tensor_parallel_process_group=None, 24 | pipeline_stage_manager=None, 25 | enable_tensor_parallelism=False, 26 | enable_fused_normalization=False, 27 | enable_flash_attention=False, 28 | enable_jit_fused=True, 29 | enable_sequence_parallelism=False, 30 | enable_sequence_overlap=False, 31 | ) 32 | shard_former = ShardFormer(shard_config=shard_config) 33 | sharded_model, _ = shard_former.optimize(sf_t5.model, policy=T5EncoderPolicy()) 34 | sf_t5.model = sharded_model 35 | 36 | # test t5 embedder 37 | texts = ["Who is the best player in the history of NBA?", "How to study computer science?"] 38 | for i in range(5): 39 | hf_embs, hf_masks = hf_t5.get_text_embeddings(texts) 40 | sf_embs, sf_masks = sf_t5.get_text_embeddings(texts) 41 | 42 | # check accuracy 43 | assert torch.allclose(hf_embs, sf_embs, rtol=1e-4, atol=1e-5), f"{hf_embs} \nvs\n{sf_embs}" 44 | assert torch.allclose(hf_masks, sf_masks), f"{hf_masks} \nvs\n{sf_masks}" 45 | 46 | # measure perf 47 | torch.cuda.synchronize() 48 | hf_start = time.time() 49 | for i in range(20): 50 | hf_embs, hf_masks = hf_t5.get_text_embeddings(texts) 51 | torch.cuda.synchronize() 52 | hf_end = time.time() 53 | 54 | # convert sf to fp16 55 | hf_t5.model = hf_t5.model.half() 56 | torch.cuda.synchronize() 57 | sf_start = time.time() 58 | for i in range(20): 59 | hf_embs, hf_masks = hf_t5.get_text_embeddings(texts) 60 | torch.cuda.synchronize() 61 | sf_end = time.time() 62 | 63 | print(f"[Performance] native: {hf_end - hf_start}s, shardformer: {sf_end - sf_start} s") 64 | 65 | 66 | def test_t5_encoder(): 67 | spawn(run_t5_encoder) 68 | 69 | 70 | if __name__ == "__main__": 71 | test_t5_encoder() 72 | -------------------------------------------------------------------------------- /tools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/tools/__init__.py -------------------------------------------------------------------------------- /tools/caption/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/tools/caption/__init__.py -------------------------------------------------------------------------------- /tools/caption/acceleration/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/tools/caption/acceleration/__init__.py -------------------------------------------------------------------------------- /tools/caption/acceleration/llava/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/tools/caption/acceleration/llava/__init__.py -------------------------------------------------------------------------------- /tools/caption/acceleration/llava/policies/__init__.py: -------------------------------------------------------------------------------- 1 
| from .llama import LlavaLlamaForCausalLMPolicy 2 | from .mistral import LlavaMistralForCausalLMPolicy 3 | -------------------------------------------------------------------------------- /tools/caption/camera_motion/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/tools/caption/camera_motion/__init__.py -------------------------------------------------------------------------------- /tools/caption/camera_motion/detect.py: -------------------------------------------------------------------------------- 1 | # Originally developed by https://github.com/Vchitect/VBench based on https://github.com/facebookresearch/co-tracker. 2 | 3 | import argparse 4 | from typing import List 5 | 6 | import pandas as pd 7 | 8 | from .camera_motion import compute_camera_motion 9 | 10 | 11 | def process(paths: List[str], threshold: float) -> List[str]: 12 | device = "cuda" 13 | submodules = {"repo": "facebookresearch/co-tracker", "model": "cotracker2"} 14 | camera_motion_types = compute_camera_motion(device, submodules, paths, factor=threshold) 15 | return camera_motion_types 16 | 17 | 18 | def main(args): 19 | output_file = args.input.replace(".csv", "_cmotion.csv") 20 | data = pd.read_csv(args.input) 21 | data["cmotion"] = process(data["path"], args.threshold) 22 | data.to_csv(output_file, index=False) 23 | print(f"Output saved to {output_file}") 24 | 25 | 26 | if __name__ == "__main__": 27 | parser = argparse.ArgumentParser() 28 | parser.add_argument("input", type=str) 29 | parser.add_argument("--threshold", type=float, default=0.25) 30 | args = parser.parse_args() 31 | main(args) 32 | -------------------------------------------------------------------------------- /tools/caption/camera_motion/requirements.txt: -------------------------------------------------------------------------------- 1 | decord 2 | ptvsd 3 | imageio-ffmpeg 4 | -------------------------------------------------------------------------------- /tools/datasets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/tools/datasets/__init__.py -------------------------------------------------------------------------------- /tools/datasets/ffmpeg_check_parallel.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | parallel --progress -j+0 'ffmpeg -v error -i {} -f null - 2>{}.err' < $1 3 | -------------------------------------------------------------------------------- /tools/datasets/ffmpeg_filter_without_errors.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | from concurrent.futures import ProcessPoolExecutor, as_completed 4 | import multiprocessing 5 | from tqdm import tqdm 6 | 7 | def check_error_file(row): 8 | # Construct the path to the .err file 9 | err_file = f"{row['path']}.err" 10 | 11 | # Check if the .err file exists and is empty 12 | if not os.path.exists(err_file): 13 | return row 14 | elif os.path.exists(err_file) and os.path.getsize(err_file) == 0: 15 | return row 16 | return None 17 | 18 | def filter_csv(input_csv): 19 | # Read the CSV into a pandas DataFrame 20 | df = pd.read_csv(input_csv) 21 | 22 | # Initialize tqdm progress bar 23 | progress_bar = tqdm(total=len(df), desc="Processing files") 24 | 25 | # Use 
ProcessPoolExecutor for parallel processing 26 | with ProcessPoolExecutor(max_workers=multiprocessing.cpu_count()) as executor: 27 | # Submit tasks for parallel execution 28 | futures = {executor.submit(check_error_file, row): index for index, row in df.iterrows()} 29 | 30 | # Collect results as they complete 31 | filtered_rows = [] 32 | for future in as_completed(futures): 33 | result = future.result() 34 | if result is not None: 35 | filtered_rows.append(result) 36 | progress_bar.update(1) 37 | 38 | # Close the progress bar 39 | progress_bar.close() 40 | 41 | # Create a new DataFrame from the filtered rows 42 | filtered_df = pd.DataFrame(filtered_rows, columns=df.columns) 43 | 44 | # Generate the output file name 45 | output_csv = input_csv.replace('.csv', '_withouterror.csv') 46 | 47 | # Write the filtered DataFrame to a new CSV file 48 | filtered_df.to_csv(output_csv, index=False) 49 | print(f"Filtered CSV saved as: {output_csv}") 50 | 51 | if __name__ == "__main__": 52 | # Replace 'yourfile.csv' with the actual CSV file you want to process 53 | import sys 54 | input_csv = sys.argv[-1] 55 | filter_csv(input_csv) 56 | 57 | -------------------------------------------------------------------------------- /tools/datasets/filter_large_videos.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import pandas as pd 4 | from tqdm import tqdm 5 | from concurrent.futures import ThreadPoolExecutor 6 | 7 | # Function to get the size of a file 8 | def get_file_size(path): 9 | return os.path.getsize(path) if os.path.isfile(path) else None 10 | 11 | # Load the CSV file 12 | file_path = sys.argv[-2] 13 | size = int(sys.argv[-1]) 14 | assert len(sys.argv) == 3 15 | df = pd.read_csv(file_path) 16 | 17 | # Enable tqdm to monitor progress 18 | tqdm.pandas() 19 | 20 | # Use ThreadPoolExecutor for parallel processing 21 | with ThreadPoolExecutor(max_workers=4) as executor: # Adjust max_workers based on your CPU cores 22 | df['file_size'] = list(tqdm(executor.map(get_file_size, df['path']), total=len(df))) 23 | 24 | # Convert 50 MB to bytes 25 | size_threshold = size * 1024 * 1024 # 50 MB in bytes 26 | 27 | # Drop rows where file size is 50 MB or more 28 | df_filtered = df[df['file_size'] < size_threshold] 29 | 30 | # Save the filtered DataFrame back to the original CSV file 31 | file_path = file_path.replace(".csv", f"_le{size}M.csv") 32 | df_filtered.to_csv(file_path.replace(".csv", f"_le{size}M.csv"), index=False) 33 | print(f"Saved filtered data to {file_path}.") 34 | 35 | -------------------------------------------------------------------------------- /tools/datasets/split.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from typing import List 3 | 4 | import pandas as pd 5 | from mmengine.config import Config 6 | 7 | from opensora.datasets.bucket import Bucket 8 | 9 | 10 | def split_by_bucket( 11 | bucket: Bucket, 12 | input_files: List[str], 13 | output_path: str, 14 | limit: int, 15 | frame_interval: int, 16 | ): 17 | print(f"Split {len(input_files)} files into {len(bucket)} buckets") 18 | total_limit = len(bucket) * limit 19 | bucket_cnt = {} 20 | # get all bucket id 21 | for hw_id, d in bucket.ar_criteria.items(): 22 | for t_id, v in d.items(): 23 | for ar_id in v.keys(): 24 | bucket_id = (hw_id, t_id, ar_id) 25 | bucket_cnt[bucket_id] = 0 26 | output_df = None 27 | # split files 28 | for path in input_files: 29 | df = pd.read_csv(path) 30 | if output_df is None: 31 | 
output_df = pd.DataFrame(columns=df.columns) 32 | for i in range(len(df)): 33 | row = df.iloc[i] 34 | t, h, w = row["num_frames"], row["height"], row["width"] 35 | bucket_id = bucket.get_bucket_id(t, h, w, frame_interval) 36 | if bucket_id is None: 37 | continue 38 | if bucket_cnt[bucket_id] < limit: 39 | bucket_cnt[bucket_id] += 1 40 | output_df = pd.concat([output_df, pd.DataFrame([row])], ignore_index=True) 41 | if len(output_df) >= total_limit: 42 | break 43 | if len(output_df) >= total_limit: 44 | break 45 | assert len(output_df) <= total_limit 46 | if len(output_df) == total_limit: 47 | print(f"All buckets are full ({total_limit} samples)") 48 | else: 49 | print(f"Only {len(output_df)} files are used") 50 | output_df.to_csv(output_path, index=False) 51 | 52 | 53 | if __name__ == "__main__": 54 | parser = argparse.ArgumentParser() 55 | parser.add_argument("input", type=str, nargs="+") 56 | parser.add_argument("-o", "--output", required=True) 57 | parser.add_argument("-c", "--config", required=True) 58 | parser.add_argument("-l", "--limit", default=200, type=int) 59 | args = parser.parse_args() 60 | assert args.limit > 0 61 | 62 | cfg = Config.fromfile(args.config) 63 | bucket_config = cfg.bucket_config 64 | # rewrite bucket_config 65 | for ar, d in bucket_config.items(): 66 | for frames, t in d.items(): 67 | p, bs = t 68 | if p > 0.0: 69 | p = 1.0 70 | d[frames] = (p, bs) 71 | bucket = Bucket(bucket_config) 72 | split_by_bucket(bucket, args.input, args.output, args.limit, cfg.dataset.frame_interval) 73 | -------------------------------------------------------------------------------- /tools/frame_interpolation/README.md: -------------------------------------------------------------------------------- 1 | # Frame Interpolation 2 | 3 | For current version, we sample 1 frame out of 3 frames in the video. Although we are going to use VAE to avoid frame loss, we provide a frame interpolation tool to interpolate the video now. The frame interpolation tool is based on [AMT](https://github.com/MCG-NKU/AMT). 4 | 5 | Interpolation can be useful for scenery videos, but it may not be suitable for videos with fast motion. 6 | 7 | ## Requirement 8 | 9 | Install the required dependancies by following our [installation instructions](../../docs/installation.md)'s "Data Dependencies" and "Frame Interpolation" sections. 10 | 11 | 15 | 16 | ## Model 17 | 18 | We use **AMT** as our frame interpolation model. After sampling, you can use frame interpolation model to interpolate your video smoothly. 19 | 20 | ## Usage 21 | 22 | The ckpt file will be automatically downloaded in user's `.cache` directory. You can use frame interpolation to your video file or a video folder. 23 | 24 | 1. Process a video file 25 | 26 | ```python 27 | python -m tools.frame_interpolation.interpolation your_video.mp4 28 | ``` 29 | 30 | 2. Process all video file in target directory 31 | 32 | ```python 33 | python -m tools.frame_interpolation.interpolation your_video_dir --output_path samples/interpolation 34 | ``` 35 | 36 | The output video will be stored at `output_path` and its duration time is equal `the total number of frames after frame interpolation / the frame rate` 37 | 38 | ### Command Line Arguments 39 | 40 | * `input`: Path of the input video. **Video path** or **Folder path(with --folder)** 41 | * `--ckpt`: Pretrained model of [AMT](https://github.com/MCG-NKU/AMT). Default path: `~/.cache/amt-g.pth`. 42 | * `--niter`: Iterations of interpolation. 
With $m$ input frames, `[N_ITER]` $=n$ corresponds to $2^n\times (m-1)+1$ output frames. 43 | * `--fps`: Frame rate of the input video. (Default: 8) 44 | * `--output_path`: **Folder Path** of the output video. 45 | -------------------------------------------------------------------------------- /tools/frame_interpolation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/tools/frame_interpolation/__init__.py -------------------------------------------------------------------------------- /tools/frame_interpolation/networks/__init__.py: -------------------------------------------------------------------------------- 1 | from .amt_g import Model 2 | -------------------------------------------------------------------------------- /tools/frame_interpolation/networks/blocks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/tools/frame_interpolation/networks/blocks/__init__.py -------------------------------------------------------------------------------- /tools/frame_interpolation/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/tools/frame_interpolation/utils/__init__.py -------------------------------------------------------------------------------- /tools/frame_interpolation/utils/dist_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import torch 4 | 5 | 6 | def get_world_size(): 7 | """Find OMPI world size without calling mpi functions 8 | :rtype: int 9 | """ 10 | if os.environ.get("PMI_SIZE") is not None: 11 | return int(os.environ.get("PMI_SIZE") or 1) 12 | elif os.environ.get("OMPI_COMM_WORLD_SIZE") is not None: 13 | return int(os.environ.get("OMPI_COMM_WORLD_SIZE") or 1) 14 | else: 15 | return torch.cuda.device_count() 16 | 17 | 18 | def get_global_rank(): 19 | """Find OMPI world rank without calling mpi functions 20 | :rtype: int 21 | """ 22 | if os.environ.get("PMI_RANK") is not None: 23 | return int(os.environ.get("PMI_RANK") or 0) 24 | elif os.environ.get("OMPI_COMM_WORLD_RANK") is not None: 25 | return int(os.environ.get("OMPI_COMM_WORLD_RANK") or 0) 26 | else: 27 | return 0 28 | 29 | 30 | def get_local_rank(): 31 | """Find OMPI local rank without calling mpi functions 32 | :rtype: int 33 | """ 34 | if os.environ.get("MPI_LOCALRANKID") is not None: 35 | return int(os.environ.get("MPI_LOCALRANKID") or 0) 36 | elif os.environ.get("OMPI_COMM_WORLD_LOCAL_RANK") is not None: 37 | return int(os.environ.get("OMPI_COMM_WORLD_LOCAL_RANK") or 0) 38 | else: 39 | return 0 40 | 41 | 42 | def get_master_ip(): 43 | if os.environ.get("AZ_BATCH_MASTER_NODE") is not None: 44 | return os.environ.get("AZ_BATCH_MASTER_NODE").split(":")[0] 45 | elif os.environ.get("AZ_BATCHAI_MPI_MASTER_NODE") is not None: 46 | return os.environ.get("AZ_BATCHAI_MPI_MASTER_NODE") 47 | else: 48 | return "127.0.0.1" 49 | -------------------------------------------------------------------------------- /tools/scene_cut/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/tools/scene_cut/__init__.py 
-------------------------------------------------------------------------------- /tools/scene_cut/scene_detect.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | import numpy as np 5 | import pandas as pd 6 | from pandarallel import pandarallel 7 | from scenedetect import AdaptiveDetector, detect 8 | from tqdm import tqdm 9 | 10 | tqdm.pandas() 11 | 12 | 13 | def process_single_row(row): 14 | # windows 15 | # from scenedetect import detect, ContentDetector, AdaptiveDetector 16 | 17 | video_path = row["path"] 18 | 19 | detector = AdaptiveDetector( 20 | adaptive_threshold=3.0, 21 | # luma_only=True, 22 | ) 23 | # detector = ContentDetector() 24 | # TODO: catch error here 25 | try: 26 | scene_list = detect(video_path, detector, start_in_scene=True) 27 | timestamp = [(s.get_timecode(), t.get_timecode()) for s, t in scene_list] 28 | return True, str(timestamp) 29 | except Exception as e: 30 | print(f"Video '{video_path}' with error {e}") 31 | return False, "" 32 | 33 | 34 | def parse_args(): 35 | parser = argparse.ArgumentParser() 36 | parser.add_argument("meta_path", type=str) 37 | parser.add_argument("--num_workers", type=int, default=None, help="#workers for pandarallel") 38 | 39 | args = parser.parse_args() 40 | return args 41 | 42 | 43 | def main(): 44 | args = parse_args() 45 | meta_path = args.meta_path 46 | if not os.path.exists(meta_path): 47 | print(f"Meta file '{meta_path}' not found. Exit.") 48 | exit() 49 | 50 | if args.num_workers is not None: 51 | pandarallel.initialize(progress_bar=True, nb_workers=args.num_workers) 52 | else: 53 | pandarallel.initialize(progress_bar=True) 54 | 55 | meta = pd.read_csv(meta_path) 56 | ret = meta.parallel_apply(process_single_row, axis=1) 57 | 58 | succ, timestamps = list(zip(*ret)) 59 | meta["timestamp"] = timestamps 60 | meta = meta[np.array(succ)] 61 | 62 | wo_ext, ext = os.path.splitext(meta_path) 63 | out_path = f"{wo_ext}_timestamp{ext}" 64 | meta.to_csv(out_path, index=False) 65 | print(f"New meta (shape={meta.shape}) with timestamp saved to '{out_path}'.") 66 | 67 | 68 | if __name__ == "__main__": 69 | main() 70 | -------------------------------------------------------------------------------- /tools/scoring/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/tools/scoring/__init__.py -------------------------------------------------------------------------------- /tools/scoring/aesthetic/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/tools/scoring/aesthetic/__init__.py -------------------------------------------------------------------------------- /tools/scoring/matching/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/tools/scoring/matching/__init__.py -------------------------------------------------------------------------------- /tools/scoring/ocr/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/tools/scoring/ocr/__init__.py 
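Note: `scene_detect.py` above stores each video's detected scene boundaries in a new `timestamp` column as a stringified list of `(start, end)` timecode pairs. A minimal sketch of how that column could be consumed downstream — the CSV filename and the ffmpeg flags are illustrative assumptions, not commands taken from this repository:

```python
# Sketch: parse the "timestamp" column written by scene_detect.py and emit
# one ffmpeg cut command per detected scene (flags are illustrative only).
from ast import literal_eval

import pandas as pd

meta = pd.read_csv("meta_timestamp.csv")  # hypothetical output of scene_detect.py
for _, row in meta.iterrows():
    scenes = literal_eval(row["timestamp"])  # e.g. [("00:00:00.000", "00:00:05.200"), ...]
    for idx, (start, end) in enumerate(scenes):
        out_path = f"{row['path']}_scene{idx:03d}.mp4"
        print(f"ffmpeg -i {row['path']} -ss {start} -to {end} -c copy {out_path}")
```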
-------------------------------------------------------------------------------- /tools/scoring/ocr/dbnetpp.py: -------------------------------------------------------------------------------- 1 | model = dict( 2 | type="DBNet", 3 | backbone=dict( 4 | type="CLIPResNet", 5 | depth=50, 6 | num_stages=4, 7 | out_indices=(0, 1, 2, 3), 8 | frozen_stages=-1, 9 | norm_cfg=dict(type="BN", requires_grad=True), 10 | norm_eval=False, 11 | style="pytorch", 12 | dcn=dict(type="DCNv2", deform_groups=1, fallback_on_stride=False), 13 | # init_cfg=dict( 14 | # type='Pretrained', 15 | # checkpoint='https://download.openmmlab.com/mmocr/backbone/resnet50-oclip-7ba0c533.pth'), 16 | stage_with_dcn=(False, True, True, True), 17 | ), 18 | neck=dict( 19 | type="FPNC", 20 | in_channels=[256, 512, 1024, 2048], 21 | lateral_channels=256, 22 | asf_cfg=dict(attention_type="ScaleChannelSpatial"), 23 | ), 24 | det_head=dict( 25 | type="DBHead", 26 | in_channels=256, 27 | module_loss=dict(type="DBModuleLoss"), 28 | postprocessor=dict( 29 | type="DBPostprocessor", 30 | text_repr_type="quad", 31 | epsilon_ratio=0.002, 32 | ), 33 | ), 34 | data_preprocessor=dict( 35 | type="TextDetDataPreprocessor", 36 | mean=[123.675, 116.28, 103.53], 37 | std=[58.395, 57.12, 57.375], 38 | bgr_to_rgb=True, 39 | pad_size_divisor=32, 40 | ), 41 | init_cfg=dict( 42 | type="Pretrained", 43 | checkpoint="https://download.openmmlab.com/mmocr/textdet/dbnetpp/" 44 | "dbnetpp_resnet50-oclip_fpnc_1200e_icdar2015/" 45 | "dbnetpp_resnet50-oclip_fpnc_1200e_icdar2015_20221101_124139-4ecb39ac.pth", 46 | ), 47 | ) 48 | 49 | test_pipeline = [ 50 | # dict(type='LoadImageFromFile', color_type='color_ignore_orientation'), 51 | dict(type="Resize", scale=(4068, 1024), keep_ratio=True), 52 | dict( 53 | type="PackTextDetInputs", 54 | # meta_keys=('img_path', 'ori_shape', 'img_shape', 'scale_factor'), 55 | meta_keys=("img_shape", "scale_factor"), 56 | ), 57 | ] 58 | 59 | # Visualization 60 | vis_backends = [dict(type="LocalVisBackend")] 61 | visualizer = dict( 62 | type="TextDetLocalVisualizer", 63 | name="visualizer", 64 | vis_backends=vis_backends, 65 | ) 66 | -------------------------------------------------------------------------------- /tools/scoring/optical_flow/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/tools/scoring/optical_flow/__init__.py -------------------------------------------------------------------------------- /tools/scoring/optical_flow/unimatch/__init__.py: -------------------------------------------------------------------------------- 1 | from .unimatch import UniMatch 2 | -------------------------------------------------------------------------------- /tools/scoring/optical_flow/unimatch/position.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | # https://github.com/facebookresearch/detr/blob/main/models/position_encoding.py 3 | 4 | import math 5 | 6 | import torch 7 | import torch.nn as nn 8 | 9 | 10 | class PositionEmbeddingSine(nn.Module): 11 | """ 12 | This is a more standard version of the position embedding, very similar to the one 13 | used by the Attention is all you need paper, generalized to work on images. 
14 | """ 15 | 16 | def __init__(self, num_pos_feats=64, temperature=10000, normalize=True, scale=None): 17 | super().__init__() 18 | self.num_pos_feats = num_pos_feats 19 | self.temperature = temperature 20 | self.normalize = normalize 21 | if scale is not None and normalize is False: 22 | raise ValueError("normalize should be True if scale is passed") 23 | if scale is None: 24 | scale = 2 * math.pi 25 | self.scale = scale 26 | 27 | def forward(self, x): 28 | # x = tensor_list.tensors # [B, C, H, W] 29 | # mask = tensor_list.mask # [B, H, W], input with padding, valid as 0 30 | b, c, h, w = x.size() 31 | mask = torch.ones((b, h, w), device=x.device) # [B, H, W] 32 | y_embed = mask.cumsum(1, dtype=torch.float32) 33 | x_embed = mask.cumsum(2, dtype=torch.float32) 34 | if self.normalize: 35 | eps = 1e-6 36 | y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale 37 | x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale 38 | 39 | dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) 40 | dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) 41 | 42 | pos_x = x_embed[:, :, :, None] / dim_t 43 | pos_y = y_embed[:, :, :, None] / dim_t 44 | pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3) 45 | pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3) 46 | pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) 47 | return pos 48 | --------------------------------------------------------------------------------
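Note: a minimal usage sketch for `PositionEmbeddingSine` above, checking shapes only — the dummy feature-map size is an arbitrary assumption, and the import path assumes the repository root is on `PYTHONPATH`:

```python
# Sketch: PositionEmbeddingSine maps a [B, C, H, W] feature map to a
# positional encoding of shape [B, 2 * num_pos_feats, H, W].
import torch

from tools.scoring.optical_flow.unimatch.position import PositionEmbeddingSine

pos_enc = PositionEmbeddingSine(num_pos_feats=64)
feat = torch.randn(2, 128, 32, 32)    # dummy [B, C, H, W] features
pos = pos_enc(feat)
assert pos.shape == (2, 128, 32, 32)  # 2 * 64 sin/cos channels, same spatial size
```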