├── .github └── workflows │ ├── close_issue.yaml │ └── pages-deploy.yml ├── .gitignore ├── .isort.cfg ├── .pre-commit-config.yaml ├── CONTRIBUTING.md ├── Dockerfile ├── LICENSE ├── README.md ├── assets ├── demo │ ├── sample_16s_224x448.gif │ ├── sample_16s_320x320.gif │ ├── sample_16x240x426_9.gif │ ├── sample_32x240x426_7.gif │ ├── sample_32x480x854_9.gif │ ├── sora_16x240x426_26.gif │ ├── sora_16x240x426_27.gif │ ├── sora_16x240x426_40.gif │ ├── sora_16x426x240_24.gif │ ├── sora_16x426x240_3.gif │ └── v1.2 │ │ ├── sample_0002.gif │ │ ├── sample_0004.gif │ │ ├── sample_0011.gif │ │ ├── sample_0013.gif │ │ ├── sample_0052.gif │ │ ├── sample_0061.gif │ │ ├── sample_0087.gif │ │ ├── sample_1718.gif │ │ └── sample_1719.gif ├── images │ ├── condition │ │ ├── cactus-happy.png │ │ ├── cactus-sad.png │ │ ├── cliff.png │ │ ├── ship.png │ │ ├── sunset1.png │ │ ├── sunset2.png │ │ └── wave.png │ ├── imagenet │ │ ├── train │ │ │ └── n01440764 │ │ │ │ └── n01440764_10026.JPEG │ │ └── val │ │ │ └── n01440764 │ │ │ └── ILSVRC2012_val_00000293.JPEG │ ├── ocr │ │ ├── demo_text_det.jpg │ │ ├── demo_text_ocr.jpg │ │ └── demo_text_recog.jpg │ └── watermark │ │ └── watermark.png ├── readme │ ├── colossal_ai.png │ ├── gradio_advanced.png │ ├── gradio_basic.png │ ├── gradio_option.png │ ├── icon.png │ ├── llava_vs_pllava_sample.gif │ ├── report-03_actions_count.png │ ├── report-03_objects_count.png │ ├── report-03_video_stats.png │ ├── report_3d_vae.png │ ├── report_arch.jpg │ ├── report_arch_comp.png │ ├── report_bucket.png │ ├── report_caption.png │ ├── report_data_pipeline.png │ ├── report_image_textlen.png │ ├── report_loss_curve_1.png │ ├── report_loss_curve_2.png │ ├── report_loss_curve_3.png │ ├── report_mask.png │ ├── report_mask_config.png │ ├── report_val_loss.png │ ├── report_vbench_score.png │ ├── report_vid_val_loss.png │ ├── report_video_duration.png │ ├── report_video_textlen.png │ ├── sample_0.gif │ ├── sample_1.gif │ ├── sample_2.gif │ ├── sample_3.gif │ ├── sample_4.gif │ ├── sample_5.gif │ └── sequence_parallelism.jpeg └── texts │ ├── VBench │ ├── all_category.txt │ ├── all_dimension.txt │ ├── all_i2v.txt │ ├── prompts_per_category │ │ ├── animal.txt │ │ ├── architecture.txt │ │ ├── food.txt │ │ ├── human.txt │ │ ├── lifestyle.txt │ │ ├── plant.txt │ │ ├── scenery.txt │ │ └── vehicles.txt │ └── prompts_per_dimension │ │ ├── appearance_style.txt │ │ ├── color.txt │ │ ├── human_action.txt │ │ ├── multiple_objects.txt │ │ ├── object_class.txt │ │ ├── overall_consistency.txt │ │ ├── scene.txt │ │ ├── spatial_relationship.txt │ │ ├── subject_consistency.txt │ │ ├── temporal_flickering.txt │ │ └── temporal_style.txt │ ├── imagenet_id.txt │ ├── imagenet_labels.txt │ ├── rand_types.txt │ ├── t2i_samples.txt │ ├── t2i_sigma.txt │ ├── t2v_car.txt │ ├── t2v_latte.txt │ ├── t2v_pllava.txt │ ├── t2v_ref.txt │ ├── t2v_samples.txt │ ├── t2v_short.txt │ ├── t2v_sora.txt │ ├── ucf101_id.txt │ └── ucf101_labels.txt ├── configs ├── dit │ ├── inference │ │ ├── 16x256x256.py │ │ ├── 1x256x256-class.py │ │ └── 1x256x256.py │ └── train │ │ ├── 16x256x256.py │ │ └── 1x256x256.py ├── latte │ ├── inference │ │ ├── 16x256x256-class.py │ │ └── 16x256x256.py │ └── train │ │ └── 16x256x256.py ├── opensora-v1-1 │ ├── inference │ │ ├── sample-ref.py │ │ └── sample.py │ └── train │ │ ├── benchmark.py │ │ ├── image.py │ │ ├── image_rflow.py │ │ ├── stage1.py │ │ ├── stage2.py │ │ ├── stage3.py │ │ └── video.py ├── opensora-v1-2 │ ├── inference │ │ ├── sample.py │ │ └── sample_hf.py │ ├── lambda │ │ ├── stage1.py │ │ ├── 
stage2.py │ │ ├── stage3.py │ │ ├── stage4.py │ │ ├── stage5.py │ │ └── stage6.py │ ├── misc │ │ ├── bs.py │ │ ├── eval_loss.py │ │ ├── extract.py │ │ └── feat.py │ └── train │ │ ├── adapt.py │ │ ├── demo_360p.py │ │ ├── demo_480p.py │ │ ├── stage1.py │ │ ├── stage1_feat.py │ │ ├── stage2.py │ │ └── stage3.py ├── opensora │ ├── inference │ │ ├── 16x256x256.py │ │ ├── 16x512x512-rflow.py │ │ ├── 16x512x512.py │ │ └── 64x512x512.py │ └── train │ │ ├── 16x256x256-mask.py │ │ ├── 16x256x256-spee-rflow.py │ │ ├── 16x256x256-spee.py │ │ ├── 16x256x256.py │ │ ├── 16x512x512.py │ │ ├── 360x512x512.py │ │ ├── 64x512x512-sp.py │ │ └── 64x512x512.py ├── pixart │ ├── inference │ │ ├── 16x256x256.py │ │ ├── 1x1024MS.py │ │ ├── 1x20481B.py │ │ ├── 1x2048MS.py │ │ ├── 1x256x256.py │ │ ├── 1x512x512-rflow.py │ │ └── 1x512x512.py │ └── train │ │ ├── 16x256x256.py │ │ ├── 1x2048x2048.py │ │ ├── 1x512x512-rflow.py │ │ ├── 1x512x512.py │ │ └── 64x512x512.py └── vae │ ├── inference │ ├── image.py │ └── video.py │ └── train │ ├── stage1.py │ ├── stage2.py │ └── stage3.py ├── docs ├── acceleration.md ├── commands.md ├── config.md ├── data_processing.md ├── datasets.md ├── installation.md ├── report_01.md ├── report_02.md ├── report_03.md ├── structure.md ├── tutorial │ ├── .nojekyll │ ├── Gemfile │ ├── _config.yml │ ├── _data │ │ ├── contact.yml │ │ ├── locales │ │ │ └── en-customized.yml │ │ └── share.yml │ ├── _includes │ │ ├── favicons.html │ │ ├── sidebar.html │ │ └── topbar.html │ ├── _plugins │ │ ├── details_tag.rb │ │ └── posts-lastmod-hook.rb │ ├── _posts │ │ └── .placeholder │ ├── _tabs │ │ ├── dataset.md │ │ ├── introduction.md │ │ ├── lessons.md │ │ ├── repository.md │ │ ├── setup.md │ │ └── training.md │ ├── assets │ │ ├── css │ │ │ ├── colors │ │ │ │ ├── typography-dark.scss │ │ │ │ └── typography-light.scss │ │ │ └── jekyll-theme-chirpy.scss │ │ ├── fails_loss.png │ │ ├── fails_weight_norm.png │ │ ├── img │ │ │ └── lambda-logo.svg │ │ ├── monitoring_tool.png │ │ └── pyspy_dump.png │ └── index.md ├── vae.md └── zh_CN │ ├── README.md │ ├── READMEv1.1.md │ ├── acceleration.md │ ├── commands.md │ ├── datasets.md │ ├── report_v1.md │ ├── report_v2.md │ ├── report_v3.md │ ├── structure.md │ └── vae.md ├── eval ├── README.md ├── human_eval │ ├── generate.sh │ └── launch.sh ├── loss │ ├── eval_loss.py │ ├── launch.sh │ └── tabulate_rl_loss.py ├── sample.sh ├── vae │ ├── cal_flolpips.py │ ├── cal_lpips.py │ ├── cal_psnr.py │ ├── cal_ssim.py │ ├── eval_common_metric.py │ ├── flolpips │ │ ├── correlation │ │ │ └── correlation.py │ │ ├── flolpips.py │ │ ├── pretrained_networks.py │ │ ├── pwcnet.py │ │ └── utils.py │ └── script │ │ └── eval.sh ├── vbench │ ├── VBench_full_info.json │ ├── calc_vbench.py │ ├── launch.sh │ ├── launch_calc.sh │ └── tabulate_vbench_scores.py └── vbench_i2v │ ├── calc_vbench_i2v.py │ ├── json_to_txt.py │ ├── launch.sh │ ├── launch_calc.sh │ ├── tabulate_vbench_i2v_scores.py │ └── vbench2_i2v_full_info.json ├── gradio ├── README.md ├── app.py └── requirements.txt ├── install-check-pytorch23.py ├── install-check.py ├── install-pytorch23.sh ├── install.sh ├── kill_process.sh ├── notebooks ├── inference.ipynb └── launch.ipynb ├── nvtop_all.py ├── opensora ├── __init__.py ├── acceleration │ ├── __init__.py │ ├── checkpoint.py │ ├── communications.py │ ├── parallel_states.py │ ├── plugin.py │ └── shardformer │ │ ├── __init__.py │ │ ├── modeling │ │ ├── __init__.py │ │ └── t5.py │ │ └── policy │ │ ├── __init__.py │ │ └── t5_encoder.py ├── datasets │ ├── __init__.py │ ├── aspect.py │ ├── 
bucket.py │ ├── dataloader.py │ ├── datasets.py │ ├── read_video.py │ ├── sampler.py │ ├── utils.py │ └── video_transforms.py ├── models │ ├── __init__.py │ ├── dit │ │ ├── __init__.py │ │ └── dit.py │ ├── latte │ │ ├── __init__.py │ │ └── latte.py │ ├── layers │ │ ├── __init__.py │ │ └── blocks.py │ ├── pixart │ │ ├── __init__.py │ │ ├── pixart.py │ │ └── pixart_sigma.py │ ├── stdit │ │ ├── __init__.py │ │ ├── stdit.py │ │ ├── stdit2.py │ │ └── stdit3.py │ ├── text_encoder │ │ ├── __init__.py │ │ ├── classes.py │ │ ├── clip.py │ │ └── t5.py │ └── vae │ │ ├── __init__.py │ │ ├── discriminator.py │ │ ├── losses.py │ │ ├── lpips.py │ │ ├── utils.py │ │ ├── vae.py │ │ ├── vae_temporal.py │ │ └── video_sdxl │ │ └── blocks.py ├── registry.py ├── schedulers │ ├── __init__.py │ ├── dpms │ │ ├── __init__.py │ │ └── dpm_solver.py │ ├── iddpm │ │ ├── __init__.py │ │ ├── diffusion_utils.py │ │ ├── gaussian_diffusion.py │ │ ├── respace.py │ │ ├── speed.py │ │ └── timestep_sampler.py │ └── rf │ │ ├── __init__.py │ │ └── rectified_flow.py └── utils │ ├── __init__.py │ ├── ckpt_utils.py │ ├── config_utils.py │ ├── inference_utils.py │ ├── lr_scheduler.py │ ├── misc.py │ └── train_utils.py ├── requirements ├── requirements-cu121.txt ├── requirements-data.txt ├── requirements-eval.txt ├── requirements-pllava.txt ├── requirements-vae.txt └── requirements.txt ├── scripts ├── clear_cache.sh ├── inference-server.py ├── inference.py ├── inference_vae.py ├── misc │ ├── extract_feat.py │ ├── launch_extract_feat.sh │ ├── launch_search_bs.sh │ ├── profile_train.py │ └── search_bs.py ├── train.py └── train_vae.py ├── setup.py ├── tests ├── test_attn.py ├── test_lr_scheduler.py ├── test_np_torch.py ├── test_pos_emb.py ├── test_seq_parallel_attention.py ├── test_stdit3_sequence_parallelism.py └── test_t5_shardformer.py └── tools ├── __init__.py ├── caption ├── README.md ├── __init__.py ├── acceleration │ ├── __init__.py │ └── llava │ │ ├── __init__.py │ │ └── policies │ │ ├── __init__.py │ │ ├── llama.py │ │ └── mistral.py ├── camera_motion │ ├── __init__.py │ ├── camera_motion.py │ ├── detect.py │ ├── requirements.txt │ ├── utils.py │ └── visualizer.py ├── camera_motion_detect.py ├── caption_gpt4.py ├── caption_llama3.py ├── caption_llava.py ├── pllava_dir │ └── caption_pllava.py └── utils.py ├── datasets ├── README.md ├── __init__.py ├── analyze.py ├── convert.py ├── datautil.py ├── ffmpeg_check_parallel.sh ├── ffmpeg_filter_without_errors.py ├── filter_large_videos.py ├── filter_panda10m.py ├── split.py ├── transform.py └── utils.py ├── frame_interpolation ├── README.md ├── __init__.py ├── interpolation.py ├── networks │ ├── __init__.py │ ├── amt_g.py │ └── blocks │ │ ├── __init__.py │ │ ├── feat_enc.py │ │ ├── ifrnet.py │ │ ├── multi_flow.py │ │ └── raft.py └── utils │ ├── __init__.py │ ├── dist_utils.py │ ├── flow_utils.py │ └── utils.py ├── scene_cut ├── README.md ├── __init__.py ├── convert_id_to_path.py ├── cut.py └── scene_detect.py └── scoring ├── README.md ├── __init__.py ├── aesthetic ├── __init__.py └── inference.py ├── matching ├── __init__.py └── inference.py ├── ocr ├── __init__.py ├── dbnetpp.py └── inference.py └── optical_flow ├── __init__.py ├── inference.py └── unimatch ├── __init__.py ├── attention.py ├── backbone.py ├── geometry.py ├── matching.py ├── position.py ├── reg_refine.py ├── transformer.py ├── trident_conv.py ├── unimatch.py └── utils.py /.github/workflows/close_issue.yaml: -------------------------------------------------------------------------------- 1 | name: Close inactive issues 
2 | on: 3 | schedule: 4 | - cron: "30 1 * * *" 5 | 6 | jobs: 7 | close-issues: 8 | runs-on: ubuntu-latest 9 | permissions: 10 | issues: write 11 | pull-requests: write 12 | steps: 13 | - uses: actions/stale@v9 14 | with: 15 | days-before-issue-stale: 7 16 | days-before-issue-close: 7 17 | stale-issue-label: "stale" 18 | stale-issue-message: "This issue is stale because it has been open for 7 days with no activity." 19 | close-issue-message: "This issue was closed because it has been inactive for 7 days since being marked as stale." 20 | days-before-pr-stale: -1 21 | days-before-pr-close: -1 22 | repo-token: ${{ secrets.GITHUB_TOKEN }} 23 | -------------------------------------------------------------------------------- /.github/workflows/pages-deploy.yml: -------------------------------------------------------------------------------- 1 | name: "Deploy Tutorial" 2 | on: 3 | push: 4 | branches: 5 | - main 6 | paths-ignore: 7 | - .gitignore 8 | - README.md 9 | - LICENSE 10 | 11 | # Allows you to run this workflow manually from the Actions tab 12 | workflow_dispatch: 13 | 14 | permissions: 15 | contents: read 16 | pages: write 17 | id-token: write 18 | 19 | # Allow one concurrent deployment 20 | concurrency: 21 | group: "pages" 22 | cancel-in-progress: true 23 | 24 | jobs: 25 | build: 26 | runs-on: ubuntu-latest 27 | 28 | steps: 29 | - name: Checkout 30 | uses: actions/checkout@v4 31 | with: 32 | fetch-depth: 0 33 | 34 | - name: Setup Pages 35 | id: pages 36 | uses: actions/configure-pages@v4 37 | 38 | - name: Setup Ruby 39 | uses: ruby/setup-ruby@v1 40 | with: 41 | ruby-version: 3.3 42 | bundler-cache: true 43 | 44 | - name: Install Dependencies 45 | run: | 46 | cd ./docs/tutorial 47 | bundle install 48 | 49 | - name: Build site 50 | run: | 51 | cd ./docs/tutorial 52 | bundle exec jekyll b -d "_site" 53 | env: 54 | JEKYLL_ENV: "production" 55 | 56 | - name: Upload site artifact 57 | uses: actions/upload-pages-artifact@v3 58 | with: 59 | path: "./docs/tutorial/_site" 60 | 61 | deploy: 62 | environment: 63 | name: github-pages 64 | url: ${{ steps.deployment.outputs.page_url }} 65 | runs-on: ubuntu-latest 66 | needs: build 67 | steps: 68 | - name: Deploy to GitHub Pages 69 | id: deployment 70 | uses: actions/deploy-pages@v4 71 | 72 | -------------------------------------------------------------------------------- /.isort.cfg: -------------------------------------------------------------------------------- 1 | [settings] 2 | line_length = 120 3 | multi_line_output=3 4 | include_trailing_comma = true 5 | ignore_comments = true 6 | profile = black 7 | honor_noqa = true 8 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | 3 | - repo: https://github.com/PyCQA/autoflake 4 | rev: v2.2.1 5 | hooks: 6 | - id: autoflake 7 | name: autoflake (python) 8 | args: ['--in-place', '--remove-unused-variables', '--remove-all-unused-imports', '--ignore-init-module-imports'] 9 | 10 | - repo: https://github.com/pycqa/isort 11 | rev: 5.12.0 12 | hooks: 13 | - id: isort 14 | name: sort all imports (python) 15 | 16 | - repo: https://github.com/psf/black-pre-commit-mirror 17 | rev: 23.9.1 18 | hooks: 19 | - id: black 20 | name: black formatter 21 | args: ['--line-length=120', '--target-version=py37', '--target-version=py38', '--target-version=py39','--target-version=py310'] 22 | 23 | - repo: https://github.com/pre-commit/pre-commit-hooks 24 | rev: v4.3.0 25 | hooks: 26 
| - id: check-yaml 27 | - id: check-merge-conflict 28 | - id: check-case-conflict 29 | - id: trailing-whitespace 30 | - id: end-of-file-fixer 31 | - id: mixed-line-ending 32 | args: ['--fix=lf'] 33 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM hpcaitech/pytorch-cuda:2.1.0-12.1.0 2 | 3 | # metainformation 4 | LABEL org.opencontainers.image.source = "https://github.com/hpcaitech/Open-Sora" 5 | LABEL org.opencontainers.image.licenses = "Apache License 2.0" 6 | LABEL org.opencontainers.image.base.name = "docker.io/library/hpcaitech/pytorch-cuda:2.1.0-12.1.0" 7 | 8 | # Set the working directory 9 | WORKDIR /workspace/Open-Sora 10 | # Copy the current directory contents into the container at /workspace/Open-Sora 11 | COPY . . 12 | 13 | # install library dependencies 14 | RUN apt-get update && apt-get install ffmpeg libsm6 libxext6 -y 15 | 16 | # install flash attention 17 | RUN pip install flash-attn --no-build-isolation 18 | 19 | # install apex 20 | RUN pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" git+https://github.com/NVIDIA/apex.git 21 | 22 | # install xformers 23 | RUN pip install xformers --index-url https://download.pytorch.org/whl/cu121 24 | 25 | # install this project 26 | RUN pip install -v . 27 | -------------------------------------------------------------------------------- /assets/demo/sample_16s_224x448.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/demo/sample_16s_224x448.gif -------------------------------------------------------------------------------- /assets/demo/sample_16s_320x320.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/demo/sample_16s_320x320.gif -------------------------------------------------------------------------------- /assets/demo/sample_16x240x426_9.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/demo/sample_16x240x426_9.gif -------------------------------------------------------------------------------- /assets/demo/sample_32x240x426_7.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/demo/sample_32x240x426_7.gif -------------------------------------------------------------------------------- /assets/demo/sample_32x480x854_9.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/demo/sample_32x480x854_9.gif -------------------------------------------------------------------------------- /assets/demo/sora_16x240x426_26.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/demo/sora_16x240x426_26.gif
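A minimal usage sketch for the Dockerfile shown above, which builds on the hpcaitech/pytorch-cuda:2.1.0-12.1.0 base image, copies the repository into /workspace/Open-Sora, and pre-installs flash-attn, apex, and xformers. The image tag, the bind mount, and the interactive shell below are illustrative assumptions rather than commands taken from the repository's docs:

# build from the repository root (the tag "open-sora:dev" is arbitrary)
docker build -t open-sora:dev .
# run with GPU access; mounting the local checkout over the baked-in copy is optional
docker run --gpus all -it -v "$(pwd)":/workspace/Open-Sora open-sora:dev /bin/bash
# the hooks from .pre-commit-config.yaml above can be installed for local development with
pip install pre-commit && pre-commit install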
-------------------------------------------------------------------------------- /assets/demo/sora_16x240x426_27.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/demo/sora_16x240x426_27.gif -------------------------------------------------------------------------------- /assets/demo/sora_16x240x426_40.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/demo/sora_16x240x426_40.gif -------------------------------------------------------------------------------- /assets/demo/sora_16x426x240_24.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/demo/sora_16x426x240_24.gif -------------------------------------------------------------------------------- /assets/demo/sora_16x426x240_3.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/demo/sora_16x426x240_3.gif -------------------------------------------------------------------------------- /assets/demo/v1.2/sample_0002.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/demo/v1.2/sample_0002.gif -------------------------------------------------------------------------------- /assets/demo/v1.2/sample_0004.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/demo/v1.2/sample_0004.gif -------------------------------------------------------------------------------- /assets/demo/v1.2/sample_0011.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/demo/v1.2/sample_0011.gif -------------------------------------------------------------------------------- /assets/demo/v1.2/sample_0013.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/demo/v1.2/sample_0013.gif -------------------------------------------------------------------------------- /assets/demo/v1.2/sample_0052.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/demo/v1.2/sample_0052.gif -------------------------------------------------------------------------------- /assets/demo/v1.2/sample_0061.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/demo/v1.2/sample_0061.gif -------------------------------------------------------------------------------- /assets/demo/v1.2/sample_0087.gif: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/demo/v1.2/sample_0087.gif -------------------------------------------------------------------------------- /assets/demo/v1.2/sample_1718.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/demo/v1.2/sample_1718.gif -------------------------------------------------------------------------------- /assets/demo/v1.2/sample_1719.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/demo/v1.2/sample_1719.gif -------------------------------------------------------------------------------- /assets/images/condition/cactus-happy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/images/condition/cactus-happy.png -------------------------------------------------------------------------------- /assets/images/condition/cactus-sad.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/images/condition/cactus-sad.png -------------------------------------------------------------------------------- /assets/images/condition/cliff.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/images/condition/cliff.png -------------------------------------------------------------------------------- /assets/images/condition/ship.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/images/condition/ship.png -------------------------------------------------------------------------------- /assets/images/condition/sunset1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/images/condition/sunset1.png -------------------------------------------------------------------------------- /assets/images/condition/sunset2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/images/condition/sunset2.png -------------------------------------------------------------------------------- /assets/images/condition/wave.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/images/condition/wave.png -------------------------------------------------------------------------------- /assets/images/imagenet/train/n01440764/n01440764_10026.JPEG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/images/imagenet/train/n01440764/n01440764_10026.JPEG 
-------------------------------------------------------------------------------- /assets/images/imagenet/val/n01440764/ILSVRC2012_val_00000293.JPEG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/images/imagenet/val/n01440764/ILSVRC2012_val_00000293.JPEG -------------------------------------------------------------------------------- /assets/images/ocr/demo_text_det.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/images/ocr/demo_text_det.jpg -------------------------------------------------------------------------------- /assets/images/ocr/demo_text_ocr.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/images/ocr/demo_text_ocr.jpg -------------------------------------------------------------------------------- /assets/images/ocr/demo_text_recog.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/images/ocr/demo_text_recog.jpg -------------------------------------------------------------------------------- /assets/images/watermark/watermark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/images/watermark/watermark.png -------------------------------------------------------------------------------- /assets/readme/colossal_ai.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/readme/colossal_ai.png -------------------------------------------------------------------------------- /assets/readme/gradio_advanced.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/readme/gradio_advanced.png -------------------------------------------------------------------------------- /assets/readme/gradio_basic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/readme/gradio_basic.png -------------------------------------------------------------------------------- /assets/readme/gradio_option.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/readme/gradio_option.png -------------------------------------------------------------------------------- /assets/readme/icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/readme/icon.png -------------------------------------------------------------------------------- /assets/readme/llava_vs_pllava_sample.gif: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/readme/llava_vs_pllava_sample.gif -------------------------------------------------------------------------------- /assets/readme/report-03_actions_count.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/readme/report-03_actions_count.png -------------------------------------------------------------------------------- /assets/readme/report-03_objects_count.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/readme/report-03_objects_count.png -------------------------------------------------------------------------------- /assets/readme/report-03_video_stats.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/readme/report-03_video_stats.png -------------------------------------------------------------------------------- /assets/readme/report_3d_vae.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/readme/report_3d_vae.png -------------------------------------------------------------------------------- /assets/readme/report_arch.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/readme/report_arch.jpg -------------------------------------------------------------------------------- /assets/readme/report_arch_comp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/readme/report_arch_comp.png -------------------------------------------------------------------------------- /assets/readme/report_bucket.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/readme/report_bucket.png -------------------------------------------------------------------------------- /assets/readme/report_caption.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/readme/report_caption.png -------------------------------------------------------------------------------- /assets/readme/report_data_pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/readme/report_data_pipeline.png -------------------------------------------------------------------------------- /assets/readme/report_image_textlen.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/readme/report_image_textlen.png 
-------------------------------------------------------------------------------- /assets/readme/report_loss_curve_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/readme/report_loss_curve_1.png -------------------------------------------------------------------------------- /assets/readme/report_loss_curve_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/readme/report_loss_curve_2.png -------------------------------------------------------------------------------- /assets/readme/report_loss_curve_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/readme/report_loss_curve_3.png -------------------------------------------------------------------------------- /assets/readme/report_mask.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/readme/report_mask.png -------------------------------------------------------------------------------- /assets/readme/report_mask_config.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/readme/report_mask_config.png -------------------------------------------------------------------------------- /assets/readme/report_val_loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/readme/report_val_loss.png -------------------------------------------------------------------------------- /assets/readme/report_vbench_score.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/readme/report_vbench_score.png -------------------------------------------------------------------------------- /assets/readme/report_vid_val_loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/readme/report_vid_val_loss.png -------------------------------------------------------------------------------- /assets/readme/report_video_duration.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/readme/report_video_duration.png -------------------------------------------------------------------------------- /assets/readme/report_video_textlen.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/readme/report_video_textlen.png -------------------------------------------------------------------------------- /assets/readme/sample_0.gif: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/readme/sample_0.gif -------------------------------------------------------------------------------- /assets/readme/sample_1.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/readme/sample_1.gif -------------------------------------------------------------------------------- /assets/readme/sample_2.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/readme/sample_2.gif -------------------------------------------------------------------------------- /assets/readme/sample_3.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/readme/sample_3.gif -------------------------------------------------------------------------------- /assets/readme/sample_4.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/readme/sample_4.gif -------------------------------------------------------------------------------- /assets/readme/sample_5.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/readme/sample_5.gif -------------------------------------------------------------------------------- /assets/readme/sequence_parallelism.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/assets/readme/sequence_parallelism.jpeg -------------------------------------------------------------------------------- /assets/texts/VBench/prompts_per_dimension/color.txt: -------------------------------------------------------------------------------- 1 | a red bicycle 2 | a green bicycle 3 | a blue bicycle 4 | a yellow bicycle 5 | an orange bicycle 6 | a purple bicycle 7 | a pink bicycle 8 | a black bicycle 9 | a white bicycle 10 | a red car 11 | a green car 12 | a blue car 13 | a yellow car 14 | an orange car 15 | a purple car 16 | a pink car 17 | a black car 18 | a white car 19 | a red bird 20 | a green bird 21 | a blue bird 22 | a yellow bird 23 | an orange bird 24 | a purple bird 25 | a pink bird 26 | a black bird 27 | a white bird 28 | a black cat 29 | a white cat 30 | an orange cat 31 | a yellow cat 32 | a red umbrella 33 | a green umbrella 34 | a blue umbrella 35 | a yellow umbrella 36 | an orange umbrella 37 | a purple umbrella 38 | a pink umbrella 39 | a black umbrella 40 | a white umbrella 41 | a red suitcase 42 | a green suitcase 43 | a blue suitcase 44 | a yellow suitcase 45 | an orange suitcase 46 | a purple suitcase 47 | a pink suitcase 48 | a black suitcase 49 | a white suitcase 50 | a red bowl 51 | a green bowl 52 | a blue bowl 53 | a yellow bowl 54 | an orange bowl 55 | a purple bowl 56 | a pink bowl 57 | a black bowl 58 | a white bowl 59 | a red chair 60 | a green chair 61 | a blue chair 62 | a yellow chair 63 | an orange chair 64 | a purple chair 65 | a pink chair 66 | a black chair 67 | a white 
chair 68 | a red clock 69 | a green clock 70 | a blue clock 71 | a yellow clock 72 | an orange clock 73 | a purple clock 74 | a pink clock 75 | a black clock 76 | a white clock 77 | a red vase 78 | a green vase 79 | a blue vase 80 | a yellow vase 81 | an orange vase 82 | a purple vase 83 | a pink vase 84 | a black vase 85 | a white vase 86 | -------------------------------------------------------------------------------- /assets/texts/VBench/prompts_per_dimension/multiple_objects.txt: -------------------------------------------------------------------------------- 1 | a bird and a cat 2 | a cat and a dog 3 | a dog and a horse 4 | a horse and a sheep 5 | a sheep and a cow 6 | a cow and an elephant 7 | an elephant and a bear 8 | a bear and a zebra 9 | a zebra and a giraffe 10 | a giraffe and a bird 11 | a chair and a couch 12 | a couch and a potted plant 13 | a potted plant and a tv 14 | a tv and a laptop 15 | a laptop and a remote 16 | a remote and a keyboard 17 | a keyboard and a cell phone 18 | a cell phone and a book 19 | a book and a clock 20 | a clock and a backpack 21 | a backpack and an umbrella 22 | an umbrella and a handbag 23 | a handbag and a tie 24 | a tie and a suitcase 25 | a suitcase and a vase 26 | a vase and scissors 27 | scissors and a teddy bear 28 | a teddy bear and a frisbee 29 | a frisbee and skis 30 | skis and a snowboard 31 | a snowboard and a sports ball 32 | a sports ball and a kite 33 | a kite and a baseball bat 34 | a baseball bat and a baseball glove 35 | a baseball glove and a skateboard 36 | a skateboard and a surfboard 37 | a surfboard and a tennis racket 38 | a tennis racket and a bottle 39 | a bottle and a chair 40 | an airplane and a train 41 | a train and a boat 42 | a boat and an airplane 43 | a bicycle and a car 44 | a car and a motorcycle 45 | a motorcycle and a bus 46 | a bus and a traffic light 47 | a traffic light and a fire hydrant 48 | a fire hydrant and a stop sign 49 | a stop sign and a parking meter 50 | a parking meter and a truck 51 | a truck and a bicycle 52 | a toilet and a hair drier 53 | a hair drier and a toothbrush 54 | a toothbrush and a sink 55 | a sink and a toilet 56 | a wine glass and a chair 57 | a cup and a couch 58 | a fork and a potted plant 59 | a knife and a tv 60 | a spoon and a laptop 61 | a bowl and a remote 62 | a banana and a keyboard 63 | an apple and a cell phone 64 | a sandwich and a book 65 | an orange and a clock 66 | broccoli and a backpack 67 | a carrot and an umbrella 68 | a hot dog and a handbag 69 | a pizza and a tie 70 | a donut and a suitcase 71 | a cake and a vase 72 | an oven and scissors 73 | a toaster and a teddy bear 74 | a microwave and a frisbee 75 | a refrigerator and skis 76 | a bicycle and an airplane 77 | a car and a train 78 | a motorcycle and a boat 79 | a person and a toilet 80 | a person and a hair drier 81 | a person and a toothbrush 82 | a person and a sink 83 | -------------------------------------------------------------------------------- /assets/texts/VBench/prompts_per_dimension/object_class.txt: -------------------------------------------------------------------------------- 1 | a person 2 | a bicycle 3 | a car 4 | a motorcycle 5 | an airplane 6 | a bus 7 | a train 8 | a truck 9 | a boat 10 | a traffic light 11 | a fire hydrant 12 | a stop sign 13 | a parking meter 14 | a bench 15 | a bird 16 | a cat 17 | a dog 18 | a horse 19 | a sheep 20 | a cow 21 | an elephant 22 | a bear 23 | a zebra 24 | a giraffe 25 | a backpack 26 | an umbrella 27 | a handbag 28 | a tie 29 | a suitcase 30 | a 
frisbee 31 | skis 32 | a snowboard 33 | a sports ball 34 | a kite 35 | a baseball bat 36 | a baseball glove 37 | a skateboard 38 | a surfboard 39 | a tennis racket 40 | a bottle 41 | a wine glass 42 | a cup 43 | a fork 44 | a knife 45 | a spoon 46 | a bowl 47 | a banana 48 | an apple 49 | a sandwich 50 | an orange 51 | broccoli 52 | a carrot 53 | a hot dog 54 | a pizza 55 | a donut 56 | a cake 57 | a chair 58 | a couch 59 | a potted plant 60 | a bed 61 | a dining table 62 | a toilet 63 | a tv 64 | a laptop 65 | a remote 66 | a keyboard 67 | a cell phone 68 | a microwave 69 | an oven 70 | a toaster 71 | a sink 72 | a refrigerator 73 | a book 74 | a clock 75 | a vase 76 | scissors 77 | a teddy bear 78 | a hair drier 79 | a toothbrush 80 | -------------------------------------------------------------------------------- /assets/texts/VBench/prompts_per_dimension/scene.txt: -------------------------------------------------------------------------------- 1 | alley 2 | amusement park 3 | aquarium 4 | arch 5 | art gallery 6 | bathroom 7 | bakery shop 8 | ballroom 9 | bar 10 | barn 11 | basement 12 | beach 13 | bedroom 14 | bridge 15 | botanical garden 16 | cafeteria 17 | campsite 18 | campus 19 | carrousel 20 | castle 21 | cemetery 22 | classroom 23 | cliff 24 | crosswalk 25 | construction site 26 | corridor 27 | courtyard 28 | desert 29 | downtown 30 | driveway 31 | farm 32 | food court 33 | football field 34 | forest road 35 | fountain 36 | gas station 37 | glacier 38 | golf course 39 | indoor gymnasium 40 | harbor 41 | highway 42 | hospital 43 | house 44 | iceberg 45 | industrial area 46 | jail cell 47 | junkyard 48 | kitchen 49 | indoor library 50 | lighthouse 51 | laboratory 52 | mansion 53 | marsh 54 | mountain 55 | indoor movie theater 56 | indoor museum 57 | music studio 58 | nursery 59 | ocean 60 | office 61 | palace 62 | parking lot 63 | pharmacy 64 | phone booth 65 | raceway 66 | restaurant 67 | river 68 | science museum 69 | shower 70 | ski slope 71 | sky 72 | skyscraper 73 | baseball stadium 74 | staircase 75 | street 76 | supermarket 77 | indoor swimming pool 78 | tower 79 | outdoor track 80 | train railway 81 | train station platform 82 | underwater coral reef 83 | valley 84 | volcano 85 | waterfall 86 | windmill 87 | -------------------------------------------------------------------------------- /assets/texts/imagenet_id.txt: -------------------------------------------------------------------------------- 1 | 207 2 | 360 3 | 387 4 | 974 5 | 88 6 | 979 7 | 417 8 | 279 9 | -------------------------------------------------------------------------------- /assets/texts/imagenet_labels.txt: -------------------------------------------------------------------------------- 1 | golden retriever 2 | otter 3 | lesser panda 4 | geyser 5 | macaw 6 | valley 7 | balloon 8 | golden panda 9 | -------------------------------------------------------------------------------- /assets/texts/rand_types.txt: -------------------------------------------------------------------------------- 1 | 随机电影镜头 2 | 随机电影镜头 3 | 随机电影镜头 4 | 随机电影镜头 5 | 随机电影镜头 6 | 随机任务镜头 7 | 随机任务镜头 8 | 随机任务镜头 9 | 随机任务镜头 10 | 随机任务镜头 11 | 随机游戏镜头 12 | 随机游戏镜头 13 | 随机游戏镜头 14 | 随机游戏镜头 15 | 随机游戏镜头 16 | 随机开车镜头 17 | 随机开车镜头 18 | 随机开车镜头 19 | 随机开车镜头 20 | 随机开车镜头 21 | 随机动物镜头 22 | 随机动物镜头 23 | 随机动物镜头 24 | 随机动物镜头 25 | 随机动物镜头 26 | 随机森林镜头 27 | 随机森林镜头 28 | 随机森林镜头 29 | 随机森林镜头 30 | 随机森林镜头 31 | 随机动漫镜头 32 | 随机动漫镜头 33 | 随机动漫镜头 34 | 随机动漫镜头 35 | 随机动漫镜头 36 | 随机舞蹈镜头 37 | 随机舞蹈镜头 38 | 随机舞蹈镜头 39 | 随机舞蹈镜头 40 | 随机舞蹈镜头 41 | 
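The assets/texts files above are plain one-entry-per-line prompt and label lists; the inference configs later in this dump reference them through their prompt_path field (for example, configs/dit/inference/1x256x256-class.py sets prompt_path = "./assets/texts/imagenet_id.txt"). A purely illustrative way to inspect them from a shell:

# count the entries in the ImageNet class-ID and label lists used by the class-conditioned DiT config
wc -l assets/texts/imagenet_id.txt assets/texts/imagenet_labels.txt
# preview the matching human-readable labels
head -n 3 assets/texts/imagenet_labels.txt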
-------------------------------------------------------------------------------- /assets/texts/t2i_samples.txt: -------------------------------------------------------------------------------- 1 | A small cactus with a happy face in the Sahara desert. 2 | Bright scene, aerial view,ancient city, fantasy, gorgeous light, mirror reflection, high detail, wide angle lens. 3 | Nature vs human nature, surreal, UHD, 8k, hyper details, rich colors, photograph. 4 | Poster of a mechanical cat, techical Schematics viewed from front. 5 | Luffy from ONEPIECE, handsome face, fantasy. 6 | Real beautiful woman. 7 | A alpaca made of colorful building blocks, cyberpunk. 8 | artistic 9 | -------------------------------------------------------------------------------- /assets/texts/t2i_sigma.txt: -------------------------------------------------------------------------------- 1 | Eiffel Tower was Made up of more than 2 million translucent straws to look like a cloud, with the bell tower at the top of the building, Michel installed huge foam-making machines in the forest to blow huge amounts of unpredictable wet clouds in the building's classic architecture. 2 | A gorgeously rendered papercraft world of a coral reef, rife with colorful fish and sea creatures. 3 | Full body shot, a French woman, Photography, French Streets background, backlighting, rim light, Fujifilm. 4 | Close-up photos of models, hazy light and shadow, laser metal hair accessories, soft and beautiful, light gold pupils, white eyelashes, low saturation, real skin details, clear pores and fine lines, light reflection and refraction, ultra-clear, cinematography, award-winning works. 5 | A litter of golden retriever puppies playing in the snow. Their heads pop out of the snow, covered in. 6 | Lego model, future rocket station, intricate details, high resolution, unreal engine, UHD 7 | One giant, sharp, metal square mirror in the center of the frame, four young people on the foreground, background sunny palm oil planation, tropical, realistic style, photography, nostalgic, green tone, mysterious, dreamy, bright color. 8 | Modern luxury contemporary luxury home interiors house, in the style of mimicking ruined materials, ray tracing, haunting houses, and stone, capture the essence of nature, gray and bronze, dynamic outdoor shots. 9 | Over the shoulder game perspective, game screen of Diablo 4, Inside the gorgeous palace is the wet ground, The necromancer knelt before the king, and a horde of skeletons he summoned stood at his side, cinematic light. 10 | A curvy timber house near a sea, designed by Zaha Hadid, represent the image of a cold, modern architecture, at night, white lighting, highly detailed. 11 | -------------------------------------------------------------------------------- /assets/texts/t2v_car.txt: -------------------------------------------------------------------------------- 1 | |0|A car driving on the in forest.|2|A car driving in the desert.|4|A car driving near the coast.|6|A car driving in the city.|8|A car driving near a mountain.|10|A car driving on the surface of a river.|12|A car driving on the surface of the earch.|14|A car driving in the universe.{"reference_path": "https://cdn.openai.com/tmp/s/interp/d0.mp4", "mask_strategy": "0,0,0,0,16,0.4"} 2 | -------------------------------------------------------------------------------- /assets/texts/t2v_latte.txt: -------------------------------------------------------------------------------- 1 | Yellow and black tropical fish dart through the sea. 
2 | An epic tornado attacking above aglowing city at night. 3 | Slow pan upward of blazing oak fire in an indoor fireplace. 4 | a cat wearing sunglasses and working as a lifeguard at pool. 5 | Sunset over the sea. 6 | A dog in astronaut suit and sunglasses floating in space. 7 | A astronaut in flying in space, 4k, high resolution 8 | -------------------------------------------------------------------------------- /assets/texts/t2v_ref.txt: -------------------------------------------------------------------------------- 1 | Drone view of waves crashing against the rugged cliffs along Big Sur’s garay point beach. The crashing blue waters create white-tipped waves, while the golden light of the setting sun illuminates the rocky shore. A small island with a lighthouse sits in the distance, and green shrubbery covers the cliff’s edge. The steep drop from the road down to the beach is a dramatic feat, with the cliff's edges jutting out over the sea. This is a view that captures the raw beauty of the coast and the rugged landscape of the Pacific Coast Highway. 2 | In an ornate, historical hall, a massive tidal wave peaks and begins to crash. Two surfers, seizing the moment, skillfully navigate the face of the wave. 3 | Pirate ship in a cosmic maelstrom nebula. 4 | Drone view of waves crashing against the rugged cliffs along Big Sur’s garay point beach. The crashing blue waters create white-tipped waves, while the golden light of the setting sun illuminates the rocky shore. A small island with a lighthouse sits in the distance, and green shrubbery covers the cliff’s edge. The steep drop from the road down to the beach is a dramatic feat, with the cliff's edges jutting out over the sea. This is a view that captures the raw beauty of the coast and the rugged landscape of the Pacific Coast Highway. 5 | A sad small cactus with in the Sahara desert becomes happy. 6 | A car driving on a road in the middle of a desert. 
7 | -------------------------------------------------------------------------------- /assets/texts/t2v_short.txt: -------------------------------------------------------------------------------- 1 | A fat rabbit wearing a purple robe walking through a fantasy landscape 2 | Waves crashing against a lone lighthouse, ominous lighting 3 | A mystical forest showcasing the adventures of travelers who enter 4 | A blue-haired mage singing 5 | A surreal landscape with floating islands and waterfalls in the sky craft 6 | A blue bird standing in water 7 | A young man walks alone by the seaside 8 | Pink rose on a glass surface with droplets, close-up 9 | Drove viewpoint, a subway train coming out of a tunnel 10 | Space with all planets green and pink color with background of bright white stars 11 | A city floating in an astral space, with stars and nebulae 12 | Sunrise on top of a high-rise building 13 | Pink and cyan powder explosions 14 | Deers in the woods gaze into the camera under the sunlight 15 | In a flash of lightning, a wizard appeared from thin air, his long robes billowing in the wind 16 | A futuristic cyberpunk cityscape at night with towering neon-lit skyscrapers 17 | A scene where the trees, flowers, and animals come together to create a symphony of nature 18 | A ghostly ship sailing through the clouds, navigating through a sea under a moonlit sky 19 | A sunset with beautiful beach 20 | A young man walking alone in the forest 21 | -------------------------------------------------------------------------------- /assets/texts/ucf101_id.txt: -------------------------------------------------------------------------------- 1 | 0 2 | 1 3 | 2 4 | 3 5 | 4 6 | 5 7 | -------------------------------------------------------------------------------- /assets/texts/ucf101_labels.txt: -------------------------------------------------------------------------------- 1 | Apply Eye Makeup 2 | Apply Lipstick 3 | Archery 4 | Baby Crawling 5 | Balance Beam 6 | Band Marching 7 | -------------------------------------------------------------------------------- /configs/dit/inference/16x256x256.py: -------------------------------------------------------------------------------- 1 | num_frames = 16 2 | fps = 8 3 | image_size = (256, 256) 4 | 5 | # Define model 6 | model = dict( 7 | type="DiT-XL/2", 8 | condition="text", 9 | from_pretrained="PRETRAINED_MODEL", 10 | ) 11 | vae = dict( 12 | type="VideoAutoencoderKL", 13 | from_pretrained="stabilityai/sd-vae-ft-ema", 14 | ) 15 | text_encoder = dict( 16 | type="clip", 17 | from_pretrained="openai/clip-vit-base-patch32", 18 | model_max_length=77, 19 | ) 20 | scheduler = dict( 21 | type="dpm-solver", 22 | num_sampling_steps=20, 23 | cfg_scale=4.0, 24 | ) 25 | dtype = "bf16" 26 | 27 | # Others 28 | batch_size = 2 29 | seed = 42 30 | prompt_path = "./assets/texts/ucf101_labels.txt" 31 | save_dir = "./samples/samples/" 32 | -------------------------------------------------------------------------------- /configs/dit/inference/1x256x256-class.py: -------------------------------------------------------------------------------- 1 | num_frames = 1 2 | fps = 1 3 | image_size = (256, 256) 4 | 5 | # Define model 6 | model = dict( 7 | type="DiT-XL/2", 8 | no_temporal_pos_emb=True, 9 | condition="label_1000", 10 | from_pretrained="DiT-XL-2-256x256.pt", 11 | ) 12 | vae = dict( 13 | type="VideoAutoencoderKL", 14 | from_pretrained="stabilityai/sd-vae-ft-ema", 15 | ) 16 | text_encoder = dict( 17 | type="classes", 18 | num_classes=1000, 19 | ) 20 | scheduler = dict( 21 | 
type="dpm-solver", 22 | num_sampling_steps=20, 23 | cfg_scale=4.0, 24 | ) 25 | dtype = "bf16" 26 | 27 | # Others 28 | batch_size = 2 29 | seed = 42 30 | prompt_path = "./assets/texts/imagenet_id.txt" 31 | save_dir = "./samples/samples/" 32 | -------------------------------------------------------------------------------- /configs/dit/inference/1x256x256.py: -------------------------------------------------------------------------------- 1 | num_frames = 1 2 | fps = 1 3 | image_size = (256, 256) 4 | 5 | # Define model 6 | model = dict( 7 | type="DiT-XL/2", 8 | no_temporal_pos_emb=True, 9 | condition="text", 10 | from_pretrained="PRETRAINED_MODEL", 11 | ) 12 | vae = dict( 13 | type="VideoAutoencoderKL", 14 | from_pretrained="stabilityai/sd-vae-ft-ema", 15 | ) 16 | text_encoder = dict( 17 | type="clip", 18 | from_pretrained="openai/clip-vit-base-patch32", 19 | model_max_length=77, 20 | ) 21 | scheduler = dict( 22 | type="dpm-solver", 23 | num_sampling_steps=20, 24 | cfg_scale=4.0, 25 | ) 26 | dtype = "bf16" 27 | 28 | # Others 29 | batch_size = 2 30 | seed = 42 31 | prompt_path = "./assets/texts/imagenet_labels.txt" 32 | save_dir = "./samples/samples/" 33 | -------------------------------------------------------------------------------- /configs/dit/train/16x256x256.py: -------------------------------------------------------------------------------- 1 | # Define dataset 2 | dataset = dict( 3 | type="VideoTextDataset", 4 | data_path=None, 5 | num_frames=16, 6 | frame_interval=3, 7 | image_size=(256, 256), 8 | ) 9 | 10 | # Define acceleration 11 | num_workers = 4 12 | dtype = "bf16" 13 | grad_checkpoint = True 14 | plugin = "zero2" 15 | sp_size = 1 16 | 17 | # Define model 18 | model = dict( 19 | type="DiT-XL/2", 20 | from_pretrained="DiT-XL-2-256x256.pt", 21 | enable_flash_attn=True, 22 | enable_layernorm_kernel=True, 23 | ) 24 | vae = dict( 25 | type="VideoAutoencoderKL", 26 | from_pretrained="stabilityai/sd-vae-ft-ema", 27 | ) 28 | text_encoder = dict( 29 | type="clip", 30 | from_pretrained="openai/clip-vit-base-patch32", 31 | model_max_length=77, 32 | ) 33 | scheduler = dict( 34 | type="iddpm", 35 | timestep_respacing="", 36 | ) 37 | 38 | # Others 39 | seed = 42 40 | outputs = "outputs" 41 | wandb = False 42 | 43 | epochs = 1000 44 | log_every = 10 45 | ckpt_every = 1000 46 | load = None 47 | 48 | batch_size = 8 49 | lr = 2e-5 50 | grad_clip = 1.0 51 | -------------------------------------------------------------------------------- /configs/dit/train/1x256x256.py: -------------------------------------------------------------------------------- 1 | # Define dataset 2 | dataset = dict( 3 | type="VideoTextDataset", 4 | data_path=None, 5 | num_frames=1, 6 | frame_interval=1, 7 | image_size=(256, 256), 8 | transform_name="center", 9 | ) 10 | 11 | # Define acceleration 12 | num_workers = 4 13 | dtype = "bf16" 14 | grad_checkpoint = False 15 | plugin = "zero2" 16 | sp_size = 1 17 | 18 | # Define model 19 | model = dict( 20 | type="DiT-XL/2", 21 | no_temporal_pos_emb=True, 22 | enable_flash_attn=True, 23 | enable_layernorm_kernel=True, 24 | ) 25 | vae = dict( 26 | type="VideoAutoencoderKL", 27 | from_pretrained="stabilityai/sd-vae-ft-ema", 28 | ) 29 | text_encoder = dict( 30 | type="clip", 31 | from_pretrained="openai/clip-vit-base-patch32", 32 | model_max_length=77, 33 | ) 34 | scheduler = dict( 35 | type="iddpm", 36 | timestep_respacing="", 37 | ) 38 | 39 | # Others 40 | seed = 42 41 | outputs = "outputs" 42 | wandb = False 43 | 44 | epochs = 1000 45 | log_every = 10 46 | ckpt_every = 1000 47 | 
load = None 48 | 49 | batch_size = 128 50 | lr = 1e-4 # according to DiT repo 51 | grad_clip = 1.0 52 | -------------------------------------------------------------------------------- /configs/latte/inference/16x256x256-class.py: -------------------------------------------------------------------------------- 1 | num_frames = 16 2 | fps = 8 3 | image_size = (256, 256) 4 | 5 | # Define model 6 | model = dict( 7 | type="Latte-XL/2", 8 | condition="label_101", 9 | from_pretrained="Latte-XL-2-256x256-ucf101.pt", 10 | ) 11 | vae = dict( 12 | type="VideoAutoencoderKL", 13 | from_pretrained="stabilityai/sd-vae-ft-ema", 14 | ) 15 | text_encoder = dict( 16 | type="classes", 17 | num_classes=101, 18 | ) 19 | scheduler = dict( 20 | type="dpm-solver", 21 | num_sampling_steps=20, 22 | cfg_scale=4.0, 23 | ) 24 | dtype = "bf16" 25 | 26 | # Others 27 | batch_size = 2 28 | seed = 42 29 | prompt_path = "./assets/texts/ucf101_id.txt" 30 | save_dir = "./samples/samples/" 31 | -------------------------------------------------------------------------------- /configs/latte/inference/16x256x256.py: -------------------------------------------------------------------------------- 1 | num_frames = 16 2 | fps = 8 3 | image_size = (256, 256) 4 | 5 | # Define model 6 | model = dict( 7 | type="Latte-XL/2", 8 | condition="text", 9 | from_pretrained="PRETRAINED_MODEL", 10 | ) 11 | vae = dict( 12 | type="VideoAutoencoderKL", 13 | from_pretrained="stabilityai/sd-vae-ft-ema", 14 | ) 15 | text_encoder = dict( 16 | type="clip", 17 | from_pretrained="openai/clip-vit-base-patch32", 18 | model_max_length=77, 19 | ) 20 | scheduler = dict( 21 | type="dpm-solver", 22 | num_sampling_steps=20, 23 | cfg_scale=4.0, 24 | ) 25 | dtype = "bf16" 26 | 27 | # Others 28 | batch_size = 2 29 | seed = 42 30 | prompt_path = "./assets/texts/ucf101_labels.txt" 31 | save_dir = "./samples/samples/" 32 | -------------------------------------------------------------------------------- /configs/latte/train/16x256x256.py: -------------------------------------------------------------------------------- 1 | # Define dataset 2 | dataset = dict( 3 | type="VideoTextDataset", 4 | data_path=None, 5 | num_frames=16, 6 | frame_interval=3, 7 | image_size=(256, 256), 8 | ) 9 | 10 | # Define acceleration 11 | num_workers = 4 12 | dtype = "bf16" 13 | grad_checkpoint = True 14 | plugin = "zero2" 15 | sp_size = 1 16 | 17 | # Define model 18 | model = dict( 19 | type="Latte-XL/2", 20 | enable_flash_attn=True, 21 | enable_layernorm_kernel=True, 22 | ) 23 | vae = dict( 24 | type="VideoAutoencoderKL", 25 | from_pretrained="stabilityai/sd-vae-ft-ema", 26 | ) 27 | text_encoder = dict( 28 | type="clip", 29 | from_pretrained="openai/clip-vit-base-patch32", 30 | model_max_length=77, 31 | ) 32 | scheduler = dict( 33 | type="iddpm", 34 | timestep_respacing="", 35 | ) 36 | 37 | # Others 38 | seed = 42 39 | outputs = "outputs" 40 | wandb = False 41 | 42 | epochs = 1000 43 | log_every = 10 44 | ckpt_every = 1000 45 | load = None 46 | 47 | batch_size = 8 48 | lr = 2e-5 49 | grad_clip = 1.0 50 | -------------------------------------------------------------------------------- /configs/opensora-v1-1/inference/sample.py: -------------------------------------------------------------------------------- 1 | num_frames = 16 2 | frame_interval = 3 3 | fps = 24 4 | image_size = (240, 426) 5 | multi_resolution = "STDiT2" 6 | 7 | # Define model 8 | model = dict( 9 | type="STDiT2-XL/2", 10 | from_pretrained="hpcai-tech/OpenSora-STDiT-v2-stage3", 11 | input_sq_size=512, 12 | qk_norm=True, 13 | 
qk_norm_legacy=True, 14 | enable_flash_attn=True, 15 | enable_layernorm_kernel=True, 16 | ) 17 | vae = dict( 18 | type="VideoAutoencoderKL", 19 | from_pretrained="stabilityai/sd-vae-ft-ema", 20 | cache_dir=None, # "/mnt/hdd/cached_models", 21 | micro_batch_size=4, 22 | ) 23 | text_encoder = dict( 24 | type="t5", 25 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 26 | cache_dir=None, # "/mnt/hdd/cached_models", 27 | model_max_length=200, 28 | ) 29 | scheduler = dict( 30 | type="iddpm", 31 | num_sampling_steps=100, 32 | cfg_scale=7.0, 33 | cfg_channel=3, # or None 34 | ) 35 | dtype = "bf16" 36 | 37 | # Condition 38 | prompt_path = "./assets/texts/t2v_samples.txt" 39 | prompt = None # prompt has higher priority than prompt_path 40 | 41 | # Others 42 | batch_size = 1 43 | seed = 42 44 | save_dir = "./samples/samples/" 45 | -------------------------------------------------------------------------------- /configs/opensora-v1-1/train/image.py: -------------------------------------------------------------------------------- 1 | # Define dataset 2 | dataset = dict( 3 | type="VariableVideoTextDataset", 4 | data_path=None, 5 | num_frames=None, 6 | frame_interval=3, 7 | image_size=(None, None), 8 | transform_name="resize_crop", 9 | ) 10 | bucket_config = { # 6s/it 11 | "256": {1: (1.0, 256)}, 12 | "512": {1: (1.0, 80)}, 13 | "480p": {1: (1.0, 52)}, 14 | "1024": {1: (1.0, 20)}, 15 | "1080p": {1: (1.0, 8)}, 16 | } 17 | 18 | # Define acceleration 19 | num_workers = 4 20 | num_bucket_build_workers = 16 21 | dtype = "bf16" 22 | grad_checkpoint = True 23 | plugin = "zero2" 24 | sp_size = 1 25 | 26 | # Define model 27 | model = dict( 28 | type="STDiT2-XL/2", 29 | from_pretrained=None, 30 | input_sq_size=512, # pretrained model is trained on 512x512 31 | qk_norm=True, 32 | qk_norm_legacy=True, 33 | enable_flash_attn=True, 34 | enable_layernorm_kernel=True, 35 | ) 36 | vae = dict( 37 | type="VideoAutoencoderKL", 38 | from_pretrained="stabilityai/sd-vae-ft-ema", 39 | micro_batch_size=4, 40 | local_files_only=True, 41 | ) 42 | text_encoder = dict( 43 | type="t5", 44 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 45 | model_max_length=200, 46 | shardformer=True, 47 | local_files_only=True, 48 | ) 49 | scheduler = dict( 50 | type="iddpm", 51 | timestep_respacing="", 52 | ) 53 | 54 | # Others 55 | seed = 42 56 | outputs = "outputs" 57 | wandb = False 58 | 59 | epochs = 1000 60 | log_every = 10 61 | ckpt_every = 500 62 | load = None 63 | 64 | batch_size = 10 # only for logging 65 | lr = 2e-5 66 | grad_clip = 1.0 67 | -------------------------------------------------------------------------------- /configs/opensora-v1-1/train/image_rflow.py: -------------------------------------------------------------------------------- 1 | # Define dataset 2 | # dataset = dict( 3 | # type="VariableVideoTextDataset", 4 | # data_path=None, 5 | # num_frames=None, 6 | # frame_interval=3, 7 | # image_size=(None, None), 8 | # transform_name="resize_crop", 9 | # ) 10 | dataset = dict( 11 | type="VideoTextDataset", 12 | data_path=None, 13 | num_frames=1, 14 | frame_interval=1, 15 | image_size=(256, 256), 16 | transform_name="center", 17 | ) 18 | bucket_config = { # 6s/it 19 | "256": {1: (1.0, 256)}, 20 | "512": {1: (1.0, 80)}, 21 | "480p": {1: (1.0, 52)}, 22 | "1024": {1: (1.0, 20)}, 23 | "1080p": {1: (1.0, 8)}, 24 | } 25 | 26 | # Define acceleration 27 | num_workers = 16 28 | dtype = "bf16" 29 | grad_checkpoint = True 30 | plugin = "zero2" 31 | sp_size = 1 32 | 33 | # Define model 34 | # model = dict( 35 | # type="DiT-XL/2", 36 | # 
from_pretrained="/home/zhaowangbo/wangbo/PixArt-alpha/pretrained_models/PixArt-XL-2-512x512.pth", 37 | # # input_sq_size=512, # pretrained model is trained on 512x512 38 | # enable_flash_attn=True, 39 | # enable_layernorm_kernel=True, 40 | # ) 41 | model = dict( 42 | type="PixArt-XL/2", 43 | space_scale=1.0, 44 | time_scale=1.0, 45 | no_temporal_pos_emb=True, 46 | from_pretrained="PixArt-XL-2-512x512.pth", 47 | enable_flash_attn=True, 48 | enable_layernorm_kernel=True, 49 | ) 50 | # model = dict( 51 | # type="DiT-XL/2", 52 | # # space_scale=1.0, 53 | # # time_scale=1.0, 54 | # no_temporal_pos_emb=True, 55 | # # from_pretrained="PixArt-XL-2-512x512.pth", 56 | # from_pretrained="/home/zhaowangbo/wangbo/PixArt-alpha/pretrained_models/PixArt-XL-2-512x512.pth", 57 | # enable_flash_attn=True, 58 | # enable_layernorm_kernel=True, 59 | # ) 60 | vae = dict( 61 | type="VideoAutoencoderKL", 62 | from_pretrained="stabilityai/sd-vae-ft-ema", 63 | micro_batch_size=4, 64 | ) 65 | text_encoder = dict( 66 | type="t5", 67 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 68 | model_max_length=200, 69 | shardformer=True, 70 | ) 71 | scheduler = dict( 72 | type="rflow", 73 | # timestep_respacing="", 74 | ) 75 | 76 | # Others 77 | seed = 42 78 | outputs = "outputs" 79 | wandb = False 80 | 81 | epochs = 10 82 | log_every = 10 83 | ckpt_every = 500 84 | load = None 85 | 86 | batch_size = 100 # only for logging 87 | lr = 2e-5 88 | grad_clip = 1.0 89 | -------------------------------------------------------------------------------- /configs/opensora-v1-1/train/stage1.py: -------------------------------------------------------------------------------- 1 | # Define dataset 2 | dataset = dict( 3 | type="VariableVideoTextDataset", 4 | data_path=None, 5 | num_frames=None, 6 | frame_interval=3, 7 | image_size=(None, None), 8 | transform_name="resize_crop", 9 | ) 10 | # IMG: 1024 (20%) 512 (30%) 256 (50%) drop (50%) 11 | bucket_config = { # 1s/it 12 | "144p": {1: (0.5, 48), 16: (1.0, 6), 32: (1.0, 3), 96: (1.0, 1)}, 13 | "256": {1: (0.5, 24), 16: (0.5, 3), 48: (0.5, 1), 64: (0.0, None)}, 14 | "240p": {16: (0.3, 2), 32: (0.3, 1), 64: (0.0, None)}, 15 | "512": {1: (0.4, 12)}, 16 | "1024": {1: (0.3, 3)}, 17 | } 18 | mask_ratios = { 19 | "identity": 0.75, 20 | "quarter_random": 0.025, 21 | "quarter_head": 0.025, 22 | "quarter_tail": 0.025, 23 | "quarter_head_tail": 0.05, 24 | "image_random": 0.025, 25 | "image_head": 0.025, 26 | "image_tail": 0.025, 27 | "image_head_tail": 0.05, 28 | } 29 | 30 | # Define acceleration 31 | num_workers = 8 32 | num_bucket_build_workers = 16 33 | dtype = "bf16" 34 | grad_checkpoint = False 35 | plugin = "zero2" 36 | sp_size = 1 37 | 38 | # Define model 39 | model = dict( 40 | type="STDiT2-XL/2", 41 | from_pretrained=None, 42 | input_sq_size=512, # pretrained model is trained on 512x512 43 | qk_norm=True, 44 | qk_norm_legacy=True, 45 | enable_flash_attn=True, 46 | enable_layernorm_kernel=True, 47 | ) 48 | vae = dict( 49 | type="VideoAutoencoderKL", 50 | from_pretrained="stabilityai/sd-vae-ft-ema", 51 | micro_batch_size=4, 52 | local_files_only=True, 53 | ) 54 | text_encoder = dict( 55 | type="t5", 56 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 57 | model_max_length=200, 58 | shardformer=True, 59 | local_files_only=True, 60 | ) 61 | scheduler = dict( 62 | type="iddpm", 63 | timestep_respacing="", 64 | ) 65 | 66 | # Others 67 | seed = 42 68 | outputs = "outputs" 69 | wandb = False 70 | 71 | epochs = 1000 72 | log_every = 10 73 | ckpt_every = 500 74 | load = None 75 | 76 | batch_size = None 77 | lr = 2e-5 
78 | grad_clip = 1.0 79 | -------------------------------------------------------------------------------- /configs/opensora-v1-1/train/stage2.py: -------------------------------------------------------------------------------- 1 | # Define dataset 2 | dataset = dict( 3 | type="VariableVideoTextDataset", 4 | data_path=None, 5 | num_frames=None, 6 | frame_interval=3, 7 | image_size=(None, None), 8 | transform_name="resize_crop", 9 | ) 10 | bucket_config = { # 7s/it 11 | "144p": {1: (1.0, 48), 16: (1.0, 17), 32: (1.0, 9), 64: (1.0, 4), 128: (1.0, 1)}, 12 | "256": {1: (0.8, 254), 16: (0.5, 17), 32: (0.5, 9), 64: (0.5, 4), 128: (0.5, 1)}, 13 | "240p": {1: (0.1, 20), 16: (0.9, 17), 32: (0.8, 9), 64: (0.8, 4), 128: (0.8, 2)}, 14 | "512": {1: (0.5, 86), 16: (0.2, 4), 32: (0.2, 2), 64: (0.2, 1), 128: (0.0, None)}, 15 | "480p": {1: (0.4, 54), 16: (0.4, 4), 32: (0.0, None)}, 16 | "720p": {1: (0.1, 20), 16: (0.1, 2), 32: (0.0, None)}, 17 | "1024": {1: (0.3, 20)}, 18 | "1080p": {1: (0.4, 8)}, 19 | } 20 | mask_ratios = { 21 | "identity": 0.75, 22 | "quarter_random": 0.025, 23 | "quarter_head": 0.025, 24 | "quarter_tail": 0.025, 25 | "quarter_head_tail": 0.05, 26 | "image_random": 0.025, 27 | "image_head": 0.025, 28 | "image_tail": 0.025, 29 | "image_head_tail": 0.05, 30 | } 31 | 32 | # Define acceleration 33 | num_workers = 8 34 | num_bucket_build_workers = 16 35 | dtype = "bf16" 36 | grad_checkpoint = True 37 | plugin = "zero2" 38 | sp_size = 1 39 | 40 | # Define model 41 | model = dict( 42 | type="STDiT2-XL/2", 43 | from_pretrained=None, 44 | input_sq_size=512, # pretrained model is trained on 512x512 45 | qk_norm=True, 46 | qk_norm_legacy=True, 47 | enable_flash_attn=True, 48 | enable_layernorm_kernel=True, 49 | ) 50 | vae = dict( 51 | type="VideoAutoencoderKL", 52 | from_pretrained="stabilityai/sd-vae-ft-ema", 53 | micro_batch_size=4, 54 | local_files_only=True, 55 | ) 56 | text_encoder = dict( 57 | type="t5", 58 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 59 | model_max_length=200, 60 | shardformer=True, 61 | local_files_only=True, 62 | ) 63 | scheduler = dict( 64 | type="iddpm", 65 | timestep_respacing="", 66 | ) 67 | 68 | # Others 69 | seed = 42 70 | outputs = "outputs" 71 | wandb = False 72 | 73 | epochs = 1000 74 | log_every = 10 75 | ckpt_every = 500 76 | load = None 77 | 78 | batch_size = None 79 | lr = 2e-5 80 | grad_clip = 1.0 81 | -------------------------------------------------------------------------------- /configs/opensora-v1-1/train/stage3.py: -------------------------------------------------------------------------------- 1 | # Define dataset 2 | dataset = dict( 3 | type="VariableVideoTextDataset", 4 | data_path=None, 5 | num_frames=None, 6 | frame_interval=3, 7 | image_size=(None, None), 8 | transform_name="resize_crop", 9 | ) 10 | bucket_config = { # 13s/it 11 | "144p": {1: (1.0, 200), 16: (1.0, 36), 32: (1.0, 18), 64: (1.0, 9), 128: (1.0, 4)}, 12 | "256": {1: (0.8, 200), 16: (0.5, 22), 32: (0.5, 11), 64: (0.5, 6), 128: (0.8, 4)}, 13 | "240p": {1: (0.8, 200), 16: (0.5, 22), 32: (0.5, 10), 64: (0.5, 6), 128: (0.5, 3)}, 14 | "360p": {1: (0.5, 120), 16: (0.5, 9), 32: (0.5, 4), 64: (0.5, 2), 128: (0.5, 1)}, 15 | "512": {1: (0.5, 120), 16: (0.5, 9), 32: (0.5, 4), 64: (0.5, 2), 128: (0.8, 1)}, 16 | "480p": {1: (0.4, 80), 16: (0.6, 6), 32: (0.6, 3), 64: (0.6, 1), 128: (0.0, None)}, 17 | "720p": {1: (0.4, 40), 16: (0.6, 3), 32: (0.6, 1), 96: (0.0, None)}, 18 | "1024": {1: (0.3, 40)}, 19 | } 20 | mask_ratios = { 21 | "identity": 0.75, 22 | "quarter_random": 0.025, 23 | "quarter_head": 
0.025, 24 | "quarter_tail": 0.025, 25 | "quarter_head_tail": 0.05, 26 | "image_random": 0.025, 27 | "image_head": 0.025, 28 | "image_tail": 0.025, 29 | "image_head_tail": 0.05, 30 | } 31 | 32 | # Define acceleration 33 | num_workers = 8 34 | num_bucket_build_workers = 16 35 | dtype = "bf16" 36 | grad_checkpoint = True 37 | plugin = "zero2" 38 | sp_size = 1 39 | 40 | # Define model 41 | model = dict( 42 | type="STDiT2-XL/2", 43 | from_pretrained=None, 44 | input_sq_size=512, # pretrained model is trained on 512x512 45 | qk_norm=True, 46 | qk_norm_legacy=True, 47 | enable_flash_attn=True, 48 | enable_layernorm_kernel=True, 49 | ) 50 | vae = dict( 51 | type="VideoAutoencoderKL", 52 | from_pretrained="stabilityai/sd-vae-ft-ema", 53 | micro_batch_size=4, 54 | local_files_only=True, 55 | ) 56 | text_encoder = dict( 57 | type="t5", 58 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 59 | model_max_length=200, 60 | shardformer=True, 61 | local_files_only=True, 62 | ) 63 | scheduler = dict( 64 | type="iddpm", 65 | timestep_respacing="", 66 | ) 67 | 68 | # Others 69 | seed = 42 70 | outputs = "outputs" 71 | wandb = False 72 | 73 | epochs = 1000 74 | log_every = 10 75 | ckpt_every = 500 76 | load = None 77 | 78 | batch_size = None 79 | lr = 2e-5 80 | grad_clip = 1.0 81 | -------------------------------------------------------------------------------- /configs/opensora-v1-1/train/video.py: -------------------------------------------------------------------------------- 1 | # Define dataset 2 | dataset = dict( 3 | type="VariableVideoTextDataset", 4 | data_path=None, 5 | num_frames=None, 6 | frame_interval=3, 7 | image_size=(None, None), 8 | transform_name="resize_crop", 9 | ) 10 | bucket_config = { # 6s/it 11 | "240p": {16: (1.0, 16), 32: (1.0, 8), 64: (1.0, 4), 128: (1.0, 2)}, 12 | "256": {1: (1.0, 256)}, 13 | "512": {1: (0.5, 80)}, 14 | "480p": {1: (0.4, 52), 16: (0.4, 4), 32: (0.0, None)}, 15 | "720p": {16: (0.1, 2), 32: (0.0, None)}, # No examples now 16 | "1024": {1: (0.3, 20)}, 17 | "1080p": {1: (0.3, 8)}, 18 | } 19 | 20 | # Define acceleration 21 | num_workers = 4 22 | num_bucket_build_workers = 16 23 | dtype = "bf16" 24 | grad_checkpoint = True 25 | plugin = "zero2" 26 | sp_size = 1 27 | 28 | # Define model 29 | model = dict( 30 | type="STDiT2-XL/2", 31 | from_pretrained=None, 32 | input_sq_size=512, # pretrained model is trained on 512x512 33 | qk_norm=True, 34 | qk_norm_legacy=True, 35 | enable_flash_attn=True, 36 | enable_layernorm_kernel=True, 37 | ) 38 | vae = dict( 39 | type="VideoAutoencoderKL", 40 | from_pretrained="stabilityai/sd-vae-ft-ema", 41 | micro_batch_size=4, 42 | local_files_only=True, 43 | ) 44 | text_encoder = dict( 45 | type="t5", 46 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 47 | model_max_length=200, 48 | shardformer=True, 49 | local_files_only=True, 50 | ) 51 | scheduler = dict( 52 | type="iddpm", 53 | timestep_respacing="", 54 | ) 55 | 56 | # Others 57 | seed = 42 58 | outputs = "outputs" 59 | wandb = False 60 | 61 | epochs = 1000 62 | log_every = 10 63 | ckpt_every = 500 64 | load = None 65 | 66 | batch_size = 10 # only for logging 67 | lr = 2e-5 68 | grad_clip = 1.0 69 | -------------------------------------------------------------------------------- /configs/opensora-v1-2/inference/sample.py: -------------------------------------------------------------------------------- 1 | resolution = "240p" 2 | aspect_ratio = "9:16" 3 | num_frames = 51 4 | fps = 24 5 | frame_interval = 1 6 | save_fps = 24 7 | 8 | save_dir = "./samples/samples/" 9 | seed = 42 10 | batch_size = 1 11 | 
multi_resolution = "STDiT2" 12 | dtype = "bf16" 13 | condition_frame_length = 5 14 | align = 5 15 | 16 | model = dict( 17 | type="STDiT3-XL/2", 18 | from_pretrained="hpcai-tech/OpenSora-STDiT-v3", 19 | qk_norm=True, 20 | enable_flash_attn=True, 21 | enable_layernorm_kernel=True, 22 | ) 23 | vae = dict( 24 | type="OpenSoraVAE_V1_2", 25 | from_pretrained="hpcai-tech/OpenSora-VAE-v1.2", 26 | micro_frame_size=17, 27 | micro_batch_size=4, 28 | ) 29 | text_encoder = dict( 30 | type="t5", 31 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 32 | model_max_length=300, 33 | ) 34 | scheduler = dict( 35 | type="rflow", 36 | use_timestep_transform=True, 37 | num_sampling_steps=30, 38 | cfg_scale=7.0, 39 | ) 40 | 41 | aes = 6.5 42 | flow = None 43 | -------------------------------------------------------------------------------- /configs/opensora-v1-2/inference/sample_hf.py: -------------------------------------------------------------------------------- 1 | resolution = "240p" 2 | aspect_ratio = "9:16" 3 | num_frames = 51 4 | fps = 24 5 | frame_interval = 1 6 | save_fps = 24 7 | 8 | save_dir = "./samples/samples/" 9 | seed = 42 10 | batch_size = 1 11 | multi_resolution = "STDiT2" 12 | dtype = "bf16" 13 | condition_frame_length = 5 14 | align = 5 15 | 16 | model = dict( 17 | type="STDiT3-XL/2", 18 | from_pretrained="hpcai-tech/OpenSora-STDiT-v3", 19 | qk_norm=True, 20 | enable_flash_attn=True, 21 | enable_layernorm_kernel=True, 22 | force_huggingface=True, 23 | ) 24 | vae = dict( 25 | type="OpenSoraVAE_V1_2", 26 | from_pretrained="hpcai-tech/OpenSora-VAE-v1.2", 27 | micro_frame_size=17, 28 | micro_batch_size=4, 29 | force_huggingface=True, 30 | ) 31 | text_encoder = dict( 32 | type="t5", 33 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 34 | model_max_length=300, 35 | ) 36 | scheduler = dict( 37 | type="rflow", 38 | use_timestep_transform=True, 39 | num_sampling_steps=30, 40 | cfg_scale=7.0, 41 | ) 42 | 43 | aes = 6.5 44 | flow = None 45 | -------------------------------------------------------------------------------- /configs/opensora-v1-2/lambda/stage2.py: -------------------------------------------------------------------------------- 1 | # Dataset settings 2 | dataset = dict( 3 | type="VariableVideoTextDataset", 4 | transform_name="resize_crop", 5 | ) 6 | 7 | # webvid 8 | bucket_config = { # 12s/it 9 | "144p": {1: (1.0, 475), 51: (1.0, 51), 102: ((1.0, 0.33), 27), 204: ((1.0, 0.1), 13), 408: ((1.0, 0.1), 6)}, 10 | # --- 11 | "256": {1: (0.4, 297), 51: (0.5, 20), 102: ((0.5, 0.33), 10), 204: ((0.5, 1.0), 5), 408: ((0.5, 1.0), 2)}, 12 | "240p": {1: (0.3, 297), 51: (0.4, 20), 102: ((0.4, 0.33), 10), 204: ((0.4, 1.0), 5), 408: ((0.4, 1.0), 2)}, 13 | # --- 14 | "360p": {1: (0.5, 141), 51: (0.15, 8), 102: ((0.3, 0.5), 4), 204: ((0.3, 1.0), 2), 408: ((0.5, 0.5), 1)}, 15 | "512": {1: (0.4, 141), 51: (0.15, 8), 102: ((0.2, 0.4), 4), 204: ((0.2, 1.0), 2), 408: ((0.4, 0.5), 1)}, 16 | # --- 17 | "480p": {1: (0.5, 89), 51: (0.2, 5), 102: (0.2, 2), 204: (0.1, 1)}, 18 | # --- 19 | "720p": {1: (0.1, 36), 51: (0.03, 1)}, 20 | "1024": {1: (0.1, 36), 51: (0.02, 1)}, 21 | # --- 22 | "1080p": {1: (0.01, 5)}, 23 | # --- 24 | "2048": {1: (0.01, 5)}, 25 | } 26 | 27 | grad_checkpoint = True 28 | 29 | # Acceleration settings 30 | num_workers = 8 31 | num_bucket_build_workers = 16 32 | #prefetch_factor = 3 33 | dtype = "bf16" 34 | plugin = "zero2" 35 | 36 | # Model settings 37 | model = dict( 38 | type="STDiT3-XL/2", 39 | from_pretrained=None, 40 | qk_norm=True, 41 | enable_flash_attn=True, 42 | enable_layernorm_kernel=True, 43 | 
freeze_y_embedder=True, 44 | ) 45 | vae = dict( 46 | type="OpenSoraVAE_V1_2", 47 | from_pretrained="hpcai-tech/OpenSora-VAE-v1.2", 48 | micro_frame_size=17, 49 | micro_batch_size=4, 50 | ) 51 | text_encoder = dict( 52 | type="t5", 53 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 54 | model_max_length=300, 55 | shardformer=True, 56 | ) 57 | scheduler = dict( 58 | type="rflow", 59 | use_timestep_transform=True, 60 | sample_method="logit-normal", 61 | ) 62 | 63 | # Mask settings 64 | # 25% 65 | mask_ratios = { 66 | "random": 0.005, 67 | "intepolate": 0.002, 68 | "quarter_random": 0.007, 69 | "quarter_head": 0.002, 70 | "quarter_tail": 0.002, 71 | "quarter_head_tail": 0.002, 72 | "image_random": 0.0, 73 | "image_head": 0.22, 74 | "image_tail": 0.005, 75 | "image_head_tail": 0.005, 76 | } 77 | 78 | 79 | # Log settings 80 | seed = 42 81 | outputs = "outputs_speedrun" 82 | wandb = True 83 | epochs = 5 84 | log_every = 10 85 | ckpt_every = 100 86 | 87 | # optimization settings 88 | load = None 89 | grad_clip = 1.0 90 | lr = 0.00016 91 | ema_decay = 0.99 92 | adam_eps = 1e-15 93 | warmup_steps = 400 94 | weight_decay = 0.01 -------------------------------------------------------------------------------- /configs/opensora-v1-2/lambda/stage3.py: -------------------------------------------------------------------------------- 1 | # Dataset settings 2 | dataset = dict( 3 | type="VariableVideoTextDataset", 4 | transform_name="resize_crop", 5 | ) 6 | 7 | # webvid 8 | bucket_config = { # 20s/it 9 | "144p": {1: (1.0, 475), 51: (1.0, 51), 102: (1.0, 27), 204: (1.0, 13), 408: (1.0, 6)}, 10 | # --- 11 | "256": {1: (1.0, 297), 51: (0.5, 20), 102: (0.5, 10), 204: (0.5, 5), 408: ((0.5, 0.5), 2)}, 12 | "240p": {1: (1.0, 297), 51: (0.5, 20), 102: (0.5, 10), 204: (0.5, 5), 408: ((0.5, 0.4), 2)}, 13 | # --- 14 | "360p": {1: (1.0, 141), 51: (0.5, 8), 102: (0.5, 4), 204: (0.5, 2), 408: ((0.5, 0.3), 1)}, 15 | "512": {1: (1.0, 141), 51: (0.5, 8), 102: (0.5, 4), 204: (0.5, 2), 408: ((0.5, 0.2), 1)}, 16 | # --- 17 | "480p": {1: (1.0, 89), 51: (0.5, 5), 102: (0.5, 3), 204: ((0.5, 0.5), 1), 408: (0.0, None)}, 18 | # --- 19 | "720p": {1: (0.3, 36), 51: (0.2, 2), 102: (0.1, 1), 204: (0.0, None)}, 20 | "1024": {1: (0.3, 36), 51: (0.1, 2), 102: (0.1, 1), 204: (0.0, None)}, 21 | # --- 22 | "1080p": {1: (0.1, 5)}, 23 | # --- 24 | "2048": {1: (0.05, 5)}, 25 | } 26 | 27 | grad_checkpoint = True 28 | 29 | # Acceleration settings 30 | num_workers = 8 31 | num_bucket_build_workers = 16 32 | #prefetch_factor = 2 33 | dtype = "bf16" 34 | plugin = "zero2" 35 | 36 | # Model settings 37 | model = dict( 38 | type="STDiT3-XL/2", 39 | from_pretrained=None, 40 | qk_norm=True, 41 | enable_flash_attn=True, 42 | enable_layernorm_kernel=True, 43 | freeze_y_embedder=True, 44 | ) 45 | vae = dict( 46 | type="OpenSoraVAE_V1_2", 47 | from_pretrained="hpcai-tech/OpenSora-VAE-v1.2", 48 | micro_frame_size=17, 49 | micro_batch_size=4, 50 | ) 51 | text_encoder = dict( 52 | type="t5", 53 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 54 | model_max_length=300, 55 | shardformer=True, 56 | ) 57 | scheduler = dict( 58 | type="rflow", 59 | use_timestep_transform=True, 60 | sample_method="logit-normal", 61 | ) 62 | 63 | # Mask settings 64 | # 25% 65 | mask_ratios = { 66 | "random": 0.01, 67 | "intepolate": 0.002, 68 | "quarter_random": 0.002, 69 | "quarter_head": 0.002, 70 | "quarter_tail": 0.002, 71 | "quarter_head_tail": 0.002, 72 | "image_random": 0.0, 73 | "image_head": 0.22, 74 | "image_tail": 0.005, 75 | "image_head_tail": 0.005, 76 | } 77 | 78 | # Log settings 79 
| seed = 42 80 | outputs = "outputs_speedrun" 81 | wandb = True 82 | epochs = 5 83 | log_every = 10 84 | ckpt_every = 100 85 | 86 | # optimization settings 87 | load = None 88 | grad_clip = 1.0 89 | lr = 2e-4 90 | ema_decay = 0.99 91 | adam_eps = 1e-15 92 | warmup_steps = 1000 93 | weight_decay = 0.01 94 | -------------------------------------------------------------------------------- /configs/opensora-v1-2/lambda/stage4.py: -------------------------------------------------------------------------------- 1 | # Dataset settings 2 | dataset = dict( 3 | type="VariableVideoTextDataset", 4 | transform_name="resize_crop", 5 | ) 6 | 7 | # webvid 8 | bucket_config = { # 12s/it 9 | "144p": {1: (1.0, 475), 51: (1.0, 51), 102: ((1.0, 0.33), 27), 204: ((1.0, 0.1), 13), 408: ((1.0, 0.1), 6)}, 10 | # --- 11 | "256": {1: (0.4, 297), 51: (0.5, 20), 102: ((0.5, 0.33), 10), 204: ((0.5, 0.1), 5), 408: ((0.5, 0.1), 2)}, 12 | "240p": {1: (0.3, 297), 51: (0.4, 20), 102: ((0.4, 0.33), 10), 204: ((0.4, 0.1), 5), 408: ((0.4, 0.1), 2)}, 13 | # --- 14 | "360p": {1: (0.2, 141), 51: (0.15, 8), 102: ((0.15, 0.33), 4), 204: ((0.15, 0.1), 2), 408: ((0.15, 0.1), 1)}, 15 | "512": {1: (0.1, 141)}, 16 | # --- 17 | "480p": {1: (0.1, 89)}, 18 | # --- 19 | "720p": {1: (0.05, 36)}, 20 | "1024": {1: (0.05, 36)}, 21 | # --- 22 | "1080p": {1: (0.1, 5)}, 23 | # --- 24 | "2048": {1: (0.1, 5)}, 25 | } 26 | 27 | grad_checkpoint = True 28 | 29 | # Acceleration settings 30 | num_workers = 8 31 | num_bucket_build_workers = 16 32 | #prefetch_factor = 2 33 | dtype = "bf16" 34 | plugin = "zero2" 35 | 36 | # Model settings 37 | model = dict( 38 | type="STDiT3-XL/2", 39 | from_pretrained=None, 40 | qk_norm=True, 41 | enable_flash_attn=True, 42 | enable_layernorm_kernel=True, 43 | freeze_y_embedder=True, 44 | ) 45 | vae = dict( 46 | type="OpenSoraVAE_V1_2", 47 | from_pretrained="hpcai-tech/OpenSora-VAE-v1.2", 48 | micro_frame_size=17, 49 | micro_batch_size=4, 50 | ) 51 | text_encoder = dict( 52 | type="t5", 53 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 54 | model_max_length=300, 55 | shardformer=True, 56 | ) 57 | scheduler = dict( 58 | type="rflow", 59 | use_timestep_transform=True, 60 | sample_method="logit-normal", 61 | ) 62 | 63 | # Mask settings 64 | mask_ratios = { 65 | "random": 0.0 66 | } 67 | 68 | # Log settings 69 | seed = 42 70 | outputs = "outputs_speedrun" 71 | wandb = True 72 | epochs = 10 73 | log_every = 10 74 | ckpt_every = 100 75 | 76 | # optimization settings 77 | load = None 78 | grad_clip = 1.0 79 | ema_decay = 0.99 80 | adam_eps = 1e-15 81 | weight_decay = 0.01 82 | 83 | # lr scheduler 84 | lr_schedule = "1cycle" 85 | anneal_strategy = "cos" 86 | warmup_steps = 400 87 | cooldown_steps = 400 88 | lr = 1.6e-4 89 | min_lr = 1.6e-5 90 | max_lr=6.4e-4 -------------------------------------------------------------------------------- /configs/opensora-v1-2/lambda/stage5.py: -------------------------------------------------------------------------------- 1 | # Dataset settings 2 | dataset = dict( 3 | type="VariableVideoTextDataset", 4 | transform_name="resize_crop", 5 | ) 6 | 7 | # webvid 8 | bucket_config = { # 12s/it 9 | "144p": {1: (1.0, 475), 51: (1.0, 51), 102: ((1.0, 0.33), 27), 204: ((1.0, 0.1), 13), 408: ((1.0, 0.1), 6)}, 10 | # --- 11 | "256": {1: (0.4, 297), 51: (0.5, 20), 102: ((0.5, 0.33), 10), 204: ((0.5, 1.0), 5), 408: ((0.5, 1.0), 2)}, 12 | "240p": {1: (0.3, 297), 51: (0.4, 20), 102: ((0.4, 0.33), 10), 204: ((0.4, 1.0), 5), 408: ((0.4, 1.0), 2)}, 13 | # --- 14 | "360p": {1: (0.5, 141), 51: (0.15, 8), 102: ((0.3, 
0.5), 4), 204: ((0.3, 1.0), 2), 408: ((0.5, 0.5), 1)}, 15 | "512": {1: (0.4, 141), 51: (0.15, 8), 102: ((0.2, 0.4), 4), 204: ((0.2, 1.0), 2), 408: ((0.4, 0.5), 1)}, 16 | # --- 17 | "480p": {1: (0.5, 89), 51: (0.2, 5), 102: (0.2, 2), 204: (0.1, 1)}, 18 | # --- 19 | "720p": {1: (0.1, 36), 51: (0.03, 1)}, 20 | "1024": {1: (0.1, 36), 51: (0.02, 1)}, 21 | # --- 22 | "1080p": {1: (0.01, 5)}, 23 | # --- 24 | "2048": {1: (0.01, 5)}, 25 | } 26 | 27 | grad_checkpoint = True 28 | 29 | # Acceleration settings 30 | num_workers = 8 31 | num_bucket_build_workers = 16 32 | #prefetch_factor = 2 33 | dtype = "bf16" 34 | plugin = "zero2" 35 | 36 | # Model settings 37 | model = dict( 38 | type="STDiT3-XL/2", 39 | from_pretrained=None, 40 | qk_norm=True, 41 | enable_flash_attn=True, 42 | enable_layernorm_kernel=True, 43 | freeze_y_embedder=True, 44 | ) 45 | vae = dict( 46 | type="OpenSoraVAE_V1_2", 47 | from_pretrained="hpcai-tech/OpenSora-VAE-v1.2", 48 | micro_frame_size=17, 49 | micro_batch_size=4, 50 | ) 51 | text_encoder = dict( 52 | type="t5", 53 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 54 | model_max_length=300, 55 | shardformer=True, 56 | ) 57 | scheduler = dict( 58 | type="rflow", 59 | use_timestep_transform=True, 60 | sample_method="logit-normal", 61 | ) 62 | 63 | # Mask settings 64 | mask_ratios = { 65 | "random": 0.0 66 | } 67 | 68 | # Log settings 69 | seed = 42 70 | outputs = "outputs_speedrun" 71 | wandb = True 72 | epochs = 10 73 | log_every = 10 74 | ckpt_every = 100 75 | 76 | # optimization settings 77 | load = None 78 | grad_clip = 1.0 79 | ema_decay = 0.99 80 | adam_eps = 1e-15 81 | weight_decay = 0.01 82 | 83 | # lr scheduler 84 | lr_schedule = "1cycle" 85 | anneal_strategy = "cos" 86 | warmup_steps = 400 87 | cooldown_steps = 400 88 | lr = 1.6e-4 89 | min_lr = 1.6e-5 90 | max_lr=6.4e-4 -------------------------------------------------------------------------------- /configs/opensora-v1-2/lambda/stage6.py: -------------------------------------------------------------------------------- 1 | # Dataset settings 2 | dataset = dict( 3 | type="VariableVideoTextDataset", 4 | transform_name="resize_crop", 5 | ) 6 | 7 | # webvid 8 | bucket_config = { # 20s/it 9 | "144p": {1: (1.0, 475), 51: (1.0, 51), 102: (1.0, 27), 204: (1.0, 13), 408: (1.0, 6)}, 10 | # --- 11 | "256": {1: (1.0, 297), 51: (0.5, 20), 102: (0.5, 10), 204: (0.5, 5), 408: ((0.5, 0.5), 2)}, 12 | "240p": {1: (1.0, 297), 51: (0.5, 20), 102: (0.5, 10), 204: (0.5, 5), 408: ((0.5, 0.4), 2)}, 13 | # --- 14 | "360p": {1: (1.0, 141), 51: (0.5, 8), 102: (0.5, 4), 204: (0.5, 2), 408: ((0.5, 0.3), 1)}, 15 | "512": {1: (1.0, 141), 51: (0.5, 8), 102: (0.5, 4), 204: (0.5, 2), 408: ((0.5, 0.2), 1)}, 16 | # --- 17 | "480p": {1: (1.0, 89), 51: (0.5, 5), 102: (0.5, 3), 204: ((0.5, 0.5), 1), 408: (0.0, None)}, 18 | # --- 19 | "720p": {1: (0.3, 36), 51: (0.2, 2), 102: (0.1, 1), 204: (0.0, None)}, 20 | "1024": {1: (0.3, 36), 51: (0.1, 2), 102: (0.1, 1), 204: (0.0, None)}, 21 | # --- 22 | "1080p": {1: (0.1, 5)}, 23 | # --- 24 | "2048": {1: (0.05, 5)}, 25 | } 26 | 27 | grad_checkpoint = True 28 | 29 | # Acceleration settings 30 | num_workers = 8 31 | num_bucket_build_workers = 16 32 | #prefetch_factor = 2 33 | dtype = "bf16" 34 | plugin = "zero2" 35 | 36 | # Model settings 37 | model = dict( 38 | type="STDiT3-XL/2", 39 | from_pretrained=None, 40 | qk_norm=True, 41 | enable_flash_attn=True, 42 | enable_layernorm_kernel=True, 43 | freeze_y_embedder=True, 44 | ) 45 | vae = dict( 46 | type="OpenSoraVAE_V1_2", 47 | 
from_pretrained="hpcai-tech/OpenSora-VAE-v1.2", 48 | micro_frame_size=17, 49 | micro_batch_size=4, 50 | ) 51 | text_encoder = dict( 52 | type="t5", 53 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 54 | model_max_length=300, 55 | shardformer=True, 56 | ) 57 | scheduler = dict( 58 | type="rflow", 59 | use_timestep_transform=True, 60 | sample_method="logit-normal", 61 | ) 62 | 63 | # Mask settings 64 | mask_ratios = { 65 | "random": 0.0 66 | } 67 | 68 | # Log settings 69 | seed = 42 70 | outputs = "outputs_speedrun" 71 | wandb = True 72 | epochs = 10 73 | log_every = 10 74 | ckpt_every = 100 75 | 76 | # optimization settings 77 | load = None 78 | grad_clip = 1.0 79 | ema_decay = 0.99 80 | adam_eps = 1e-15 81 | weight_decay = 0.01 82 | 83 | # lr scheduler 84 | lr_schedule = "1cycle" 85 | anneal_strategy = "cos" 86 | warmup_steps = 400 87 | cooldown_steps = 400 88 | lr = 0.8e-4 89 | min_lr = 1.6e-5 90 | max_lr=3.2e-4 -------------------------------------------------------------------------------- /configs/opensora-v1-2/misc/eval_loss.py: -------------------------------------------------------------------------------- 1 | num_workers = 8 2 | dtype = "bf16" 3 | seed = 42 4 | num_eval_timesteps = 10 5 | 6 | # Dataset settings 7 | dataset = dict( 8 | type="VariableVideoTextDataset", 9 | transform_name="resize_crop", 10 | ) 11 | 12 | bucket_config = { 13 | "144p": {1: (None, 100), 51: (None, 30), 102: (None, 20), 204: (None, 8), 408: (None, 4)}, 14 | # --- 15 | "240p": {1: (None, 100), 51: (None, 24), 102: (None, 12), 204: (None, 4), 408: (None, 2)}, 16 | # --- 17 | "360p": {1: (None, 60), 51: (None, 12), 102: (None, 6), 204: (None, 2), 408: (None, 1)}, 18 | # --- 19 | "480p": {1: (None, 40), 51: (None, 6), 102: (None, 3), 204: (None, 1)}, 20 | # --- 21 | "720p": {1: (None, 20), 51: (None, 2), 102: (None, 1)}, 22 | # --- 23 | "1080p": {1: (None, 10)}, 24 | # --- 25 | "2048": {1: (None, 5)}, 26 | } 27 | 28 | # Model settings 29 | model = dict( 30 | type="STDiT3-XL/2", 31 | from_pretrained=None, 32 | qk_norm=True, 33 | enable_flash_attn=True, 34 | enable_layernorm_kernel=True, 35 | ) 36 | vae = dict( 37 | type="OpenSoraVAE_V1_2", 38 | from_pretrained="hpcai-tech/OpenSora-VAE-v1.2", 39 | micro_frame_size=17, 40 | micro_batch_size=4, 41 | local_files_only=True, 42 | ) 43 | text_encoder = dict( 44 | type="t5", 45 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 46 | model_max_length=300, 47 | local_files_only=True, 48 | ) 49 | scheduler = dict(type="rflow") 50 | -------------------------------------------------------------------------------- /configs/opensora-v1-2/misc/extract.py: -------------------------------------------------------------------------------- 1 | # Dataset settings 2 | dataset = dict( 3 | type="VariableVideoTextDataset", 4 | transform_name="resize_crop", 5 | ) 6 | 7 | # webvid 8 | bucket_config = { # 12s/it 9 | "144p": {1: (1.0, 475), 51: (1.0, 51), 102: ((1.0, 0.33), 27), 204: ((1.0, 0.1), 13), 408: ((1.0, 0.1), 6)}, 10 | # --- 11 | "256": {1: (0.4, 297), 51: (0.5, 20), 102: ((0.5, 0.33), 10), 204: ((0.5, 0.1), 5), 408: ((0.5, 0.1), 2)}, 12 | "240p": {1: (0.3, 297), 51: (0.4, 20), 102: ((0.4, 0.33), 10), 204: ((0.4, 0.1), 5), 408: ((0.4, 0.1), 2)}, 13 | # --- 14 | "360p": {1: (0.2, 141), 51: (0.15, 8), 102: ((0.15, 0.33), 4), 204: ((0.15, 0.1), 2), 408: ((0.15, 0.1), 1)}, 15 | "512": {1: (0.1, 141)}, 16 | # --- 17 | "480p": {1: (0.1, 89)}, 18 | # --- 19 | "720p": {1: (0.05, 36)}, 20 | "1024": {1: (0.05, 36)}, 21 | # --- 22 | "1080p": {1: (0.1, 5)}, 23 | # --- 24 | "2048": {1: (0.1, 5)}, 25 | } 
26 | 27 | # Acceleration settings 28 | num_workers = 8 29 | num_bucket_build_workers = 16 30 | dtype = "bf16" 31 | seed = 42 32 | outputs = "outputs" 33 | wandb = False 34 | 35 | 36 | # Model settings 37 | model = dict( 38 | type="STDiT3-XL/2", 39 | from_pretrained="hpcai-tech/OpenSora-STDiT-v3", 40 | qk_norm=True, 41 | enable_flash_attn=True, 42 | enable_layernorm_kernel=True, 43 | ) 44 | vae = dict( 45 | type="OpenSoraVAE_V1_2", 46 | from_pretrained="hpcai-tech/OpenSora-VAE-v1.2", 47 | micro_frame_size=17, 48 | micro_batch_size=32, 49 | ) 50 | text_encoder = dict( 51 | type="t5", 52 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 53 | model_max_length=300, 54 | shardformer=True, 55 | local_files_only=True, 56 | ) 57 | 58 | # feature extraction settings 59 | save_text_features = True 60 | save_compressed_text_features = True 61 | bin_size = 250 # 1GB, 4195 bins 62 | log_time = False 63 | -------------------------------------------------------------------------------- /configs/opensora-v1-2/misc/feat.py: -------------------------------------------------------------------------------- 1 | # Dataset settings 2 | dataset = dict( 3 | type="VariableVideoTextDataset", 4 | transform_name="resize_crop", 5 | dummy_text_feature=True, 6 | ) 7 | 8 | # webvid 9 | bucket_config = { # 12s/it 10 | "144p": {1: (1.0, 475), 51: (1.0, 51), 102: ((1.0, 0.33), 27), 204: ((1.0, 0.1), 13), 408: ((1.0, 0.1), 6)}, 11 | # --- 12 | "256": {1: (0.4, 297), 51: (0.5, 20), 102: ((0.5, 0.33), 10), 204: ((0.5, 0.1), 5), 408: ((0.5, 0.1), 2)}, 13 | "240p": {1: (0.3, 297), 51: (0.4, 20), 102: ((0.4, 0.33), 10), 204: ((0.4, 0.1), 5), 408: ((0.4, 0.1), 2)}, 14 | # --- 15 | "360p": {1: (0.2, 141), 51: (0.15, 8), 102: ((0.15, 0.33), 4), 204: ((0.15, 0.1), 2), 408: ((0.15, 0.1), 1)}, 16 | "512": {1: (0.1, 141)}, 17 | # --- 18 | "480p": {1: (0.1, 89)}, 19 | # --- 20 | "720p": {1: (0.05, 36)}, 21 | "1024": {1: (0.05, 36)}, 22 | # --- 23 | "1080p": {1: (0.1, 5)}, 24 | # --- 25 | "2048": {1: (0.1, 5)}, 26 | } 27 | 28 | grad_checkpoint = True 29 | 30 | load_text_features = True 31 | 32 | # Acceleration settings 33 | num_workers = 0 34 | num_bucket_build_workers = 16 35 | dtype = "bf16" 36 | plugin = "zero2" 37 | 38 | # Model settings 39 | model = dict( 40 | type="STDiT3-XL/2", 41 | from_pretrained=None, 42 | qk_norm=True, 43 | enable_flash_attn=True, 44 | enable_layernorm_kernel=True, 45 | freeze_y_embedder=True, 46 | skip_y_embedder=True, 47 | ) 48 | vae = dict( 49 | type="OpenSoraVAE_V1_2", 50 | from_pretrained="hpcai-tech/OpenSora-VAE-v1.2", 51 | micro_frame_size=17, 52 | micro_batch_size=4, 53 | ) 54 | text_encoder = dict( 55 | type="t5", 56 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 57 | model_max_length=300, 58 | shardformer=True, 59 | local_files_only=True, 60 | ) 61 | scheduler = dict( 62 | type="rflow", 63 | use_timestep_transform=True, 64 | sample_method="logit-normal", 65 | ) 66 | 67 | # Mask settings 68 | mask_ratios = { 69 | "random": 0.2, 70 | "intepolate": 0.01, 71 | "quarter_random": 0.01, 72 | "quarter_head": 0.01, 73 | "quarter_tail": 0.01, 74 | "quarter_head_tail": 0.01, 75 | "image_random": 0.05, 76 | "image_head": 0.1, 77 | "image_tail": 0.05, 78 | "image_head_tail": 0.05, 79 | } 80 | 81 | # Log settings 82 | seed = 42 83 | outputs = "outputs" 84 | wandb = False 85 | epochs = 1000 86 | log_every = 10 87 | ckpt_every = 1 88 | 89 | # optimization settings 90 | load = None 91 | grad_clip = 1.0 92 | lr = 2e-4 93 | ema_decay = 0.99 94 | adam_eps = 1e-15 95 | 
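The opensora-v1-2 configs above are ordinary Python modules: bucket_config maps a resolution name to {num_frames: (probability, batch_size)} entries (a probability of None or 0.0 with a None batch size effectively disables a bucket, and probabilities written as tuples carry an extra weight whose exact meaning is defined by the training code), while mask_ratios assigns a sampling probability to each masking strategy. Because they are plain Python, they can be sanity-checked offline. The sketch below is an illustration only and is not part of the repository: it loads a config with the standard-library runpy module (the training entry points use their own config loader) and verifies that the mask probabilities sum to at most 1 and that every bucket probability lies in [0, 1].

# check_config.py -- minimal sketch, not part of the repository.
# Assumes the dict-style config format shown above,
# e.g. configs/opensora-v1-2/train/stage2.py.
import runpy
import sys


def check(path: str) -> None:
    cfg = runpy.run_path(path)  # execute the config file and collect its globals

    # mask_ratios are per-strategy sampling probabilities; they should not exceed 1.
    mask_total = sum(cfg.get("mask_ratios", {}).values())
    assert 0.0 <= mask_total <= 1.0, f"mask_ratios sum to {mask_total}"

    # bucket_config: resolution -> {num_frames: (probability, batch_size)};
    # the probability may be None, a float, or a tuple of floats in these configs.
    for res, frames in cfg.get("bucket_config", {}).items():
        for num_frames, (prob, batch_size) in frames.items():
            probs = prob if isinstance(prob, tuple) else (prob,)
            assert all(p is None or 0.0 <= p <= 1.0 for p in probs), (res, num_frames, prob)
            assert batch_size is None or batch_size >= 1, (res, num_frames, batch_size)

    print(f"{path}: mask_ratios total {mask_total:.3f}, "
          f"{len(cfg.get('bucket_config', {}))} resolution buckets")


if __name__ == "__main__":
    check(sys.argv[1])

Example usage (the script name is hypothetical): python check_config.py configs/opensora-v1-2/train/stage2.py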
-------------------------------------------------------------------------------- /configs/opensora-v1-2/train/adapt.py: -------------------------------------------------------------------------------- 1 | # Dataset settings 2 | dataset = dict( 3 | type="VariableVideoTextDataset", 4 | transform_name="resize_crop", 5 | ) 6 | bucket_config = { # 2s/it 7 | "144p": {1: (0.5, 48), 34: (1.0, 2), 51: (1.0, 4), 102: (1.0, 2), 204: (1.0, 1)}, 8 | # --- 9 | "256": {1: (0.6, 20), 34: (0.5, 2), 51: (0.5, 1), 68: (0.5, 1), 136: (0.0, None)}, 10 | "240p": {1: (0.6, 20), 34: (0.5, 2), 51: (0.5, 1), 68: (0.5, 1), 136: (0.0, None)}, 11 | # --- 12 | "360p": {1: (0.5, 8), 34: (0.2, 1), 102: (0.0, None)}, 13 | "512": {1: (0.5, 8), 34: (0.2, 1), 102: (0.0, None)}, 14 | # --- 15 | "480p": {1: (0.2, 4), 17: (0.3, 1), 68: (0.0, None)}, 16 | # --- 17 | "720p": {1: (0.1, 2)}, 18 | "1024": {1: (0.1, 2)}, 19 | # --- 20 | "1080p": {1: (0.1, 1)}, 21 | } 22 | grad_checkpoint = False 23 | 24 | # Acceleration settings 25 | num_workers = 8 26 | num_bucket_build_workers = 16 27 | dtype = "bf16" 28 | plugin = "zero2" 29 | 30 | # Model settings 31 | model = dict( 32 | type="STDiT3-XL/2", 33 | from_pretrained=None, 34 | qk_norm=True, 35 | enable_flash_attn=True, 36 | enable_layernorm_kernel=True, 37 | ) 38 | vae = dict( 39 | type="OpenSoraVAE_V1_2", 40 | from_pretrained="hpcai-tech/OpenSora-VAE-v1.2", 41 | micro_frame_size=17, 42 | micro_batch_size=4, 43 | ) 44 | text_encoder = dict( 45 | type="t5", 46 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 47 | model_max_length=300, 48 | shardformer=True, 49 | ) 50 | scheduler = dict( 51 | type="rflow", 52 | use_timestep_transform=True, 53 | sample_method="logit-normal", 54 | ) 55 | 56 | # Mask settings 57 | mask_ratios = { 58 | "random": 0.2, 59 | "intepolate": 0.01, 60 | "quarter_random": 0.01, 61 | "quarter_head": 0.01, 62 | "quarter_tail": 0.01, 63 | "quarter_head_tail": 0.01, 64 | "image_random": 0.05, 65 | "image_head": 0.1, 66 | "image_tail": 0.05, 67 | "image_head_tail": 0.05, 68 | } 69 | 70 | # Log settings 71 | seed = 42 72 | outputs = "outputs" 73 | wandb = False 74 | epochs = 1000 75 | log_every = 10 76 | ckpt_every = 500 77 | 78 | # optimization settings 79 | load = None 80 | grad_clip = 1.0 81 | lr = 1e-4 82 | ema_decay = 0.99 83 | adam_eps = 1e-15 84 | -------------------------------------------------------------------------------- /configs/opensora-v1-2/train/demo_360p.py: -------------------------------------------------------------------------------- 1 | # Dataset settings 2 | dataset = dict( 3 | type="VariableVideoTextDataset", 4 | transform_name="resize_crop", 5 | ) 6 | 7 | # webvid 8 | bucket_config = {"360p": {102: (1.0, 1)}} 9 | grad_checkpoint = True 10 | 11 | # Acceleration settings 12 | num_workers = 8 13 | num_bucket_build_workers = 16 14 | dtype = "bf16" 15 | plugin = "zero2" 16 | 17 | # Model settings 18 | model = dict( 19 | type="STDiT3-XL/2", 20 | from_pretrained=None, 21 | qk_norm=True, 22 | enable_flash_attn=True, 23 | enable_layernorm_kernel=True, 24 | freeze_y_embedder=True, 25 | ) 26 | vae = dict( 27 | type="OpenSoraVAE_V1_2", 28 | from_pretrained="hpcai-tech/OpenSora-VAE-v1.2", 29 | micro_frame_size=17, 30 | micro_batch_size=4, 31 | ) 32 | text_encoder = dict( 33 | type="t5", 34 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 35 | model_max_length=300, 36 | shardformer=True, 37 | ) 38 | scheduler = dict( 39 | type="rflow", 40 | use_timestep_transform=True, 41 | sample_method="logit-normal", 42 | ) 43 | 44 | # Log settings 45 | seed = 42 46 | outputs = 
"outputs" 47 | wandb = False 48 | epochs = 1000 49 | log_every = 10 50 | ckpt_every = 200 51 | 52 | # optimization settings 53 | load = None 54 | grad_clip = 1.0 55 | lr = 1e-4 56 | ema_decay = 0.99 57 | adam_eps = 1e-15 58 | warmup_steps = 1000 59 | -------------------------------------------------------------------------------- /configs/opensora-v1-2/train/demo_480p.py: -------------------------------------------------------------------------------- 1 | # Dataset settings 2 | dataset = dict( 3 | type="VariableVideoTextDataset", 4 | transform_name="resize_crop", 5 | ) 6 | 7 | # webvid 8 | bucket_config = {"480p": {51: (0.5, 5)}} 9 | grad_checkpoint = True 10 | 11 | # Acceleration settings 12 | num_workers = 8 13 | num_bucket_build_workers = 16 14 | dtype = "bf16" 15 | plugin = "zero2" 16 | 17 | # Model settings 18 | model = dict( 19 | type="STDiT3-XL/2", 20 | from_pretrained=None, 21 | qk_norm=True, 22 | enable_flash_attn=True, 23 | enable_layernorm_kernel=True, 24 | freeze_y_embedder=True, 25 | ) 26 | vae = dict( 27 | type="OpenSoraVAE_V1_2", 28 | from_pretrained="hpcai-tech/OpenSora-VAE-v1.2", 29 | micro_frame_size=17, 30 | micro_batch_size=4, 31 | ) 32 | text_encoder = dict( 33 | type="t5", 34 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 35 | model_max_length=300, 36 | shardformer=True, 37 | ) 38 | scheduler = dict( 39 | type="rflow", 40 | use_timestep_transform=True, 41 | sample_method="logit-normal", 42 | ) 43 | 44 | # Log settings 45 | seed = 42 46 | outputs = "outputs" 47 | wandb = False 48 | epochs = 1000 49 | log_every = 10 50 | ckpt_every = 200 51 | 52 | # optimization settings 53 | load = None 54 | grad_clip = 1.0 55 | lr = 1e-4 56 | ema_decay = 0.99 57 | adam_eps = 1e-15 58 | warmup_steps = 1000 59 | -------------------------------------------------------------------------------- /configs/opensora-v1-2/train/stage1_feat.py: -------------------------------------------------------------------------------- 1 | # Dataset settings 2 | dataset = dict(type="BatchFeatureDataset") 3 | grad_checkpoint = True 4 | num_workers = 4 5 | 6 | # Acceleration settings 7 | dtype = "bf16" 8 | plugin = "zero2" 9 | 10 | # Model settings 11 | model = dict( 12 | type="STDiT3-XL/2", 13 | from_pretrained=None, 14 | qk_norm=True, 15 | enable_flash_attn=True, 16 | enable_layernorm_kernel=True, 17 | freeze_y_embedder=True, 18 | skip_y_embedder=True, 19 | ) 20 | scheduler = dict( 21 | type="rflow", 22 | use_timestep_transform=True, 23 | sample_method="logit-normal", 24 | ) 25 | 26 | vae_out_channels = 4 27 | model_max_length = 300 28 | text_encoder_output_dim = 4096 29 | load_video_features = True 30 | load_text_features = True 31 | 32 | # Mask settings 33 | mask_ratios = { 34 | "random": 0.2, 35 | "intepolate": 0.01, 36 | "quarter_random": 0.01, 37 | "quarter_head": 0.01, 38 | "quarter_tail": 0.01, 39 | "quarter_head_tail": 0.01, 40 | "image_random": 0.05, 41 | "image_head": 0.1, 42 | "image_tail": 0.05, 43 | "image_head_tail": 0.05, 44 | } 45 | 46 | # Log settings 47 | seed = 42 48 | outputs = "outputs" 49 | wandb = False 50 | epochs = 1000 51 | log_every = 10 52 | ckpt_every = 500 53 | 54 | # optimization settings 55 | load = None 56 | grad_clip = 1.0 57 | lr = 2e-4 58 | ema_decay = 0.99 59 | adam_eps = 1e-15 60 | -------------------------------------------------------------------------------- /configs/opensora-v1-2/train/stage2.py: -------------------------------------------------------------------------------- 1 | # Dataset settings 2 | dataset = dict( 3 | type="VariableVideoTextDataset", 4 | 
transform_name="resize_crop", 5 | ) 6 | 7 | # webvid 8 | bucket_config = { # 12s/it 9 | "144p": {1: (1.0, 475), 51: (1.0, 51), 102: ((1.0, 0.33), 27), 204: ((1.0, 0.1), 13), 408: ((1.0, 0.1), 6)}, 10 | # --- 11 | "256": {1: (0.4, 297), 51: (0.5, 20), 102: ((0.5, 0.33), 10), 204: ((0.5, 1.0), 5), 408: ((0.5, 1.0), 2)}, 12 | "240p": {1: (0.3, 297), 51: (0.4, 20), 102: ((0.4, 0.33), 10), 204: ((0.4, 1.0), 5), 408: ((0.4, 1.0), 2)}, 13 | # --- 14 | "360p": {1: (0.5, 141), 51: (0.15, 8), 102: ((0.3, 0.5), 4), 204: ((0.3, 1.0), 2), 408: ((0.5, 0.5), 1)}, 15 | "512": {1: (0.4, 141), 51: (0.15, 8), 102: ((0.2, 0.4), 4), 204: ((0.2, 1.0), 2), 408: ((0.4, 0.5), 1)}, 16 | # --- 17 | "480p": {1: (0.5, 89), 51: (0.2, 5), 102: (0.2, 2), 204: (0.1, 1)}, 18 | # --- 19 | "720p": {1: (0.1, 36), 51: (0.03, 1)}, 20 | "1024": {1: (0.1, 36), 51: (0.02, 1)}, 21 | # --- 22 | "1080p": {1: (0.01, 5)}, 23 | # --- 24 | "2048": {1: (0.01, 5)}, 25 | } 26 | 27 | grad_checkpoint = True 28 | 29 | # Acceleration settings 30 | num_workers = 8 31 | num_bucket_build_workers = 16 32 | dtype = "bf16" 33 | plugin = "zero2" 34 | 35 | # Model settings 36 | model = dict( 37 | type="STDiT3-XL/2", 38 | from_pretrained=None, 39 | qk_norm=True, 40 | enable_flash_attn=True, 41 | enable_layernorm_kernel=True, 42 | freeze_y_embedder=True, 43 | ) 44 | vae = dict( 45 | type="OpenSoraVAE_V1_2", 46 | from_pretrained="hpcai-tech/OpenSora-VAE-v1.2", 47 | micro_frame_size=17, 48 | micro_batch_size=4, 49 | ) 50 | text_encoder = dict( 51 | type="t5", 52 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 53 | model_max_length=300, 54 | shardformer=True, 55 | ) 56 | scheduler = dict( 57 | type="rflow", 58 | use_timestep_transform=True, 59 | sample_method="logit-normal", 60 | ) 61 | 62 | # Mask settings 63 | # 25% 64 | mask_ratios = { 65 | "random": 0.005, 66 | "intepolate": 0.002, 67 | "quarter_random": 0.007, 68 | "quarter_head": 0.002, 69 | "quarter_tail": 0.002, 70 | "quarter_head_tail": 0.002, 71 | "image_random": 0.0, 72 | "image_head": 0.22, 73 | "image_tail": 0.005, 74 | "image_head_tail": 0.005, 75 | } 76 | 77 | 78 | # Log settings 79 | seed = 42 80 | outputs = "outputs" 81 | wandb = False 82 | epochs = 1000 83 | log_every = 10 84 | ckpt_every = 200 85 | 86 | # optimization settings 87 | load = None 88 | grad_clip = 1.0 89 | lr = 1e-4 90 | ema_decay = 0.99 91 | adam_eps = 1e-15 92 | -------------------------------------------------------------------------------- /configs/opensora-v1-2/train/stage3.py: -------------------------------------------------------------------------------- 1 | # Dataset settings 2 | dataset = dict( 3 | type="VariableVideoTextDataset", 4 | transform_name="resize_crop", 5 | ) 6 | 7 | # webvid 8 | bucket_config = { # 20s/it 9 | "144p": {1: (1.0, 475), 51: (1.0, 51), 102: (1.0, 27), 204: (1.0, 13), 408: (1.0, 6)}, 10 | # --- 11 | "256": {1: (1.0, 297), 51: (0.5, 20), 102: (0.5, 10), 204: (0.5, 5), 408: ((0.5, 0.5), 2)}, 12 | "240p": {1: (1.0, 297), 51: (0.5, 20), 102: (0.5, 10), 204: (0.5, 5), 408: ((0.5, 0.4), 2)}, 13 | # --- 14 | "360p": {1: (1.0, 141), 51: (0.5, 8), 102: (0.5, 4), 204: (0.5, 2), 408: ((0.5, 0.3), 1)}, 15 | "512": {1: (1.0, 141), 51: (0.5, 8), 102: (0.5, 4), 204: (0.5, 2), 408: ((0.5, 0.2), 1)}, 16 | # --- 17 | "480p": {1: (1.0, 89), 51: (0.5, 5), 102: (0.5, 3), 204: ((0.5, 0.5), 1), 408: (0.0, None)}, 18 | # --- 19 | "720p": {1: (0.3, 36), 51: (0.2, 2), 102: (0.1, 1), 204: (0.0, None)}, 20 | "1024": {1: (0.3, 36), 51: (0.1, 2), 102: (0.1, 1), 204: (0.0, None)}, 21 | # --- 22 | "1080p": {1: (0.1, 5)}, 23 | # 
--- 24 | "2048": {1: (0.05, 5)}, 25 | } 26 | 27 | grad_checkpoint = True 28 | 29 | # Acceleration settings 30 | num_workers = 8 31 | num_bucket_build_workers = 16 32 | dtype = "bf16" 33 | plugin = "zero2" 34 | 35 | # Model settings 36 | model = dict( 37 | type="STDiT3-XL/2", 38 | from_pretrained=None, 39 | qk_norm=True, 40 | enable_flash_attn=True, 41 | enable_layernorm_kernel=True, 42 | freeze_y_embedder=True, 43 | ) 44 | vae = dict( 45 | type="OpenSoraVAE_V1_2", 46 | from_pretrained="hpcai-tech/OpenSora-VAE-v1.2", 47 | micro_frame_size=17, 48 | micro_batch_size=4, 49 | ) 50 | text_encoder = dict( 51 | type="t5", 52 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 53 | model_max_length=300, 54 | shardformer=True, 55 | ) 56 | scheduler = dict( 57 | type="rflow", 58 | use_timestep_transform=True, 59 | sample_method="logit-normal", 60 | ) 61 | 62 | # Mask settings 63 | # 25% 64 | mask_ratios = { 65 | "random": 0.01, 66 | "intepolate": 0.002, 67 | "quarter_random": 0.002, 68 | "quarter_head": 0.002, 69 | "quarter_tail": 0.002, 70 | "quarter_head_tail": 0.002, 71 | "image_random": 0.0, 72 | "image_head": 0.22, 73 | "image_tail": 0.005, 74 | "image_head_tail": 0.005, 75 | } 76 | 77 | # Log settings 78 | seed = 42 79 | outputs = "outputs" 80 | wandb = False 81 | epochs = 1000 82 | log_every = 10 83 | ckpt_every = 200 84 | 85 | # optimization settings 86 | load = None 87 | grad_clip = 1.0 88 | lr = 1e-4 89 | ema_decay = 0.99 90 | adam_eps = 1e-15 91 | warmup_steps = 1000 92 | -------------------------------------------------------------------------------- /configs/opensora/inference/16x256x256.py: -------------------------------------------------------------------------------- 1 | num_frames = 16 2 | fps = 24 // 3 3 | image_size = (256, 256) 4 | 5 | # Define model 6 | model = dict( 7 | type="STDiT-XL/2", 8 | space_scale=0.5, 9 | time_scale=1.0, 10 | enable_flash_attn=True, 11 | enable_layernorm_kernel=True, 12 | from_pretrained="PRETRAINED_MODEL", 13 | ) 14 | vae = dict( 15 | type="VideoAutoencoderKL", 16 | from_pretrained="stabilityai/sd-vae-ft-ema", 17 | micro_batch_size=4, 18 | ) 19 | text_encoder = dict( 20 | type="t5", 21 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 22 | model_max_length=120, 23 | ) 24 | scheduler = dict( 25 | type="iddpm", 26 | num_sampling_steps=100, 27 | cfg_scale=7.0, 28 | cfg_channel=3, # or None 29 | ) 30 | dtype = "bf16" 31 | 32 | # Condition 33 | prompt_path = "./assets/texts/t2v_samples.txt" 34 | prompt = None # prompt has higher priority than prompt_path 35 | 36 | # Others 37 | batch_size = 1 38 | seed = 42 39 | save_dir = "./samples/samples/" 40 | -------------------------------------------------------------------------------- /configs/opensora/inference/16x512x512-rflow.py: -------------------------------------------------------------------------------- 1 | num_frames = 16 2 | fps = 24 // 3 3 | image_size = (512, 512) 4 | 5 | # Define model 6 | model = dict( 7 | type="STDiT-XL/2", 8 | space_scale=1.0, 9 | time_scale=1.0, 10 | enable_flash_attn=True, 11 | enable_layernorm_kernel=True, 12 | from_pretrained="PRETRAINED_MODEL", 13 | ) 14 | vae = dict( 15 | type="VideoAutoencoderKL", 16 | from_pretrained="stabilityai/sd-vae-ft-ema", 17 | micro_batch_size=2, 18 | ) 19 | text_encoder = dict( 20 | type="t5", 21 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 22 | model_max_length=120, 23 | ) 24 | scheduler = dict( 25 | type="rflow", 26 | num_sampling_steps=10, 27 | cfg_scale=7.0, 28 | ) 29 | dtype = "bf16" 30 | 31 | # Others 32 | batch_size = 2 33 | seed = 42 34 | prompt_path = 
"./assets/texts/t2v_samples.txt" 35 | save_dir = "./outputs/samples/" 36 | -------------------------------------------------------------------------------- /configs/opensora/inference/16x512x512.py: -------------------------------------------------------------------------------- 1 | num_frames = 16 2 | fps = 24 // 3 3 | image_size = (512, 512) 4 | 5 | # Define model 6 | model = dict( 7 | type="STDiT-XL/2", 8 | space_scale=1.0, 9 | time_scale=1.0, 10 | enable_flash_attn=True, 11 | enable_layernorm_kernel=True, 12 | from_pretrained="PRETRAINED_MODEL", 13 | ) 14 | vae = dict( 15 | type="VideoAutoencoderKL", 16 | from_pretrained="stabilityai/sd-vae-ft-ema", 17 | micro_batch_size=2, 18 | ) 19 | text_encoder = dict( 20 | type="t5", 21 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 22 | model_max_length=120, 23 | ) 24 | scheduler = dict( 25 | type="iddpm", 26 | num_sampling_steps=100, 27 | cfg_scale=7.0, 28 | ) 29 | dtype = "bf16" 30 | 31 | # Others 32 | batch_size = 2 33 | seed = 42 34 | prompt_path = "./assets/texts/t2v_samples.txt" 35 | save_dir = "./samples/samples/" 36 | -------------------------------------------------------------------------------- /configs/opensora/inference/64x512x512.py: -------------------------------------------------------------------------------- 1 | num_frames = 64 2 | fps = 24 // 2 3 | image_size = (512, 512) 4 | 5 | # Define model 6 | model = dict( 7 | type="STDiT-XL/2", 8 | space_scale=1.0, 9 | time_scale=2 / 3, 10 | enable_flash_attn=True, 11 | enable_layernorm_kernel=True, 12 | from_pretrained="PRETRAINED_MODEL", 13 | ) 14 | vae = dict( 15 | type="VideoAutoencoderKL", 16 | from_pretrained="stabilityai/sd-vae-ft-ema", 17 | micro_batch_size=128, 18 | ) 19 | text_encoder = dict( 20 | type="t5", 21 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 22 | model_max_length=120, 23 | ) 24 | scheduler = dict( 25 | type="iddpm", 26 | num_sampling_steps=100, 27 | cfg_scale=7.0, 28 | ) 29 | dtype = "bf16" 30 | 31 | # Others 32 | batch_size = 1 33 | seed = 42 34 | prompt_path = "./assets/texts/t2v_samples.txt" 35 | save_dir = "./samples/samples/" 36 | -------------------------------------------------------------------------------- /configs/opensora/train/16x256x256-mask.py: -------------------------------------------------------------------------------- 1 | # Define dataset 2 | dataset = dict( 3 | type="VideoTextDataset", 4 | data_path=None, 5 | num_frames=16, 6 | frame_interval=3, 7 | image_size=(256, 256), 8 | ) 9 | 10 | # Define acceleration 11 | num_workers = 4 12 | dtype = "bf16" 13 | grad_checkpoint = True 14 | plugin = "zero2" 15 | sp_size = 1 16 | 17 | # Define model 18 | model = dict( 19 | type="STDiT-XL/2", 20 | space_scale=0.5, 21 | time_scale=1.0, 22 | from_pretrained="PixArt-XL-2-512x512.pth", 23 | enable_flash_attn=True, 24 | enable_layernorm_kernel=True, 25 | ) 26 | mask_ratios = { 27 | "identity": 0.7, 28 | "random": 0.15, 29 | "mask_head": 0.05, 30 | "mask_tail": 0.05, 31 | "mask_head_tail": 0.05, 32 | } 33 | vae = dict( 34 | type="VideoAutoencoderKL", 35 | from_pretrained="stabilityai/sd-vae-ft-ema", 36 | ) 37 | text_encoder = dict( 38 | type="t5", 39 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 40 | model_max_length=120, 41 | shardformer=True, 42 | ) 43 | scheduler = dict( 44 | type="iddpm", 45 | timestep_respacing="", 46 | ) 47 | 48 | # Others 49 | seed = 42 50 | outputs = "outputs" 51 | wandb = False 52 | 53 | epochs = 1000 54 | log_every = 10 55 | ckpt_every = 1000 56 | load = None 57 | 58 | batch_size = 8 59 | lr = 2e-5 60 | grad_clip = 1.0 61 | 
-------------------------------------------------------------------------------- /configs/opensora/train/16x256x256-spee-rflow.py: -------------------------------------------------------------------------------- 1 | # Define dataset 2 | dataset = dict( 3 | type="VideoTextDataset", 4 | data_path=None, 5 | num_frames=16, 6 | frame_interval=3, 7 | image_size=(256, 256), 8 | ) 9 | 10 | # Define acceleration 11 | num_workers = 4 12 | dtype = "bf16" 13 | grad_checkpoint = True 14 | plugin = "zero2" 15 | sp_size = 1 16 | 17 | # Define model 18 | model = dict( 19 | type="STDiT-XL/2", 20 | space_scale=0.5, 21 | time_scale=1.0, 22 | # from_pretrained="PixArt-XL-2-512x512.pth", 23 | # from_pretrained = "/home/zhaowangbo/wangbo/PixArt-alpha/pretrained_models/OpenSora-v1-HQ-16x512x512.pth", 24 | # from_pretrained = "OpenSora-v1-HQ-16x512x512.pth", 25 | from_pretrained="PRETRAINED_MODEL", 26 | enable_flash_attn=True, 27 | enable_layernorm_kernel=True, 28 | ) 29 | # mask_ratios = [0.5, 0.29, 0.07, 0.07, 0.07] 30 | # mask_ratios = { 31 | # "identity": 0.9, 32 | # "random": 0.06, 33 | # "mask_head": 0.01, 34 | # "mask_tail": 0.01, 35 | # "mask_head_tail": 0.02, 36 | # } 37 | vae = dict( 38 | type="VideoAutoencoderKL", 39 | from_pretrained="stabilityai/sd-vae-ft-ema", 40 | ) 41 | text_encoder = dict( 42 | type="t5", 43 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 44 | model_max_length=120, 45 | shardformer=True, 46 | ) 47 | scheduler = dict( 48 | type="rflow", 49 | # timestep_respacing="", 50 | ) 51 | 52 | # Others 53 | seed = 42 54 | outputs = "outputs" 55 | wandb = True 56 | 57 | epochs = 1 58 | log_every = 10 59 | ckpt_every = 1000 60 | load = None 61 | 62 | batch_size = 16 63 | lr = 2e-5 64 | grad_clip = 1.0 65 | -------------------------------------------------------------------------------- /configs/opensora/train/16x256x256-spee.py: -------------------------------------------------------------------------------- 1 | # Define dataset 2 | dataset = dict( 3 | type="VideoTextDataset", 4 | data_path=None, 5 | num_frames=16, 6 | frame_interval=3, 7 | image_size=(256, 256), 8 | ) 9 | 10 | # Define acceleration 11 | num_workers = 4 12 | dtype = "bf16" 13 | grad_checkpoint = True 14 | plugin = "zero2" 15 | sp_size = 1 16 | 17 | # Define model 18 | model = dict( 19 | type="STDiT-XL/2", 20 | space_scale=0.5, 21 | time_scale=1.0, 22 | from_pretrained="PixArt-XL-2-512x512.pth", 23 | enable_flash_attn=True, 24 | enable_layernorm_kernel=True, 25 | ) 26 | mask_ratios = { 27 | "identity": 0.5, 28 | "random": 0.29, 29 | "mask_head": 0.07, 30 | "mask_tail": 0.07, 31 | "mask_head_tail": 0.07, 32 | } 33 | vae = dict( 34 | type="VideoAutoencoderKL", 35 | from_pretrained="stabilityai/sd-vae-ft-ema", 36 | ) 37 | text_encoder = dict( 38 | type="t5", 39 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 40 | model_max_length=120, 41 | shardformer=True, 42 | ) 43 | scheduler = dict( 44 | type="iddpm-speed", 45 | timestep_respacing="", 46 | ) 47 | 48 | # Others 49 | seed = 42 50 | outputs = "outputs" 51 | wandb = False 52 | 53 | epochs = 1000 54 | log_every = 10 55 | ckpt_every = 1000 56 | load = None 57 | 58 | batch_size = 8 59 | lr = 2e-5 60 | grad_clip = 1.0 61 | -------------------------------------------------------------------------------- /configs/opensora/train/16x256x256.py: -------------------------------------------------------------------------------- 1 | # Define dataset 2 | dataset = dict( 3 | type="VideoTextDataset", 4 | data_path=None, 5 | num_frames=16, 6 | frame_interval=3, 7 | image_size=(256, 256), 8 | ) 9 | 10 | # 
Define acceleration 11 | num_workers = 0 12 | dtype = "bf16" 13 | grad_checkpoint = True 14 | plugin = "zero2" 15 | sp_size = 1 16 | 17 | # Define model 18 | model = dict( 19 | type="STDiT-XL/2", 20 | space_scale=0.5, 21 | time_scale=1.0, 22 | from_pretrained="PixArt-XL-2-512x512.pth", 23 | enable_flash_attn=True, 24 | enable_layernorm_kernel=True, 25 | ) 26 | vae = dict( 27 | type="VideoAutoencoderKL", 28 | from_pretrained="stabilityai/sd-vae-ft-ema", 29 | ) 30 | text_encoder = dict( 31 | type="t5", 32 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 33 | model_max_length=120, 34 | shardformer=True, 35 | ) 36 | scheduler = dict( 37 | type="iddpm", 38 | timestep_respacing="", 39 | ) 40 | 41 | # Others 42 | seed = 42 43 | outputs = "outputs" 44 | wandb = False 45 | 46 | epochs = 1000 47 | log_every = 10 48 | ckpt_every = 1000 49 | load = None 50 | 51 | batch_size = 8 52 | lr = 2e-5 53 | grad_clip = 1.0 54 | -------------------------------------------------------------------------------- /configs/opensora/train/16x512x512.py: -------------------------------------------------------------------------------- 1 | # Define dataset 2 | dataset = dict( 3 | type="VideoTextDataset", 4 | data_path=None, 5 | num_frames=16, 6 | frame_interval=3, 7 | image_size=(512, 512), 8 | ) 9 | 10 | # Define acceleration 11 | num_workers = 4 12 | dtype = "bf16" 13 | grad_checkpoint = True 14 | plugin = "zero2" 15 | sp_size = 1 16 | 17 | # Define model 18 | model = dict( 19 | type="STDiT-XL/2", 20 | space_scale=1.0, 21 | time_scale=1.0, 22 | from_pretrained=None, 23 | enable_flash_attn=True, 24 | enable_layernorm_kernel=True, 25 | ) 26 | vae = dict( 27 | type="VideoAutoencoderKL", 28 | from_pretrained="stabilityai/sd-vae-ft-ema", 29 | micro_batch_size=128, 30 | ) 31 | text_encoder = dict( 32 | type="t5", 33 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 34 | model_max_length=120, 35 | shardformer=True, 36 | ) 37 | scheduler = dict( 38 | type="iddpm", 39 | timestep_respacing="", 40 | ) 41 | 42 | # Others 43 | seed = 42 44 | outputs = "outputs" 45 | wandb = False 46 | 47 | epochs = 1000 48 | log_every = 10 49 | ckpt_every = 500 50 | load = None 51 | 52 | batch_size = 8 53 | lr = 2e-5 54 | grad_clip = 1.0 55 | -------------------------------------------------------------------------------- /configs/opensora/train/360x512x512.py: -------------------------------------------------------------------------------- 1 | # Define dataset 2 | dataset = dict( 3 | type="VideoTextDataset", 4 | data_path=None, 5 | num_frames=360, 6 | frame_interval=3, 7 | image_size=(512, 512), 8 | ) 9 | 10 | # Define acceleration 11 | num_workers = 4 12 | dtype = "bf16" 13 | grad_checkpoint = True 14 | plugin = "zero2" 15 | sp_size = 1 16 | 17 | # Define acceleration 18 | dtype = "bf16" 19 | grad_checkpoint = True 20 | plugin = "zero2-seq" 21 | sp_size = 2 22 | 23 | # Define model 24 | model = dict( 25 | type="STDiT-XL/2", 26 | space_scale=1.0, 27 | time_scale=2 / 3, 28 | from_pretrained=None, 29 | enable_flash_attn=True, 30 | enable_layernorm_kernel=True, 31 | enable_sequence_parallelism=True, # enable sq here 32 | ) 33 | vae = dict( 34 | type="VideoAutoencoderKL", 35 | from_pretrained="stabilityai/sd-vae-ft-ema", 36 | micro_batch_size=128, 37 | ) 38 | text_encoder = dict( 39 | type="t5", 40 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 41 | model_max_length=120, 42 | shardformer=True, 43 | ) 44 | scheduler = dict( 45 | type="iddpm", 46 | timestep_respacing="", 47 | ) 48 | 49 | # Others 50 | seed = 42 51 | outputs = "outputs" 52 | wandb = False 53 | 54 | epochs 
= 1000 55 | log_every = 10 56 | ckpt_every = 250 57 | load = None 58 | 59 | batch_size = 1 60 | lr = 2e-5 61 | grad_clip = 1.0 62 | -------------------------------------------------------------------------------- /configs/opensora/train/64x512x512-sp.py: -------------------------------------------------------------------------------- 1 | # Define dataset 2 | dataset = dict( 3 | type="VideoTextDataset", 4 | data_path=None, 5 | num_frames=16, 6 | frame_interval=3, 7 | image_size=(512, 512), 8 | ) 9 | 10 | # Define acceleration 11 | num_workers = 4 12 | dtype = "bf16" 13 | grad_checkpoint = True 14 | plugin = "zero2" 15 | sp_size = 2 16 | 17 | # Define model 18 | model = dict( 19 | type="STDiT-XL/2", 20 | space_scale=1.0, 21 | time_scale=2 / 3, 22 | from_pretrained=None, 23 | enable_flash_attn=True, 24 | enable_layernorm_kernel=True, 25 | enable_sequence_parallelism=True, # enable sq here 26 | ) 27 | vae = dict( 28 | type="VideoAutoencoderKL", 29 | from_pretrained="stabilityai/sd-vae-ft-ema", 30 | ) 31 | text_encoder = dict( 32 | type="t5", 33 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 34 | model_max_length=120, 35 | shardformer=True, 36 | ) 37 | scheduler = dict( 38 | type="iddpm", 39 | timestep_respacing="", 40 | ) 41 | 42 | # Others 43 | seed = 42 44 | outputs = "outputs" 45 | wandb = False 46 | 47 | epochs = 1000 48 | log_every = 10 49 | ckpt_every = 1000 50 | load = None 51 | 52 | batch_size = 1 53 | lr = 2e-5 54 | grad_clip = 1.0 55 | -------------------------------------------------------------------------------- /configs/opensora/train/64x512x512.py: -------------------------------------------------------------------------------- 1 | # Define dataset 2 | dataset = dict( 3 | type="VideoTextDataset", 4 | data_path=None, 5 | num_frames=64, 6 | frame_interval=3, 7 | image_size=(512, 512), 8 | ) 9 | 10 | # Define acceleration 11 | num_workers = 4 12 | dtype = "bf16" 13 | grad_checkpoint = True 14 | plugin = "zero2" 15 | sp_size = 1 16 | 17 | # Define model 18 | model = dict( 19 | type="STDiT-XL/2", 20 | space_scale=1.0, 21 | time_scale=2 / 3, 22 | from_pretrained=None, 23 | enable_flash_attn=True, 24 | enable_layernorm_kernel=True, 25 | ) 26 | vae = dict( 27 | type="VideoAutoencoderKL", 28 | from_pretrained="stabilityai/sd-vae-ft-ema", 29 | micro_batch_size=64, 30 | ) 31 | text_encoder = dict( 32 | type="t5", 33 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 34 | model_max_length=120, 35 | shardformer=True, 36 | ) 37 | scheduler = dict( 38 | type="iddpm", 39 | timestep_respacing="", 40 | ) 41 | 42 | # Others 43 | seed = 42 44 | outputs = "outputs" 45 | wandb = False 46 | 47 | epochs = 1000 48 | log_every = 10 49 | ckpt_every = 250 50 | load = None 51 | 52 | batch_size = 4 53 | lr = 2e-5 54 | grad_clip = 1.0 55 | -------------------------------------------------------------------------------- /configs/pixart/inference/16x256x256.py: -------------------------------------------------------------------------------- 1 | num_frames = 16 2 | fps = 8 3 | image_size = (256, 256) 4 | 5 | # Define model 6 | model = dict( 7 | type="PixArt-XL/2", 8 | space_scale=0.5, 9 | time_scale=1.0, 10 | from_pretrained="outputs/098-F16S3-PixArt-XL-2/epoch7-global_step30000/model_ckpt.pt", 11 | ) 12 | vae = dict( 13 | type="VideoAutoencoderKL", 14 | from_pretrained="stabilityai/sd-vae-ft-ema", 15 | ) 16 | text_encoder = dict( 17 | type="t5", 18 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 19 | model_max_length=120, 20 | ) 21 | scheduler = dict( 22 | type="dpm-solver", 23 | num_sampling_steps=20, 24 | cfg_scale=7.0, 25 | 
) 26 | dtype = "bf16" 27 | 28 | # Others 29 | batch_size = 2 30 | seed = 42 31 | prompt_path = "./assets/texts/t2v_samples.txt" 32 | save_dir = "./samples/samples/" 33 | -------------------------------------------------------------------------------- /configs/pixart/inference/1x1024MS.py: -------------------------------------------------------------------------------- 1 | num_frames = 1 2 | fps = 1 3 | image_size = (1920, 512) 4 | multi_resolution = "PixArtMS" 5 | 6 | # Define model 7 | model = dict( 8 | type="PixArtMS-XL/2", 9 | space_scale=2.0, 10 | time_scale=1.0, 11 | no_temporal_pos_emb=True, 12 | from_pretrained="PixArt-XL-2-1024-MS.pth", 13 | ) 14 | vae = dict( 15 | type="VideoAutoencoderKL", 16 | from_pretrained="stabilityai/sd-vae-ft-ema", 17 | ) 18 | text_encoder = dict( 19 | type="t5", 20 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 21 | model_max_length=120, 22 | ) 23 | scheduler = dict( 24 | type="dpm-solver", 25 | num_sampling_steps=20, 26 | cfg_scale=7.0, 27 | ) 28 | dtype = "bf16" 29 | 30 | # Others 31 | batch_size = 2 32 | seed = 42 33 | prompt_path = "./assets/texts/t2i_samples.txt" 34 | save_dir = "./samples/samples/" 35 | -------------------------------------------------------------------------------- /configs/pixart/inference/1x20481B.py: -------------------------------------------------------------------------------- 1 | num_frames = 1 2 | fps = 1 3 | image_size = (2560, 1536) 4 | # image_size = (2048, 2048) 5 | 6 | model = dict( 7 | type="PixArt-1B/2", 8 | from_pretrained="PixArt-1B-2.pth", 9 | space_scale=4, 10 | no_temporal_pos_emb=True, 11 | enable_flash_attn=True, 12 | enable_layernorm_kernel=True, 13 | base_size=2048 // 8, 14 | ) 15 | vae = dict( 16 | type="VideoAutoencoderKL", 17 | from_pretrained="PixArt-alpha/pixart_sigma_sdxlvae_T5_diffusers", 18 | subfolder="vae", 19 | ) 20 | text_encoder = dict( 21 | type="t5", 22 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 23 | model_max_length=300, 24 | ) 25 | scheduler = dict( 26 | type="dpm-solver", 27 | num_sampling_steps=14, 28 | cfg_scale=4.5, 29 | ) 30 | dtype = "bf16" 31 | 32 | # Others 33 | batch_size = 1 34 | seed = 42 35 | prompt_path = "./assets/texts/t2i_sigma.txt" 36 | save_dir = "./samples/samples/" 37 | -------------------------------------------------------------------------------- /configs/pixart/inference/1x2048MS.py: -------------------------------------------------------------------------------- 1 | num_frames = 1 2 | fps = 1 3 | # image_size = (2560, 1536) 4 | # image_size = (2048, 2048) 5 | 6 | model = dict( 7 | type="PixArt-XL/2", 8 | from_pretrained="PixArt-Sigma-XL-2-2K-MS.pth", 9 | space_scale=4, 10 | no_temporal_pos_emb=True, 11 | enable_flash_attn=True, 12 | enable_layernorm_kernel=True, 13 | base_size=2048 // 8, 14 | ) 15 | vae = dict( 16 | type="VideoAutoencoderKL", 17 | from_pretrained="PixArt-alpha/pixart_sigma_sdxlvae_T5_diffusers", 18 | subfolder="vae", 19 | scaling_factor=0.13025, 20 | ) 21 | text_encoder = dict( 22 | type="t5", 23 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 24 | model_max_length=300, 25 | ) 26 | scheduler = dict( 27 | type="dpm-solver", 28 | num_sampling_steps=14, 29 | cfg_scale=4.5, 30 | ) 31 | dtype = "bf16" 32 | 33 | # Others 34 | batch_size = 1 35 | seed = 42 36 | prompt_path = "./assets/texts/t2i_sigma.txt" 37 | save_dir = "./samples/samples/" 38 | -------------------------------------------------------------------------------- /configs/pixart/inference/1x256x256.py: -------------------------------------------------------------------------------- 1 | num_frames = 1 2 
| fps = 1 3 | image_size = (256, 256) 4 | 5 | # Define model 6 | model = dict( 7 | type="PixArt-XL/2", 8 | space_scale=1.0, 9 | time_scale=1.0, 10 | no_temporal_pos_emb=True, 11 | from_pretrained="PixArt-XL-2-256x256.pth", 12 | ) 13 | vae = dict( 14 | type="VideoAutoencoderKL", 15 | from_pretrained="stabilityai/sd-vae-ft-ema", 16 | ) 17 | text_encoder = dict( 18 | type="t5", 19 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 20 | model_max_length=120, 21 | ) 22 | scheduler = dict( 23 | type="dpm-solver", 24 | num_sampling_steps=20, 25 | cfg_scale=7.0, 26 | ) 27 | dtype = "bf16" 28 | 29 | # Others 30 | batch_size = 2 31 | seed = 42 32 | prompt_path = "./assets/texts/t2i_samples.txt" 33 | save_dir = "./samples/samples/" 34 | -------------------------------------------------------------------------------- /configs/pixart/inference/1x512x512-rflow.py: -------------------------------------------------------------------------------- 1 | num_frames = 1 2 | fps = 1 3 | image_size = (512, 512) 4 | 5 | # Define model 6 | model = dict( 7 | type="PixArt-XL/2", 8 | space_scale=1.0, 9 | time_scale=1.0, 10 | no_temporal_pos_emb=True, 11 | from_pretrained="PRETRAINED_MODEL", 12 | ) 13 | vae = dict( 14 | type="VideoAutoencoderKL", 15 | from_pretrained="stabilityai/sd-vae-ft-ema", 16 | ) 17 | text_encoder = dict( 18 | type="t5", 19 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 20 | model_max_length=120, 21 | ) 22 | scheduler = dict( 23 | type="rflow", 24 | num_sampling_steps=20, 25 | cfg_scale=7.0, 26 | ) 27 | dtype = "bf16" 28 | 29 | # prompt_path = "./assets/texts/t2i_samples.txt" 30 | prompt = [ 31 | "Pirate ship trapped in a cosmic maelstrom nebula.", 32 | "A small cactus with a happy face in the Sahara desert.", 33 | "A small cactus with a sad face in the Sahara desert.", 34 | ] 35 | 36 | # Others 37 | batch_size = 2 38 | seed = 42 39 | save_dir = "./outputs/samples2/" 40 | -------------------------------------------------------------------------------- /configs/pixart/inference/1x512x512.py: -------------------------------------------------------------------------------- 1 | num_frames = 1 2 | fps = 1 3 | image_size = (512, 512) 4 | 5 | # Define model 6 | model = dict( 7 | type="PixArt-XL/2", 8 | space_scale=1.0, 9 | time_scale=1.0, 10 | no_temporal_pos_emb=True, 11 | from_pretrained="PixArt-XL-2-512x512.pth", 12 | ) 13 | vae = dict( 14 | type="VideoAutoencoderKL", 15 | from_pretrained="stabilityai/sd-vae-ft-ema", 16 | ) 17 | text_encoder = dict( 18 | type="t5", 19 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 20 | model_max_length=120, 21 | ) 22 | scheduler = dict( 23 | type="dpm-solver", 24 | num_sampling_steps=20, 25 | cfg_scale=7.0, 26 | ) 27 | dtype = "bf16" 28 | 29 | # prompt_path = "./assets/texts/t2i_samples.txt" 30 | prompt = [ 31 | "Pirate ship trapped in a cosmic maelstrom nebula.", 32 | "A small cactus with a happy face in the Sahara desert.", 33 | "A small cactus with a sad face in the Sahara desert.", 34 | ] 35 | 36 | # Others 37 | batch_size = 2 38 | seed = 42 39 | save_dir = "./samples/samples/" 40 | -------------------------------------------------------------------------------- /configs/pixart/train/16x256x256.py: -------------------------------------------------------------------------------- 1 | # Define dataset 2 | dataset = dict( 3 | type="VideoTextDataset", 4 | data_path=None, 5 | num_frames=16, 6 | frame_interval=3, 7 | image_size=(256, 256), 8 | ) 9 | 10 | # Define acceleration 11 | num_workers = 4 12 | dtype = "bf16" 13 | grad_checkpoint = True 14 | plugin = "zero2" 15 | sp_size = 1 16 
| 17 | # Define model 18 | model = dict( 19 | type="PixArt-XL/2", 20 | space_scale=0.5, 21 | time_scale=1.0, 22 | from_pretrained="PixArt-XL-2-512x512.pth", 23 | enable_flash_attn=True, 24 | enable_layernorm_kernel=True, 25 | ) 26 | vae = dict( 27 | type="VideoAutoencoderKL", 28 | from_pretrained="stabilityai/sd-vae-ft-ema", 29 | ) 30 | text_encoder = dict( 31 | type="t5", 32 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 33 | model_max_length=120, 34 | shardformer=True, 35 | ) 36 | scheduler = dict( 37 | type="iddpm", 38 | timestep_respacing="", 39 | ) 40 | 41 | # Others 42 | seed = 42 43 | outputs = "outputs" 44 | wandb = False 45 | 46 | epochs = 1000 47 | log_every = 10 48 | ckpt_every = 1000 49 | load = None 50 | 51 | batch_size = 8 52 | lr = 2e-5 53 | grad_clip = 1.0 54 | -------------------------------------------------------------------------------- /configs/pixart/train/1x2048x2048.py: -------------------------------------------------------------------------------- 1 | # Define dataset 2 | dataset = dict( 3 | type="VideoTextDataset", 4 | data_path="/home/zhaowangbo/data/csv/image-v1_1_ext_noempty_rcp_clean_info.csv", 5 | num_frames=1, 6 | frame_interval=3, 7 | image_size=(2048, 2048), 8 | ) 9 | 10 | # Define acceleration 11 | num_workers = 4 12 | dtype = "bf16" 13 | grad_checkpoint = True 14 | plugin = "zero2" 15 | sp_size = 1 16 | 17 | # Define model 18 | model = dict( 19 | type="PixArt-1B/2", 20 | space_scale=4.0, 21 | no_temporal_pos_emb=True, 22 | from_pretrained="PixArt-1B-2.pth", 23 | enable_flash_attn=True, 24 | enable_layernorm_kernel=True, 25 | ) 26 | 27 | vae = dict( 28 | type="VideoAutoencoderKL", 29 | from_pretrained="PixArt-alpha/pixart_sigma_sdxlvae_T5_diffusers", 30 | subfolder="vae", 31 | ) 32 | text_encoder = dict( 33 | type="t5", 34 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 35 | model_max_length=300, 36 | ) 37 | scheduler = dict( 38 | type="iddpm", 39 | timestep_respacing="", 40 | ) 41 | 42 | # Others 43 | seed = 42 44 | outputs = "outputs" 45 | wandb = False 46 | 47 | epochs = 1000 48 | log_every = 10 49 | ckpt_every = 1000 50 | load = None 51 | 52 | batch_size = 4 53 | lr = 2e-5 54 | grad_clip = 1.0 55 | -------------------------------------------------------------------------------- /configs/pixart/train/1x512x512-rflow.py: -------------------------------------------------------------------------------- 1 | # Define dataset 2 | dataset = dict( 3 | type="VideoTextDataset", 4 | data_path=None, 5 | num_frames=1, 6 | frame_interval=3, 7 | image_size=(512, 512), 8 | ) 9 | 10 | # Define acceleration 11 | num_workers = 4 12 | dtype = "bf16" 13 | grad_checkpoint = True 14 | plugin = "zero2" 15 | sp_size = 1 16 | 17 | # Define model 18 | model = dict( 19 | type="PixArt-XL/2", 20 | space_scale=1.0, 21 | time_scale=1.0, 22 | no_temporal_pos_emb=True, 23 | # from_pretrained="PixArt-XL-2-512x512.pth", 24 | from_pretrained="PRETRAINED_MODEL", 25 | enable_flash_attn=True, 26 | enable_layernorm_kernel=True, 27 | ) 28 | vae = dict( 29 | type="VideoAutoencoderKL", 30 | from_pretrained="stabilityai/sd-vae-ft-ema", 31 | ) 32 | text_encoder = dict( 33 | type="t5", 34 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 35 | model_max_length=120, 36 | shardformer=True, 37 | ) 38 | scheduler = dict( 39 | type="rflow", 40 | # timestep_respacing="", 41 | ) 42 | 43 | # Others 44 | seed = 42 45 | outputs = "outputs" 46 | wandb = True 47 | 48 | epochs = 2 49 | log_every = 10 50 | ckpt_every = 1000 51 | load = None 52 | 53 | batch_size = 64 54 | lr = 2e-5 55 | grad_clip = 1.0 56 | 
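The `rflow` scheduler selected in `1x512x512-rflow.py` above (and in `16x256x256-spee-rflow.py` earlier) swaps the iDDPM objective for rectified flow. Below is a minimal sketch of the usual rectified-flow training loss, assuming the common straight-line parameterisation `x_t = (1 - t) * x0 + t * noise` with a velocity target; the repository's scheduler may differ in timestep sampling, scaling, and loss weighting, so treat this as illustrative only.

```python
import torch
import torch.nn.functional as F


def rflow_loss(model, x0: torch.Tensor, cond: dict) -> torch.Tensor:
    """One rectified-flow step: regress the constant velocity along the data-noise line."""
    noise = torch.randn_like(x0)
    t = torch.rand(x0.shape[0], device=x0.device)   # t ~ U(0, 1), one per sample
    t_ = t.view(-1, *([1] * (x0.dim() - 1)))        # broadcast over C, T, H, W
    x_t = (1 - t_) * x0 + t_ * noise                # straight-line interpolation
    v_target = noise - x0                           # velocity is constant along that line
    v_pred = model(x_t, t, **cond)                  # conditioning kwargs are assumed
    return F.mse_loss(v_pred, v_target)
```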
-------------------------------------------------------------------------------- /configs/pixart/train/1x512x512.py: -------------------------------------------------------------------------------- 1 | # Define dataset 2 | dataset = dict( 3 | type="VideoTextDataset", 4 | data_path=None, 5 | num_frames=1, 6 | frame_interval=3, 7 | image_size=(512, 512), 8 | ) 9 | 10 | # Define acceleration 11 | num_workers = 4 12 | dtype = "bf16" 13 | grad_checkpoint = True 14 | plugin = "zero2" 15 | sp_size = 1 16 | 17 | # Define model 18 | model = dict( 19 | type="PixArt-XL/2", 20 | space_scale=1.0, 21 | time_scale=1.0, 22 | no_temporal_pos_emb=True, 23 | from_pretrained="PixArt-XL-2-512x512.pth", 24 | enable_flash_attn=True, 25 | enable_layernorm_kernel=True, 26 | ) 27 | vae = dict( 28 | type="VideoAutoencoderKL", 29 | from_pretrained="stabilityai/sd-vae-ft-ema", 30 | ) 31 | text_encoder = dict( 32 | type="t5", 33 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 34 | model_max_length=120, 35 | shardformer=True, 36 | ) 37 | scheduler = dict( 38 | type="iddpm", 39 | timestep_respacing="", 40 | ) 41 | 42 | # Others 43 | seed = 42 44 | outputs = "outputs" 45 | wandb = False 46 | 47 | epochs = 1000 48 | log_every = 10 49 | ckpt_every = 1000 50 | load = None 51 | 52 | batch_size = 32 53 | lr = 2e-5 54 | grad_clip = 1.0 55 | -------------------------------------------------------------------------------- /configs/pixart/train/64x512x512.py: -------------------------------------------------------------------------------- 1 | # Define dataset 2 | dataset = dict( 3 | type="VideoTextDataset", 4 | data_path=None, 5 | num_frames=64, 6 | frame_interval=3, 7 | image_size=(256, 256), 8 | ) 9 | 10 | # Define acceleration 11 | num_workers = 4 12 | dtype = "bf16" 13 | grad_checkpoint = True 14 | plugin = "zero2" 15 | sp_size = 1 16 | 17 | 18 | # Define model 19 | model = dict( 20 | type="PixArt-XL/2", 21 | space_scale=1.0, 22 | time_scale=2 / 3, 23 | from_pretrained=None, 24 | enable_flash_attn=True, 25 | enable_layernorm_kernel=True, 26 | ) 27 | vae = dict( 28 | type="VideoAutoencoderKL", 29 | from_pretrained="stabilityai/sd-vae-ft-ema", 30 | micro_batch_size=128, 31 | ) 32 | text_encoder = dict( 33 | type="t5", 34 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 35 | model_max_length=120, 36 | shardformer=True, 37 | ) 38 | scheduler = dict( 39 | type="iddpm", 40 | timestep_respacing="", 41 | ) 42 | 43 | # Others 44 | seed = 42 45 | outputs = "outputs" 46 | wandb = False 47 | 48 | epochs = 1000 49 | log_every = 10 50 | ckpt_every = 250 51 | load = None 52 | 53 | batch_size = 4 54 | lr = 2e-5 55 | grad_clip = 1.0 56 | -------------------------------------------------------------------------------- /configs/vae/inference/image.py: -------------------------------------------------------------------------------- 1 | image_size = (256, 256) 2 | num_frames = 1 3 | 4 | dtype = "bf16" 5 | batch_size = 1 6 | seed = 42 7 | save_dir = "samples/vae_video" 8 | cal_stats = True 9 | log_stats_every = 100 10 | 11 | # Define dataset 12 | dataset = dict( 13 | type="VideoTextDataset", 14 | data_path=None, 15 | num_frames=num_frames, 16 | image_size=image_size, 17 | ) 18 | num_samples = 100 19 | num_workers = 4 20 | 21 | # Define model 22 | model = dict( 23 | type="OpenSoraVAE_V1_2", 24 | from_pretrained="hpcai-tech/OpenSora-VAE-v1.2", 25 | micro_frame_size=None, 26 | micro_batch_size=4, 27 | cal_loss=True, 28 | ) 29 | 30 | # loss weights 31 | perceptual_loss_weight = 0.1 # use vgg is not None and more than 0 32 | kl_loss_weight = 1e-6 33 | 
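The `micro_batch_size` / `micro_frame_size` options in the VAE inference config above exist to bound peak memory: the autoencoder processes a large batch (or a long clip) in chunks instead of all at once. A rough sketch of that pattern follows, assuming a `[B, C, T, H, W]` tensor layout and a `vae.encode` call that preserves the temporal axis; the actual `OpenSoraVAE_V1_2` interface and chunking axes may differ.

```python
from typing import Optional

import torch


def encode_in_chunks(
    vae,
    video: torch.Tensor,                  # [B, C, T, H, W]
    micro_batch_size: int = 4,
    micro_frame_size: Optional[int] = None,
) -> torch.Tensor:
    """Encode `video` micro-batch by micro-batch (and optionally frame-chunk by frame-chunk)."""
    outs = []
    for b in range(0, video.shape[0], micro_batch_size):
        clip = video[b : b + micro_batch_size]
        if micro_frame_size is None:
            outs.append(vae.encode(clip))
        else:
            frame_chunks = [
                vae.encode(clip[:, :, f : f + micro_frame_size])
                for f in range(0, clip.shape[2], micro_frame_size)
            ]
            outs.append(torch.cat(frame_chunks, dim=2))  # re-join along the (latent) time axis
    return torch.cat(outs, dim=0)
```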
-------------------------------------------------------------------------------- /configs/vae/inference/video.py: -------------------------------------------------------------------------------- 1 | image_size = (256, 256) 2 | num_frames = 51 3 | 4 | dtype = "bf16" 5 | batch_size = 1 6 | seed = 42 7 | save_dir = "samples/vae_video" 8 | cal_stats = True 9 | log_stats_every = 100 10 | 11 | # Define dataset 12 | dataset = dict( 13 | type="VideoTextDataset", 14 | data_path=None, 15 | num_frames=num_frames, 16 | image_size=image_size, 17 | ) 18 | num_samples = 100 19 | num_workers = 4 20 | 21 | # Define model 22 | model = dict( 23 | type="OpenSoraVAE_V1_2", 24 | from_pretrained="hpcai-tech/OpenSora-VAE-v1.2", 25 | micro_frame_size=None, 26 | micro_batch_size=4, 27 | cal_loss=True, 28 | ) 29 | 30 | # loss weights 31 | perceptual_loss_weight = 0.1 # use vgg is not None and more than 0 32 | kl_loss_weight = 1e-6 33 | -------------------------------------------------------------------------------- /configs/vae/train/stage1.py: -------------------------------------------------------------------------------- 1 | num_frames = 17 2 | image_size = (256, 256) 3 | 4 | # Define dataset 5 | dataset = dict( 6 | type="VideoTextDataset", 7 | data_path=None, 8 | num_frames=num_frames, 9 | frame_interval=1, 10 | image_size=image_size, 11 | ) 12 | 13 | # Define acceleration 14 | num_workers = 16 15 | dtype = "bf16" 16 | grad_checkpoint = True 17 | plugin = "zero2" 18 | 19 | # Define model 20 | model = dict( 21 | type="OpenSoraVAE_V1_2", 22 | freeze_vae_2d=True, 23 | from_pretrained=None, 24 | cal_loss=True, 25 | ) 26 | 27 | # loss weights 28 | perceptual_loss_weight = 0.1 # use vgg is not None and more than 0 29 | kl_loss_weight = 1e-6 30 | 31 | mixed_strategy = "mixed_video_image" 32 | mixed_image_ratio = 0.2 33 | use_real_rec_loss = False 34 | use_z_rec_loss = True 35 | use_image_identity_loss = True 36 | 37 | # Others 38 | seed = 42 39 | outputs = "outputs/vae_stage1" 40 | wandb = False 41 | 42 | epochs = 100 # NOTE: adjust accordingly w.r.t dataset size 43 | log_every = 1 44 | ckpt_every = 1000 45 | load = None 46 | 47 | batch_size = 1 48 | lr = 1e-5 49 | grad_clip = 1.0 50 | -------------------------------------------------------------------------------- /configs/vae/train/stage2.py: -------------------------------------------------------------------------------- 1 | num_frames = 17 2 | image_size = (256, 256) 3 | 4 | # Define dataset 5 | dataset = dict( 6 | type="VideoTextDataset", 7 | data_path=None, 8 | num_frames=num_frames, 9 | frame_interval=1, 10 | image_size=image_size, 11 | ) 12 | 13 | # Define acceleration 14 | num_workers = 16 15 | dtype = "bf16" 16 | grad_checkpoint = True 17 | plugin = "zero2" 18 | 19 | # Define model 20 | model = dict( 21 | type="OpenSoraVAE_V1_2", 22 | freeze_vae_2d=False, 23 | from_pretrained="outputs/vae_stage1", 24 | cal_loss=True, 25 | ) 26 | 27 | # loss weights 28 | perceptual_loss_weight = 0.1 # use vgg is not None and more than 0 29 | kl_loss_weight = 1e-6 30 | 31 | mixed_strategy = "mixed_video_image" 32 | mixed_image_ratio = 0.2 33 | use_real_rec_loss = False 34 | use_z_rec_loss = True 35 | use_image_identity_loss = False 36 | 37 | # Others 38 | seed = 42 39 | outputs = "outputs/vae_stage2" 40 | wandb = False 41 | 42 | epochs = 100 # NOTE: adjust accordingly w.r.t dataset size 43 | log_every = 1 44 | ckpt_every = 1000 45 | load = None 46 | 47 | batch_size = 1 48 | lr = 1e-5 49 | grad_clip = 1.0 50 | 
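The three VAE training stages (stage 1 and 2 above, stage 3 next) differ mainly in `freeze_vae_2d`, `from_pretrained`, and which auxiliary losses are enabled. The sketch below shows one way the configured weights and flags could combine into a single objective; the loss-term names and the unit weights on the auxiliary terms are assumptions for illustration, not the repository's exact implementation.

```python
def vae_total_loss(losses: dict, cfg) -> float:
    """Combine reconstruction, perceptual, KL, and optional auxiliary terms per the config."""
    total = losses["recon"]                                # base reconstruction term
    total += cfg.perceptual_loss_weight * losses["lpips"]  # perceptual term (0.1 in the configs)
    total += cfg.kl_loss_weight * losses["kl"]             # KL term (1e-6 in the configs)
    if getattr(cfg, "use_real_rec_loss", False):
        total += losses["real_rec"]                        # stage 3: reconstruct the raw video
    if getattr(cfg, "use_z_rec_loss", False):
        total += losses["z_rec"]                           # stages 1-2: match the 2D VAE latents
    if getattr(cfg, "use_image_identity_loss", False):
        total += losses["image_identity"]                  # stage 1 only: preserve image behaviour
    return total
```

Read together, the curriculum appears to be: stage 1 trains only the temporal half against the frozen 2D VAE, stage 2 unfreezes the 2D VAE while still anchoring to its latents, and stage 3 drops that anchor and optimises end-to-end video reconstruction.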
-------------------------------------------------------------------------------- /configs/vae/train/stage3.py: -------------------------------------------------------------------------------- 1 | num_frames = 33 2 | image_size = (256, 256) 3 | 4 | # Define dataset 5 | dataset = dict( 6 | type="VideoTextDataset", 7 | data_path=None, 8 | num_frames=num_frames, 9 | frame_interval=1, 10 | image_size=image_size, 11 | ) 12 | 13 | # Define acceleration 14 | num_workers = 16 15 | dtype = "bf16" 16 | grad_checkpoint = True 17 | plugin = "zero2" 18 | 19 | # Define model 20 | model = dict( 21 | type="OpenSoraVAE_V1_2", 22 | freeze_vae_2d=False, 23 | from_pretrained="outputs/vae_stage2", 24 | cal_loss=True, 25 | ) 26 | 27 | # loss weights 28 | perceptual_loss_weight = 0.1 # use vgg is not None and more than 0 29 | kl_loss_weight = 1e-6 30 | 31 | mixed_strategy = "mixed_video_random" 32 | use_real_rec_loss = True 33 | use_z_rec_loss = False 34 | use_image_identity_loss = False 35 | 36 | # Others 37 | seed = 42 38 | outputs = "outputs/vae_stage3" 39 | wandb = False 40 | 41 | epochs = 100 # NOTE: adjust accordingly w.r.t dataset size 42 | log_every = 1 43 | ckpt_every = 1000 44 | load = None 45 | 46 | batch_size = 1 47 | lr = 1e-5 48 | grad_clip = 1.0 49 | -------------------------------------------------------------------------------- /docs/tutorial/.nojekyll: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /docs/tutorial/Gemfile: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | source "https://rubygems.org" 4 | 5 | gem "jekyll-theme-chirpy", "~> 7.1", ">= 7.1.1" 6 | 7 | gem "html-proofer", "~> 5.0", group: :test 8 | 9 | platforms :mingw, :x64_mingw, :mswin, :jruby do 10 | gem "tzinfo", ">= 1", "< 3" 11 | gem "tzinfo-data" 12 | end 13 | 14 | gem "wdm", "~> 0.1.1", :platforms => [:mingw, :x64_mingw, :mswin] 15 | -------------------------------------------------------------------------------- /docs/tutorial/_data/contact.yml: -------------------------------------------------------------------------------- 1 | # The contact options. 
2 | 3 | - type: github 4 | icon: "fab fa-github" 5 | 6 | #- type: twitter 7 | # icon: "fa-brands fa-x-twitter" 8 | 9 | #- type: email 10 | # icon: "fas fa-envelope" 11 | # noblank: true # open link in current tab 12 | 13 | #- type: rss 14 | # icon: "fas fa-rss" 15 | # noblank: true 16 | # Uncomment and complete the url below to enable more contact options 17 | # 18 | # - type: mastodon 19 | # icon: 'fab fa-mastodon' # icons powered by 20 | # url: '' # Fill with your Mastodon account page, rel="me" will be applied for verification 21 | # 22 | # - type: linkedin 23 | # icon: 'fab fa-linkedin' # icons powered by 24 | # url: '' # Fill with your Linkedin homepage 25 | # 26 | # - type: stack-overflow 27 | # icon: 'fab fa-stack-overflow' 28 | # url: '' # Fill with your stackoverflow homepage 29 | # 30 | # - type: bluesky 31 | # icon: 'fa-brands fa-bluesky' 32 | # url: '' # Fill with your Bluesky profile link 33 | # 34 | # - type: reddit 35 | # icon: 'fa-brands fa-reddit' 36 | # url: '' # Fill with your Reddit profile link 37 | # 38 | # - type: threads 39 | # icon: 'fa-brands fa-threads' 40 | # url: '' # Fill with your Threads profile link 41 | -------------------------------------------------------------------------------- /docs/tutorial/_data/locales/en-customized.yml: -------------------------------------------------------------------------------- 1 | # The layout text of site 2 | 3 | # ----- Commons label ----- 4 | 5 | layout: 6 | post: Post 7 | category: Category 8 | tag: Tag 9 | 10 | # The tabs of sidebar 11 | tabs: 12 | # format: : 13 | home: Tutorial 14 | categories: Categories 15 | tags: Tags 16 | archives: Archives 17 | about: About 18 | 19 | # the text displayed in the search bar & search results 20 | search: 21 | hint: search 22 | cancel: Cancel 23 | no_results: Oops! No results found. 24 | 25 | panel: 26 | lastmod: Recently Updated 27 | trending_tags: Trending Tags 28 | toc: Contents 29 | 30 | copyright: 31 | # Shown at the bottom of the post 32 | license: 33 | template: #This post is licensed under :LICENSE_NAME by the author. 34 | name: #CC BY 4.0 35 | link: #https://creativecommons.org/licenses/by/4.0/ 36 | 37 | # Displayed in the footer 38 | brief: #Some rights reserved. 39 | verbose: #>- 40 | #Except where otherwise noted, the blog posts on this site are licensed 41 | #under the Creative Commons Attribution 4.0 International (CC BY 4.0) License by the author. 42 | 43 | meta: #Using the :PLATFORM theme :THEME 44 | 45 | not_found: 46 | statment: Sorry, we've misplaced that URL or it's pointing to something that doesn't exist. 47 | 48 | notification: 49 | update_found: A new version of content is available. 50 | update: Update 51 | 52 | # ----- Posts related labels ----- 53 | 54 | post: 55 | written_by: By 56 | posted: Posted 57 | updated: Updated 58 | words: words 59 | pageview_measure: views 60 | read_time: 61 | unit: min 62 | prompt: read 63 | relate_posts: Further Reading 64 | share: Share 65 | button: 66 | next: Newer 67 | previous: Older 68 | copy_code: 69 | succeed: Copied! 70 | share_link: 71 | title: Copy link 72 | succeed: Link copied successfully! 73 | 74 | # Date time format. 
75 | # See: , 76 | df: 77 | post: 78 | strftime: "%b %e, %Y" 79 | dayjs: "ll" 80 | archives: 81 | strftime: "%b" 82 | dayjs: "MMM" 83 | 84 | # categories page 85 | categories: 86 | category_measure: 87 | singular: category 88 | plural: categories 89 | post_measure: 90 | singular: post 91 | plural: posts -------------------------------------------------------------------------------- /docs/tutorial/_data/share.yml: -------------------------------------------------------------------------------- 1 | # Sharing options at the bottom of the post. 2 | # Icons from 3 | 4 | platforms: 5 | - type: Twitter 6 | icon: "fa-brands fa-square-x-twitter" 7 | link: "https://twitter.com/intent/tweet?text=TITLE&url=URL" 8 | 9 | - type: Facebook 10 | icon: "fab fa-facebook-square" 11 | link: "https://www.facebook.com/sharer/sharer.php?title=TITLE&u=URL" 12 | 13 | - type: Telegram 14 | icon: "fab fa-telegram" 15 | link: "https://t.me/share/url?url=URL&text=TITLE" 16 | 17 | # Uncomment below if you need to. 18 | # 19 | # - type: Linkedin 20 | # icon: "fab fa-linkedin" 21 | # link: "https://www.linkedin.com/sharing/share-offsite/?url=URL" 22 | # 23 | # - type: Weibo 24 | # icon: "fab fa-weibo" 25 | # link: "https://service.weibo.com/share/share.php?title=TITLE&url=URL" 26 | # 27 | # - type: Mastodon 28 | # icon: "fa-brands fa-mastodon" 29 | # # See: https://github.com/justinribeiro/share-to-mastodon#properties 30 | # instances: 31 | # - label: mastodon.social 32 | # link: "https://mastodon.social/" 33 | # - label: mastodon.online 34 | # link: "https://mastodon.online/" 35 | # - label: fosstodon.org 36 | # link: "https://fosstodon.org/" 37 | # - label: photog.social 38 | # link: "https://photog.social/" 39 | # 40 | # - type: Bluesky 41 | # icon: "fa-brands fa-bluesky" 42 | # link: "https://bsky.app/intent/compose?text=TITLE%20URL" 43 | # 44 | # - type: Reddit 45 | # icon: "fa-brands fa-square-reddit" 46 | # link: "https://www.reddit.com/submit?url=URL&title=TITLE" 47 | # 48 | # - type: Threads 49 | # icon: "fa-brands fa-square-threads" 50 | # link: "https://www.threads.net/intent/post?text=TITLE%20URL" 51 | -------------------------------------------------------------------------------- /docs/tutorial/_includes/favicons.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /docs/tutorial/_includes/topbar.html: -------------------------------------------------------------------------------- 1 | 2 | 3 |
4 |
8 | 41 | 42 | 43 | 46 | 47 |
48 | {% if page.layout == 'home' %} 49 | {{- site.data.locales[include.lang].title | default: site.title -}} 50 | {% elsif page.collection == 'tabs' or page.layout == 'page' %} 51 | {%- capture tab_key -%}{{ page.url | split: '/' }}{%- endcapture -%} 52 | {{- site.data.locales[include.lang].tabs[tab_key] | default: page.title -}} 53 | {% else %} 54 | {{- site.data.locales[include.lang].layout[page.layout] | default: page.layout | capitalize -}} 55 | {% endif %} 56 |
57 |
58 |
-------------------------------------------------------------------------------- /docs/tutorial/_plugins/details_tag.rb: -------------------------------------------------------------------------------- 1 | module Jekyll 2 | module Tags 3 | class DetailsTag < Liquid::Block 4 | 5 | def initialize(tag_name, markup, tokens) 6 | super 7 | @caption = markup 8 | end 9 | 10 | def render(context) 11 | site = context.registers[:site] 12 | converter = site.find_converter_instance(::Jekyll::Converters::Markdown) 13 | # below Jekyll 3.x use this: 14 | # converter = site.getConverterImpl(::Jekyll::Converters::Markdown) 15 | caption = converter.convert(@caption).gsub(/<\/?p[^>]*>/, '').chomp 16 | body = converter.convert(super(context)) 17 | "
<details><summary>#{caption}</summary>#{body}</details>
" 18 | end 19 | 20 | end 21 | end 22 | end 23 | 24 | Liquid::Template.register_tag('details', Jekyll::Tags::DetailsTag) -------------------------------------------------------------------------------- /docs/tutorial/_plugins/posts-lastmod-hook.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | # 3 | # Check for changed posts 4 | 5 | Jekyll::Hooks.register :posts, :post_init do |post| 6 | 7 | commit_num = `git rev-list --count HEAD "#{ post.path }"` 8 | 9 | if commit_num.to_i > 1 10 | lastmod_date = `git log -1 --pretty="%ad" --date=iso "#{ post.path }"` 11 | post.data['last_modified_at'] = lastmod_date 12 | end 13 | 14 | end 15 | -------------------------------------------------------------------------------- /docs/tutorial/_posts/.placeholder: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /docs/tutorial/_tabs/repository.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: post 3 | icon: fas fa-link 4 | title: Github Repository 5 | date: 2024-10-16 6 | toc: true 7 | order: 6 8 | --- 9 | 10 | 13 | 14 | -------------------------------------------------------------------------------- /docs/tutorial/assets/css/jekyll-theme-chirpy.scss: -------------------------------------------------------------------------------- 1 | --- 2 | --- 3 | 4 | @import 'main 5 | {%- if jekyll.environment == 'production' -%} 6 | .bundle 7 | {%- endif -%} 8 | '; 9 | 10 | @import 'colors/typography-dark.scss'; 11 | @import 'colors/typography-light.scss'; 12 | 13 | /* append your custom style below */ 14 | .todo { background: red ;}; 15 | 16 | .iframe-button { 17 | width: 100%; 18 | height: 150px; 19 | display: flex; 20 | justify-content: center; 21 | align-items: center; 22 | cursor: pointer; 23 | 24 | } 25 | 26 | html { 27 | font-size: 16px; 28 | 29 | @media (prefers-color-scheme: light) { 30 | &:not([data-mode]), 31 | &[data-mode='light'] { 32 | @include light-scheme; 33 | } 34 | 35 | &[data-mode='dark'] { 36 | @include dark-scheme; 37 | } 38 | } 39 | 40 | @media (prefers-color-scheme: dark) { 41 | &:not([data-mode]), 42 | &[data-mode='dark'] { 43 | @include dark-scheme; 44 | } 45 | 46 | &[data-mode='light'] { 47 | @include light-scheme; 48 | } 49 | } 50 | } -------------------------------------------------------------------------------- /docs/tutorial/assets/fails_loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/docs/tutorial/assets/fails_loss.png -------------------------------------------------------------------------------- /docs/tutorial/assets/fails_weight_norm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/docs/tutorial/assets/fails_weight_norm.png -------------------------------------------------------------------------------- /docs/tutorial/assets/img/lambda-logo.svg: -------------------------------------------------------------------------------- 1 | 2 | 10 | 12 | 18 | 22 | 23 | -------------------------------------------------------------------------------- /docs/tutorial/assets/monitoring_tool.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/docs/tutorial/assets/monitoring_tool.png -------------------------------------------------------------------------------- /docs/tutorial/assets/pyspy_dump.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/docs/tutorial/assets/pyspy_dump.png -------------------------------------------------------------------------------- /docs/tutorial/index.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: post 3 | icon: fas fa-info-circle 4 | permalink: '/' 5 | title: "Tutorial - Let's reproduce a T2V model." 6 | date: 2024-10-02 7 | toc: true 8 | --- 9 | 10 | 13 | 14 | -------------------------------------------------------------------------------- /docs/zh_CN/datasets.md: -------------------------------------------------------------------------------- 1 | # 数据集 2 | 3 | ## 正在使用的数据集 4 | 5 | ### HD-VG-130M 6 | 7 | [HD-VG-130M](https://github.com/daooshee/HD-VG-130M?tab=readme-ov-file) 包括 130M 个文本视频对。标题是 8 | 由 BLIP-2 生成。我们发现剪切和文本质量相对较差。它包含 20 个拆分。对于 OpenSora 1.0,我们使用第一个拆分。我们计划使用整个数据集并对其进行重新处理。 9 | 10 | ### Inter4k 11 | 12 | [Inter4k](https://github.com/alexandrosstergiou/Inter4K) 是一个包含分辨率为 4K 的 1k 视频剪辑的数据集。这个 13 | 数据集被提议用于超分辨率任务。我们使用数据集进行 HQ 训练。处理过的视频可以从这里找到 [这里](README.md#数据处理) 。 14 | 15 | ### Pexels.com 16 | 17 | [Pexels.com](https://www.pexels.com/) 是一个提供免费库存照片和视频的网站。我们收集的 19K 视频 18 | 来自本网站的剪辑,用于高质量训练。处理过的视频可以从这里找到 [这里](README.md#数据处理) 。 19 | 20 | ## 数据集监视列表 21 | 22 | 我们也在关注以下数据集,并考虑在未来使用它们,这取决于我们的存储空间以及数据集的质量。 23 | 24 | | 名称 | 大小 | 描述 | 25 | |-------------------|--------------|-------------------------------| 26 | | Panda-70M | 70M videos | High quality video-text pairs | 27 | | WebVid-10M | 10M videos | Low quality | 28 | | InternVid-10M-FLT | 10M videos | | 29 | | EGO4D | 3670 hours | | 30 | | OpenDV-YouTube | 1700 hours | | 31 | | VidProM | 6.69M videos | | 32 | -------------------------------------------------------------------------------- /docs/zh_CN/report_v1.md: -------------------------------------------------------------------------------- 1 | # Open-Sora v1 技术报告 2 | 3 | OpenAI的Sora在生成一分钟高质量视频方面非常出色。然而,它几乎没有透露任何关于其细节的信息。为了使人工智能更加“开放”,我们致力于构建一个开源版本的Sora。这份报告描述了我们第一次尝试训练一个基于Transformer的视频扩散模型。 4 | 5 | ## 选择高效的架构 6 | 7 | 为了降低计算成本,我们希望利用现有的VAE模型。Sora使用时空VAE来减少时间维度。然而,我们发现没有开源的高质量时空VAE模型。[MAGVIT](https://github.com/google-research/magvit)的4x4x4 VAE并未开源,而[VideoGPT](https://wilson1yan.github.io/videogpt/index.html)的2x4x4 VAE在我们的实验中质量较低。因此,我们决定在我们第一个版本中使用2D VAE(来自[Stability-AI](https://huggingface.co/stabilityai/sd-vae-ft-mse-original))。 8 | 9 | 视频训练涉及大量的token。考虑到24fps的1分钟视频,我们有1440帧。通过VAE下采样4倍和patch大小下采样2倍,我们得到了1440x1024≈150万个token。在150万个token上进行全注意力计算将带来巨大的计算成本。因此,我们使用时空注意力来降低成本,这是遵循[Latte](https://github.com/Vchitect/Latte)的方法。 10 | 11 | 如图中所示,在STDiT(ST代表时空)中,我们在每个空间注意力之后立即插入一个时间注意力。这类似于Latte论文中的变种3。然而,我们并没有控制这些变体的相似数量的参数。虽然Latte的论文声称他们的变体比变种3更好,但我们在16x256x256视频上的实验表明,相同数量的迭代次数下,性能排名为:DiT(完整)> STDiT(顺序)> STDiT(并行)≈ Latte。因此,我们出于效率考虑选择了STDiT(顺序)。[这里](/docs/acceleration.md#efficient-stdit)提供了速度基准测试。 12 | 13 | 14 | ![Architecture Comparison](/assets/readme/report_arch_comp.png) 15 | 16 | 为了专注于视频生成,我们希望基于一个强大的图像生成模型来训练我们的模型。PixArt-α是一个经过高效训练的高质量图像生成模型,具有T5条件化的DiT结构。我们使用[PixArt-α](https://github.com/PixArt-alpha/PixArt-alpha)初始化我们的模型,并将插入的时间注意力的投影层初始化为零。这种初始化在开始时保留了模型的图像生成能力,而Latte的架构则不能。插入的注意力将参数数量从5.8亿增加到7.24亿。 17 | 18 
| ![Architecture](/assets/readme/report_arch.jpg) 19 | 20 | 借鉴PixArt-α和Stable Video Diffusion的成功,我们还采用了渐进式训练策略:在366K预训练数据集上进行16x256x256的训练,然后在20K数据集上进行16x256x256、16x512x512和64x512x512的训练。通过扩展位置嵌入,这一策略极大地降低了计算成本。 21 | 22 | 我们还尝试在DiT中使用3D patch嵌入器。然而,在时间维度上2倍下采样后,生成的视频质量较低。因此,我们将在下一版本中将下采样留给时间VAE。目前,我们在每3帧采样一次进行16帧训练,以及在每2帧采样一次进行64帧训练。 23 | 24 | 25 | ## 数据是训练高质量模型的核心 26 | 27 | 我们发现数据的数量和质量对生成视频的质量有很大的影响,甚至比模型架构和训练策略的影响还要大。目前,我们只从[HD-VG-130M](https://github.com/daooshee/HD-VG-130M)准备了第一批分割(366K个视频片段)。这些视频的质量参差不齐,而且字幕也不够准确。因此,我们进一步从提供免费许可视频的[Pexels](https://www.pexels.com/)收集了20k相对高质量的视频。我们使用LLaVA,一个图像字幕模型,通过三个帧和一个设计好的提示来标记视频。有了设计好的提示,LLaVA能够生成高质量的字幕。 28 | 29 | ![Caption](/assets/readme/report_caption.png) 30 | 31 | 由于我们更加注重数据质量,我们准备收集更多数据,并在下一版本中构建一个视频预处理流程。 32 | 33 | ## 训练细节 34 | 35 | 在有限的训练预算下,我们只进行了一些探索。我们发现学习率1e-4过大,因此将其降低到2e-5。在进行大批量训练时,我们发现`fp16`比`bf16`不太稳定,可能会导致生成失败。因此,我们在64x512x512的训练中切换到`bf16`。对于其他超参数,我们遵循了之前的研究工作。 36 | 37 | ## 损失曲线 38 | 39 | 16x256x256 预训练损失曲线 40 | 41 | ![16x256x256 Pretraining Loss Curve](/assets/readme/report_loss_curve_1.png) 42 | 43 | 16x256x256 高质量训练损失曲线 44 | 45 | ![16x256x256 HQ Training Loss Curve](/assets/readme/report_loss_curve_2.png) 46 | 47 | 16x512x512 高质量训练损失曲线 48 | 49 | ![16x512x512 HQ Training Loss Curve](/assets/readme/report_loss_curve_3.png) 50 | -------------------------------------------------------------------------------- /docs/zh_CN/vae.md: -------------------------------------------------------------------------------- 1 | # VAE 技术报告 2 | 3 | 由于 [Pixart-Sigma](https://arxiv.org/abs/2403.04692) 论文中指出适应新的VAE很简单,因此我们开发了一个额外的时间VAE。 4 | 具体而言, 我们的VAE由一个[空间 VAE](https://huggingface.co/PixArt-alpha/pixart_sigma_sdxlvae_T5_diffusers)和一个时间VA相接的形式组成. 5 | 对于时间VAE,我们遵循 [MAGVIT-v2](https://arxiv.org/abs/2310.05737)的实现, 并做了以下修改: 6 | 7 | * 我们删除了码本特有的架构。 8 | * 我们不使用鉴别​​器(discriminator),而是使用VAE重建损失、kl损失和感知损失进行训练。 9 | * 在编码器的最后一个线性层中,我们缩小到 4 通道的对角高斯分布,遵循我们之前训练的接受 4 通道输入的 STDiT。 10 | * 我们的解码器与编码器架构对称。 11 | 12 | ## 训练 13 | 我们分不同阶段训练模型。 14 | 15 | 我们首先通过在单台机器(8 个 GPU)上冻结空间 VAE 380k 步来训练时间 VAE。我们使用额外的身份损失使 3D VAE 的特征与 2D VAE 的特征相似。我们使用 20% 的图像和 80% 的视频(17 帧)来训练 VAE。 16 | 17 | ```bash 18 | torchrun --nnodes=1 --nproc_per_node=8 scripts/train_vae.py configs/vae/train/stage1.py --data-path YOUR_CSV_PATH 19 | ``` 20 | 21 | 接下来,我们移除身份损失并训练 3D VAE 管道以重建 260k 步的 2D 压缩视频。 22 | 23 | ```bash 24 | torchrun --nnodes=1 --nproc_per_node=8 scripts/train_vae.py configs/vae/train/stage2.py --data-path YOUR_CSV_PATH 25 | ``` 26 | 27 | 最后,我们移除了 2D 压缩视频的重建损失,并训练 VAE 管道以构建 540k 步的 3D 视频。我们在 34 帧内使用随机数训练 VAE,使其对不同长度的视频更具鲁棒性。此阶段在 24 个 GPU 上进行训练。 28 | 29 | ```bash 30 | torchrun --nnodes=3 --nproc_per_node=8 scripts/train_vae.py configs/vae/train/stage3.py --data-path YOUR_CSV_PATH 31 | ``` 32 | 33 | 请注意,您需要根据自己的 csv 数据大小相应地调整配置文件中的 `epochs` 。 34 | 35 | ## 推理 36 | 37 | 为了直观地检查 VAE 的性能,您可以运行以下推理。它使用 `_ori` 后缀(即 `"YOUR_VIDEO_DIR"_ori`)将原始视频保存到您指定的视频目录中,使用`_rec`后缀(即`"YOUR_VIDEO_DIR"_rec`)将来自完整管道的重建视频保存到指定的视频目录中,并使用 `_spatial`后缀(即`"YOUR_VIDEO_DIR"_spatial`)将来自 2D 压缩和解压缩的重建视频保存到指定的视频目录中。 38 | 39 | ```bash 40 | torchrun --standalone --nnodes=1 --nproc_per_node=1 scripts/inference_vae.py configs/vae/inference/video.py --ckpt-path YOUR_VAE_CKPT_PATH --data-path YOUR_CSV_PATH --save-dir YOUR_VIDEO_DIR 41 | ``` 42 | ## 评估 43 | 然后,我们可以计算 VAE 在 SSIM、PSNR、LPIPS 和 FLOLPIPS 指标上的表现得分。 44 | 45 | * SSIM: 结构相似性指数度量,越高越好 46 | * PSNR: 峰值信噪比,越高越好 47 | * LPIPS: 学习感知图像质量下降,越低越好 48 | * [FloLPIPS](https://arxiv.org/pdf/2207.08119): 带有视频插值的LPIPS,越低越好。 49 | 50 | ```bash 51 | python eval/vae/eval_common_metric.py 
--batch_size 2 --real_video_dir YOUR_VIDEO_DIR_ori --generated_video_dir YOUR_VIDEO_DIR_rec --device cuda --sample_fps 24 --crop_size 256 --resolution 256 --num_frames 17 --sample_rate 1 --metric ssim psnr lpips flolpips 52 | ``` 53 | 54 | ## 致谢 55 | 我们非常感谢以下工作: 56 | * [MAGVIT-v2](https://arxiv.org/abs/2310.05737): Language Model Beats Diffusion -- Tokenizer is Key to Visual Generation 57 | * [Taming Transformers](https://github.com/CompVis/taming-transformers): Taming Transformers for High-Resolution Image Synthesis 58 | * [3D blur pooling](https://github.com/adobe/antialiased-cnns/pull/39/commits/3d6f02b6943c58b68c19c07bc26fad57492ff3bc) 59 | * [Open-Sora-Plan](https://github.com/PKU-YuanGroup/Open-Sora-Plan) 60 | -------------------------------------------------------------------------------- /eval/human_eval/generate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -x 4 | set -e 5 | 6 | TEXT_PATH=/home/data/sora_data/pixart-sigma-generated/text.txt 7 | OUTPUT_PATH=/home/data/sora_data/pixart-sigma-generated/raw 8 | CMD="python scripts/inference.py configs/pixart/inference/1x2048MS.py" 9 | # LOG_BASE=logs/sample/generate 10 | LOG_BASE=$(dirname $CKPT)/eval/generate 11 | mkdir -p ${LOG_BASE} 12 | NUM_PER_GPU=10000 13 | N_LAUNCH=2 14 | NUM_START=$(($N_LAUNCH * $NUM_PER_GPU * 8)) 15 | 16 | CUDA_VISIBLE_DEVICES=0 $CMD --prompt-path $TEXT_PATH --save-dir $OUTPUT_PATH --start-index $(($NUM_START + $NUM_PER_GPU * 0)) --end-index $(($NUM_START + $NUM_PER_GPU * 1)) --image-size 2048 2048 --verbose 1 --batch-size 2 >${LOG_BASE}/${N_LAUNCH}_1.log 2>&1 & 17 | CUDA_VISIBLE_DEVICES=1 $CMD --prompt-path $TEXT_PATH --save-dir $OUTPUT_PATH --start-index $(($NUM_START + $NUM_PER_GPU * 1)) --end-index $(($NUM_START + $NUM_PER_GPU * 2)) --image-size 1408 2816 --verbose 1 --batch-size 2 >${LOG_BASE}/${N_LAUNCH}_2.log 2>&1 & 18 | CUDA_VISIBLE_DEVICES=2 $CMD --prompt-path $TEXT_PATH --save-dir $OUTPUT_PATH --start-index $(($NUM_START + $NUM_PER_GPU * 2)) --end-index $(($NUM_START + $NUM_PER_GPU * 3)) --image-size 2816 1408 --verbose 1 --batch-size 2 >${LOG_BASE}/${N_LAUNCH}_3.log 2>&1 & 19 | CUDA_VISIBLE_DEVICES=3 $CMD --prompt-path $TEXT_PATH --save-dir $OUTPUT_PATH --start-index $(($NUM_START + $NUM_PER_GPU * 3)) --end-index $(($NUM_START + $NUM_PER_GPU * 4)) --image-size 1664 2304 --verbose 1 --batch-size 2 >${LOG_BASE}/${N_LAUNCH}_4.log 2>&1 & 20 | CUDA_VISIBLE_DEVICES=4 $CMD --prompt-path $TEXT_PATH --save-dir $OUTPUT_PATH --start-index $(($NUM_START + $NUM_PER_GPU * 4)) --end-index $(($NUM_START + $NUM_PER_GPU * 5)) --image-size 2304 1664 --verbose 1 --batch-size 2 >${LOG_BASE}/${N_LAUNCH}_5.log 2>&1 & 21 | CUDA_VISIBLE_DEVICES=5 $CMD --prompt-path $TEXT_PATH --save-dir $OUTPUT_PATH --start-index $(($NUM_START + $NUM_PER_GPU * 5)) --end-index $(($NUM_START + $NUM_PER_GPU * 6)) --image-size 1536 2560 --verbose 1 --batch-size 2 >${LOG_BASE}/${N_LAUNCH}_6.log 2>&1 & 22 | CUDA_VISIBLE_DEVICES=6 $CMD --prompt-path $TEXT_PATH --save-dir $OUTPUT_PATH --start-index $(($NUM_START + $NUM_PER_GPU * 6)) --end-index $(($NUM_START + $NUM_PER_GPU * 7)) --image-size 2560 1536 --verbose 1 --batch-size 2 >${LOG_BASE}/${N_LAUNCH}_7.log 2>&1 & 23 | CUDA_VISIBLE_DEVICES=7 $CMD --prompt-path $TEXT_PATH --save-dir $OUTPUT_PATH --start-index $(($NUM_START + $NUM_PER_GPU * 7)) --end-index $(($NUM_START + $NUM_PER_GPU * 8)) --image-size 2048 2048 --verbose 1 --batch-size 2 >${LOG_BASE}/${N_LAUNCH}_8.log 2>&1 & 24 | 
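`generate.sh` above fans the prompt file out across 8 GPUs (and across repeated launches) purely by index arithmetic on `N_LAUNCH`, `NUM_PER_GPU`, and the GPU slot. The helper below reproduces that arithmetic so a given (launch, GPU) pair can be mapped back to its prompt range when checking or resuming runs; the variable names mirror the shell script, but the function itself is illustrative and not part of the repository.

```python
def shard_range(n_launch: int, gpu: int, num_per_gpu: int = 10_000, gpus_per_node: int = 8):
    """Return the [start, end) prompt indices handled by `gpu` during launch `n_launch`."""
    num_start = n_launch * num_per_gpu * gpus_per_node   # NUM_START in the script
    start = num_start + num_per_gpu * gpu
    end = num_start + num_per_gpu * (gpu + 1)
    return start, end


# e.g. shard_range(2, 0) == (160000, 170000), matching N_LAUNCH=2 and the first GPU above.
```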
-------------------------------------------------------------------------------- /eval/human_eval/launch.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CKPT=$1 4 | NUM_FRAMES=$2 5 | MODEL_NAME=$3 6 | 7 | if [[ $CKPT == *"ema"* ]]; then 8 | parentdir=$(dirname $CKPT) 9 | CKPT_BASE=$(basename $parentdir)_ema 10 | else 11 | CKPT_BASE=$(basename $CKPT) 12 | fi 13 | LOG_BASE=$(dirname $CKPT)/eval 14 | mkdir -p ${LOG_BASE} 15 | echo "Logging to $LOG_BASE" 16 | 17 | GPUS=(0 1 2 3 4 5 6 7) 18 | # TASK_ID_LIST=(1 2a 2b 2c 2d 2e 2f 2g) # move image to video task 19 | TASK_ID_LIST=(2a 2b 2c 2d 2e 2f 2g 2h) 20 | # FRAME_LIST=(1 $NUM_FRAMES $NUM_FRAMES $NUM_FRAMES $NUM_FRAMES $NUM_FRAMES $NUM_FRAMES $NUM_FRAMES) 21 | 22 | for i in "${!GPUS[@]}"; do 23 | CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT $NUM_FRAMES $MODEL_NAME -${TASK_ID_LIST[i]} >${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 & 24 | done 25 | 26 | # kill all by: pkill -f "inference" 27 | -------------------------------------------------------------------------------- /eval/loss/launch.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CMD="torchrun --standalone --nproc_per_node 1 eval/loss/eval_loss.py configs/opensora-v1-2/misc/eval_loss.py" 4 | CKPT_PATH=$1 5 | MODEL_NAME=$2 6 | IMG_PATH=$3 7 | VID_PATH=$4 8 | 9 | if [ -z $IMG_PATH ]; then 10 | IMG_PATH="/mnt/jfs-hdd/sora/meta/validation/img_1k.csv" 11 | fi 12 | 13 | if [ -z $VID_PATH ]; then 14 | VID_PATH="/mnt/jfs-hdd/sora/meta/validation/vid_100.csv" 15 | fi 16 | 17 | if [[ $CKPT_PATH == *"ema"* ]]; then 18 | parentdir=$(dirname $CKPT_PATH) 19 | CKPT_BASE=$(basename $parentdir)_ema 20 | else 21 | CKPT_BASE=$(basename $CKPT_PATH) 22 | fi 23 | LOG_BASE=$(dirname $CKPT_PATH)/eval 24 | mkdir -p $LOG_BASE 25 | echo "Logging to $LOG_BASE" 26 | 27 | 28 | GPUS=(3 4 5 6 7) 29 | RESOLUTION=(144p 240p 360p 480p 720p) 30 | 31 | CUDA_VISIBLE_DEVICES=0 $CMD --data-path $IMG_PATH --ckpt-path $CKPT_PATH --start-index 0 --end-index 5 >${LOG_BASE}/img_0.log 2>&1 & 32 | CUDA_VISIBLE_DEVICES=1 $CMD --data-path $IMG_PATH --ckpt-path $CKPT_PATH --start-index 5 --end-index 6 >${LOG_BASE}/img_1.log 2>&1 & 33 | CUDA_VISIBLE_DEVICES=2 $CMD --data-path $IMG_PATH --ckpt-path $CKPT_PATH --start-index 6 >${LOG_BASE}/img_2.log 2>&1 & 34 | 35 | 36 | for i in "${!GPUS[@]}"; do 37 | CUDA_VISIBLE_DEVICES=${GPUS[i]} $CMD --data-path $VID_PATH --ckpt-path $CKPT_PATH --resolution ${RESOLUTION[i]} >${LOG_BASE}/${RESOLUTION[i]}_vid.log 2>&1 & 38 | done 39 | -------------------------------------------------------------------------------- /eval/loss/tabulate_rl_loss.py: -------------------------------------------------------------------------------- 1 | """ 2 | usage: 3 | python tabulate_rl_loss.py --log_dir /home/zhengzangwei/projs/Open-Sora-dev/logs/loss --ckpt_name epoch0-global_step9000 4 | 5 | save the processed json to: 6 | Open-Sora-dev/evaluation_results/rectified_flow/_loss.json 7 | """ 8 | 9 | import argparse 10 | import json 11 | import os 12 | from ast import literal_eval 13 | 14 | 15 | def parse_args(): 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument("--log_dir", type=str) 18 | args = parser.parse_args() 19 | return args 20 | 21 | 22 | if __name__ == "__main__": 23 | args = parse_args() 24 | 25 | files = os.listdir(args.log_dir) 26 | files = [ 27 | "img_0.log", 28 | "img_1.log", 29 | "img_2.log", 30 | "144p_vid.log", 31 | "240p_vid.log", 32 | "360p_vid.log", 33 | 
"480p_vid.log", 34 | "720p_vid.log", 35 | ] 36 | 37 | loss_info = {} 38 | 39 | for fname in files: 40 | path = os.path.join(args.log_dir, fname) 41 | with open(path, "r", encoding="utf-8") as f: 42 | content = f.readlines() 43 | eval_line = content[-1].split("losses:")[-1].strip() 44 | loss_dict = literal_eval(eval_line) 45 | for key, loss in loss_dict.items(): 46 | resolution, frame = key 47 | if resolution not in loss_info: 48 | loss_info[resolution] = {} 49 | loss_info[resolution][frame] = format(loss, ".4f") 50 | 51 | # Convert and write JSON object to file 52 | output_file_path = os.path.join(args.log_dir, "loss.json") 53 | with open(output_file_path, "w") as outfile: 54 | json.dump(loss_info, outfile, indent=4, sort_keys=True) 55 | print(f"results saved to: {output_file_path}") 56 | -------------------------------------------------------------------------------- /eval/vae/cal_psnr.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import numpy as np 4 | import torch 5 | from tqdm import tqdm 6 | 7 | 8 | def img_psnr(img1, img2): 9 | # [0,1] 10 | # compute mse 11 | # mse = np.mean((img1-img2)**2) 12 | mse = np.mean((img1 / 1.0 - img2 / 1.0) ** 2) 13 | # compute psnr 14 | if mse < 1e-10: 15 | return 100 16 | psnr = 20 * math.log10(1 / math.sqrt(mse)) 17 | return psnr 18 | 19 | 20 | def trans(x): 21 | return x 22 | 23 | 24 | def calculate_psnr(videos1, videos2): 25 | print("calculate_psnr...") 26 | 27 | # videos [batch_size, timestamps, channel, h, w] 28 | 29 | assert videos1.shape == videos2.shape 30 | 31 | videos1 = trans(videos1) 32 | videos2 = trans(videos2) 33 | 34 | psnr_results = [] 35 | 36 | for video_num in tqdm(range(videos1.shape[0])): 37 | # get a video 38 | # video [timestamps, channel, h, w] 39 | video1 = videos1[video_num] 40 | video2 = videos2[video_num] 41 | 42 | psnr_results_of_a_video = [] 43 | for clip_timestamp in range(len(video1)): 44 | # get a img 45 | # img [timestamps[x], channel, h, w] 46 | # img [channel, h, w] numpy 47 | 48 | img1 = video1[clip_timestamp].numpy() 49 | img2 = video2[clip_timestamp].numpy() 50 | 51 | # calculate psnr of a video 52 | psnr_results_of_a_video.append(img_psnr(img1, img2)) 53 | 54 | psnr_results.append(psnr_results_of_a_video) 55 | 56 | psnr_results = np.array(psnr_results) # [batch_size, num_frames] 57 | psnr = {} 58 | psnr_std = {} 59 | 60 | for clip_timestamp in range(len(video1)): 61 | psnr[clip_timestamp] = np.mean(psnr_results[:, clip_timestamp]) 62 | psnr_std[clip_timestamp] = np.std(psnr_results[:, clip_timestamp]) 63 | 64 | result = { 65 | "value": psnr, 66 | "value_std": psnr_std, 67 | "video_setting": video1.shape, 68 | "video_setting_name": "time, channel, heigth, width", 69 | } 70 | 71 | return result 72 | 73 | 74 | # test code / using example 75 | 76 | 77 | def main(): 78 | NUMBER_OF_VIDEOS = 8 79 | VIDEO_LENGTH = 50 80 | CHANNEL = 3 81 | SIZE = 64 82 | videos1 = torch.zeros(NUMBER_OF_VIDEOS, VIDEO_LENGTH, CHANNEL, SIZE, SIZE, requires_grad=False) 83 | videos2 = torch.zeros(NUMBER_OF_VIDEOS, VIDEO_LENGTH, CHANNEL, SIZE, SIZE, requires_grad=False) 84 | 85 | import json 86 | 87 | result = calculate_psnr(videos1, videos2) 88 | print(json.dumps(result, indent=4)) 89 | 90 | 91 | if __name__ == "__main__": 92 | main() 93 | -------------------------------------------------------------------------------- /eval/vae/script/eval.sh: -------------------------------------------------------------------------------- 1 | python eval/eval_common_metric.py \ 2 | --batch_size 2 \ 
3 | --real_video_dir ../test_eval/release/origin \ 4 | --generated_video_dir ../test_eval/release \ 5 | --device cuda \ 6 | --sample_fps 10 \ 7 | --crop_size 256 \ 8 | --resolution 256 \ 9 | --num_frames 17 \ 10 | --sample_rate 1 \ 11 | --subset_size 100 \ 12 | --metric ssim psnr lpips flolpips 13 | -------------------------------------------------------------------------------- /eval/vbench/calc_vbench.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import time 4 | 5 | import torch 6 | 7 | from vbench import VBench 8 | 9 | full_info_path = "eval/vbench/VBench_full_info.json" 10 | dimensions = [ 11 | # a: 10min 12 | "subject_consistency", # 4min 13 | "imaging_quality", # 6min 14 | # b: 12min 15 | "background_consistency", # 2min 16 | "motion_smoothness", # 5min 17 | "overall_consistency", # 2min 18 | "human_action", # 3min 19 | # c: 14min 20 | "multiple_objects", # 14min 21 | # d: 14min 22 | "spatial_relationship", # 14min 23 | # e: 12min 24 | "object_class", # 12min 25 | # f: 12min 26 | "color", # 12min 27 | # g: 10.5min 28 | "aesthetic_quality", # 2.5min 29 | "appearance_style", # 6min 30 | "temporal_flickering", # 2min 31 | # h: 9min 32 | "scene", # 3min 33 | "temporal_style", # 2min 34 | "dynamic_degree", # 4min 35 | ] 36 | 37 | 38 | def parse_args(): 39 | parser = argparse.ArgumentParser() 40 | parser.add_argument("video_folder", type=str) # samples/samples..._vbench/eval 41 | parser.add_argument("model_ckpt", type=str) 42 | parser.add_argument("--start", type=int, default=0) # start index of dimension to be evaluated 43 | parser.add_argument("--end", type=int, default=-1) # start index of dimension to be evaluated 44 | 45 | args = parser.parse_args() 46 | return args 47 | 48 | 49 | if __name__ == "__main__": 50 | args = parse_args() 51 | output_dir = os.path.join(args.model_ckpt, "vbench") 52 | os.makedirs(output_dir, exist_ok=True) 53 | video_path = args.video_folder 54 | 55 | kwargs = {} 56 | kwargs["imaging_quality_preprocessing_mode"] = "longer" # use VBench/evaluate.py default 57 | 58 | start_time = time.time() 59 | 60 | # NOTE: important to use torch.device("cuda"), else will have issue with object_class third_party module 61 | my_VBench = VBench(torch.device("cuda"), full_info_path, output_dir) 62 | if args.end == -1: # adjust end accordingly 63 | args.end = len(dimensions) 64 | for dim in dimensions[args.start : args.end]: 65 | my_VBench.evaluate( 66 | videos_path=video_path, 67 | name=dim, 68 | local=False, 69 | read_frame=False, 70 | dimension_list=[dim], 71 | mode="vbench_standard", 72 | **kwargs, 73 | ) 74 | 75 | print("Runtime: %s seconds " % (time.time() - start_time)) 76 | -------------------------------------------------------------------------------- /eval/vbench/launch.sh: -------------------------------------------------------------------------------- 1 | # !/bin/bash 2 | 3 | CKPT=$1 4 | NUM_FRAMES=$2 5 | MODEL_NAME=$3 6 | RES=$4 7 | ASP_RATIO=$5 8 | 9 | NUM_SAMPLING_STEPS=$6 10 | FLOW=$7 11 | LLM_REFINE=$8 12 | 13 | if [[ $CKPT == *"ema"* ]]; then 14 | parentdir=$(dirname $CKPT) 15 | CKPT_BASE=$(basename $parentdir)_ema 16 | else 17 | CKPT_BASE=$(basename $CKPT) 18 | fi 19 | LOG_BASE=$(dirname $CKPT)/eval 20 | echo "Logging to $LOG_BASE" 21 | 22 | GPUS=(0 1 2 3 4 5 6 7) 23 | TASK_ID_LIST=(4a 4b 4c 4d 4e 4f 4g 4h) # for log records only 24 | START_INDEX_LIST=(0 120 240 360 480 600 720 840) 25 | END_INDEX_LIST=(120 240 360 480 600 720 840 2000) 26 | 27 | ## Modify the following to run on 
multiple machines for faster results 28 | ## 720p will take quite long on a single machine 29 | # START_INDEX_LIST=(60 180 300 420 540 660 780 900) 30 | # END_INDEX_LIST=(120 240 360 480 600 720 840 2000) 31 | # LOG_BASE=$(dirname $CKPT)/eval/last_60 32 | # mkdir -p ${LOG_BASE} 33 | # echo "Logging to $LOG_BASE" 34 | 35 | 36 | 37 | for i in "${!GPUS[@]}"; do 38 | if [ -z ${RES} ] || [ -z ${ASP_RATIO} ] ; 39 | then 40 | CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -4 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} > ${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 & 41 | else 42 | if [ -z ${NUM_SAMPLING_STEPS} ]; 43 | then 44 | CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -4 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO} > ${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 & 45 | else 46 | if [ -z ${FLOW} ]; 47 | then 48 | CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -4 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO} ${NUM_SAMPLING_STEPS} > ${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 & 49 | else 50 | if [ -z ${LLM_REFINE} ]; 51 | then 52 | CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -4 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO} ${NUM_SAMPLING_STEPS} ${FLOW} > ${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 & 53 | else 54 | CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -4 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO} ${NUM_SAMPLING_STEPS} ${FLOW} ${LLM_REFINE} > ${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 & 55 | fi 56 | fi 57 | fi 58 | fi 59 | done 60 | -------------------------------------------------------------------------------- /eval/vbench/launch_calc.sh: -------------------------------------------------------------------------------- 1 | # !/bin/bash 2 | 3 | VIDEO_DIR=$1 4 | CKPT_DIR=$2 5 | LOG_BASE=$CKPT_DIR 6 | mkdir -p $LOG_BASE 7 | echo "Logging to $LOG_BASE" 8 | 9 | GPUS=(0 1 2 3 4 5 6 7) 10 | START_INDEX_LIST=(0 2 6 7 8 9 10 13) 11 | END_INDEX_LIST=(2 6 7 8 9 10 13 16) 12 | TASK_ID_LIST=(calc_vbench_a calc_vbench_b calc_vbench_c calc_vbench_d calc_vbench_e calc_vbench_f calc_vbench_g calc_vbench_h) # for log records only 13 | 14 | for i in "${!GPUS[@]}"; do 15 | CUDA_VISIBLE_DEVICES=${GPUS[i]} python eval/vbench/calc_vbench.py $VIDEO_DIR $CKPT_DIR --start ${START_INDEX_LIST[i]} --end ${END_INDEX_LIST[i]} > ${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 & 16 | done 17 | -------------------------------------------------------------------------------- /eval/vbench_i2v/json_to_txt.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | RESOLUTIONS = ["1-1", "16-9", "7-4", "8-5"] 5 | 6 | cache_root = "/mnt/jfs-hdd/sora/data/vbench-i2v/crop" 7 | resolution = RESOLUTIONS[0] 8 | json_file = "vbench2_i2v_full_info.json" 9 | save_path = "all_i2v.txt" 10 | 11 | data = json.load(open(json_file)) 12 | txt = [ 13 | f'{x["prompt_en"]}{{"reference_path": "{os.path.join(cache_root, resolution, x["image_name"])}", "mask_strategy": "0"}}' 14 | for x in data 15 | ] 16 | with open(save_path, "w") as f: 17 | f.write("\n".join(txt)) 18 | -------------------------------------------------------------------------------- /eval/vbench_i2v/launch.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CKPT=$1 4 | NUM_FRAMES=$2 5 | MODEL_NAME=$3 6 | RES=$4 7 | 
ASP_RATIO=$5 8 | 9 | NUM_SAMPLING_STEPS=$6 10 | FLOW=$7 11 | LLM_REFINE=$8 12 | 13 | if [[ $CKPT == *"ema"* ]]; then 14 | parentdir=$(dirname $CKPT) 15 | CKPT_BASE=$(basename $parentdir)_ema 16 | else 17 | CKPT_BASE=$(basename $CKPT) 18 | fi 19 | LOG_BASE=$(dirname $CKPT)/eval 20 | echo "Logging to $LOG_BASE" 21 | 22 | GPUS=(0 1 2 3 4 5 6 7) 23 | TASK_ID_LIST=(5a 5b 5c 5d 5e 5f 5g 5h) # for log records only 24 | START_INDEX_LIST=(0 140 280 420 560 700 840 980) 25 | END_INDEX_LIST=(140 280 420 560 700 840 980 2000) 26 | 27 | 28 | for i in "${!GPUS[@]}"; do 29 | if [ -z ${RES} ] || [ -z ${ASP_RATIO} ] ; 30 | then 31 | CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT $NUM_FRAMES $MODEL_NAME -5 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} > ${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 & 32 | else 33 | if [ -z ${NUM_SAMPLING_STEPS} ]; 34 | then 35 | CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -5 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO} > ${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 & 36 | else 37 | if [ -z ${FLOW} ]; 38 | then 39 | CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -5 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO} ${NUM_SAMPLING_STEPS} > ${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 & 40 | else 41 | if [ -z ${LLM_REFINE} ]; 42 | then 43 | CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -5 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO} ${NUM_SAMPLING_STEPS} ${FLOW} > ${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 & 44 | else 45 | CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -5 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO} ${NUM_SAMPLING_STEPS} ${FLOW} ${LLM_REFINE} > ${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 & 46 | fi 47 | fi 48 | fi 49 | fi 50 | done 51 | -------------------------------------------------------------------------------- /eval/vbench_i2v/launch_calc.sh: -------------------------------------------------------------------------------- 1 | # !/bin/bash 2 | 3 | VIDEO_DIR=$1 4 | CKPT_DIR=$2 5 | LOG_BASE=$CKPT_DIR 6 | mkdir -p $LOG_BASE 7 | echo "Logging to $LOG_BASE" 8 | 9 | GPUS=(0 1 2 3 4 5 6 7) 10 | CALC_I2V_LIST=(True True False False False False False False) 11 | CALC_QUALITY_LIST=(False False True True True True True True) 12 | START_INDEX_LIST=(0 2 0 2 3 4 5 6) 13 | END_INDEX_LIST=(2 -1 2 3 4 5 6 -1) 14 | TASK_ID_LIST=(calc_vbench_i2v_a calc_vbench_i2v_b calc_vbench_i2v_c calc_vbench_i2v_d calc_vbench_i2v_e calc_vbench_i2v_f calc_vbench_i2v_g calc_vbench_i2v_h) # for log records only 15 | 16 | 17 | for i in "${!GPUS[@]}"; do 18 | CUDA_VISIBLE_DEVICES=${GPUS[i]} python eval/vbench_i2v/calc_vbench_i2v.py $VIDEO_DIR $CKPT_DIR --calc_i2v ${CALC_I2V_LIST[i]} --calc_quality ${CALC_QUALITY_LIST[i]} --start ${START_INDEX_LIST[i]} --end ${END_INDEX_LIST[i]} > ${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 & 19 | done 20 | -------------------------------------------------------------------------------- /gradio/requirements.txt: -------------------------------------------------------------------------------- 1 | xformers 2 | transformers 3 | git+https://github.com/hpcaitech/Open-Sora.git 4 | -------------------------------------------------------------------------------- /kill_process.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the input file is provided 4 | if [ $# -ne 1 ]; then 5 | echo 
"Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | 10 | input_file="$1" 11 | 12 | # Check if the input file exists 13 | if [ ! -f "$input_file" ]; then 14 | echo "Error: Input file '$input_file' does not exist." 15 | exit 1 16 | fi 17 | 18 | while IFS= read -r hostname || [ -n "$hostname" ]; do 19 | if [ -n "$hostname" ]; then 20 | echo "Sending 'sudo pkill -f python.*train\.py' to $hostname" 21 | ssh "$hostname" "sudo pkill -f python.*train\.py" & 22 | fi 23 | done < "$input_file" 24 | -------------------------------------------------------------------------------- /nvtop_all.py: -------------------------------------------------------------------------------- 1 | import concurrent.futures 2 | import subprocess 3 | import pandas as pd 4 | import sys 5 | 6 | def get_gpu_info(node_name): 7 | try: 8 | result_processes = subprocess.run( 9 | ["ssh", node_name, "nvidia-smi --query-compute-apps=pid --format=csv,noheader | wc -l"], 10 | capture_output=True, 11 | text=True, 12 | check=True 13 | ) 14 | num_processes = int(result_processes.stdout.strip()) 15 | 16 | result_power = subprocess.run( 17 | ["ssh", node_name, "nvidia-smi --query-gpu=power.draw --format=csv,noheader,nounits"], 18 | capture_output=True, 19 | text=True, 20 | check=True 21 | ) 22 | power_draws = [float(p.strip()) for p in result_power.stdout.splitlines()] 23 | mean_power = sum(power_draws) / len(power_draws) if power_draws else 0.0 24 | 25 | return node_name, num_processes, mean_power 26 | except subprocess.CalledProcessError as e: 27 | return node_name, "Failed", "Failed" 28 | 29 | def main(hostfile): 30 | with open(hostfile, 'r') as file: 31 | nodes = [line.strip() for line in file if line.strip()] 32 | 33 | with concurrent.futures.ThreadPoolExecutor() as executor: 34 | futures = [executor.submit(get_gpu_info, node) for node in nodes] 35 | results = [future.result() for future in concurrent.futures.as_completed(futures)] 36 | 37 | df = pd.DataFrame(results, columns=["Node", "GPU Processes", "Mean Power Consumption (W)"]) 38 | 39 | # Calculate mean values for GPU Processes and Mean Power Consumption 40 | mean_gpu_processes = df["GPU Processes"].replace("Failed", float('nan')).astype(float).mean() 41 | mean_power_consumption = df["Mean Power Consumption (W)"].replace("Failed", float('nan')).astype(float).mean() 42 | #df.loc["Mean"] = ["", mean_gpu_processes, mean_power_consumption] 43 | 44 | # Set pandas options to display the entire DataFrame 45 | pd.set_option('display.max_rows', None) 46 | pd.set_option('display.max_columns', None) 47 | pd.set_option('display.width', None) 48 | pd.set_option('display.max_colwidth', None) 49 | 50 | print(df) 51 | 52 | if __name__ == "__main__": 53 | if len(sys.argv) != 2: 54 | print("Usage: python script.py ") 55 | sys.exit(1) 56 | 57 | hostfile = sys.argv[1] 58 | main(hostfile) 59 | 60 | -------------------------------------------------------------------------------- /opensora/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/opensora/__init__.py -------------------------------------------------------------------------------- /opensora/acceleration/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/opensora/acceleration/__init__.py -------------------------------------------------------------------------------- 
/opensora/acceleration/checkpoint.py: -------------------------------------------------------------------------------- 1 | from collections.abc import Iterable 2 | 3 | import torch.nn as nn 4 | from torch.utils.checkpoint import checkpoint, checkpoint_sequential 5 | 6 | 7 | def set_grad_checkpoint(model, use_fp32_attention=False, gc_step=1): 8 | assert isinstance(model, nn.Module) 9 | 10 | def set_attr(module): 11 | module.grad_checkpointing = True 12 | module.fp32_attention = use_fp32_attention 13 | module.grad_checkpointing_step = gc_step 14 | 15 | model.apply(set_attr) 16 | 17 | 18 | def auto_grad_checkpoint(module, *args, **kwargs): 19 | if getattr(module, "grad_checkpointing", False): 20 | if not isinstance(module, Iterable): 21 | return checkpoint(module, *args, use_reentrant=False, **kwargs) 22 | gc_step = module[0].grad_checkpointing_step 23 | return checkpoint_sequential(module, gc_step, *args, use_reentrant=False, **kwargs) 24 | return module(*args, **kwargs) 25 | -------------------------------------------------------------------------------- /opensora/acceleration/parallel_states.py: -------------------------------------------------------------------------------- 1 | import torch.distributed as dist 2 | 3 | _GLOBAL_PARALLEL_GROUPS = dict() 4 | 5 | 6 | def set_data_parallel_group(group: dist.ProcessGroup): 7 | _GLOBAL_PARALLEL_GROUPS["data"] = group 8 | 9 | 10 | def get_data_parallel_group(): 11 | return _GLOBAL_PARALLEL_GROUPS.get("data", dist.group.WORLD) 12 | 13 | 14 | def set_sequence_parallel_group(group: dist.ProcessGroup): 15 | _GLOBAL_PARALLEL_GROUPS["sequence"] = group 16 | 17 | 18 | def get_sequence_parallel_group(): 19 | return _GLOBAL_PARALLEL_GROUPS.get("sequence", None) 20 | -------------------------------------------------------------------------------- /opensora/acceleration/shardformer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/opensora/acceleration/shardformer/__init__.py -------------------------------------------------------------------------------- /opensora/acceleration/shardformer/modeling/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/opensora/acceleration/shardformer/modeling/__init__.py -------------------------------------------------------------------------------- /opensora/acceleration/shardformer/modeling/t5.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class T5LayerNorm(nn.Module): 6 | def __init__(self, hidden_size, eps=1e-6): 7 | """ 8 | Construct a layernorm module in the T5 style. No bias and no subtraction of mean. 9 | """ 10 | super().__init__() 11 | self.weight = nn.Parameter(torch.ones(hidden_size)) 12 | self.variance_epsilon = eps 13 | 14 | def forward(self, hidden_states): 15 | # T5 uses a layer_norm which only scales and doesn't shift, which is also known as Root Mean 16 | # Square Layer Normalization https://arxiv.org/abs/1910.07467 thus varience is calculated 17 | # w/o mean and there is no bias. 
Additionally we want to make sure that the accumulation for 18 | # half-precision inputs is done in fp32 19 | 20 | variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True) 21 | hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) 22 | 23 | # convert into half-precision if necessary 24 | if self.weight.dtype in [torch.float16, torch.bfloat16]: 25 | hidden_states = hidden_states.to(self.weight.dtype) 26 | 27 | return self.weight * hidden_states 28 | 29 | @staticmethod 30 | def from_native_module(module, *args, **kwargs): 31 | assert module.__class__.__name__ == "FusedRMSNorm", ( 32 | "Recovering T5LayerNorm requires the original layer to be apex's Fused RMS Norm." 33 | "Apex's fused norm is automatically used by Hugging Face Transformers https://github.com/huggingface/transformers/blob/main/src/transformers/models/t5/modeling_t5.py#L265C5-L265C48" 34 | ) 35 | 36 | layer_norm = T5LayerNorm(module.normalized_shape, eps=module.eps) 37 | layer_norm.weight.data.copy_(module.weight.data) 38 | layer_norm = layer_norm.to(module.weight.device) 39 | return layer_norm 40 | -------------------------------------------------------------------------------- /opensora/acceleration/shardformer/policy/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/opensora/acceleration/shardformer/policy/__init__.py -------------------------------------------------------------------------------- /opensora/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from .datasets import IMG_FPS, BatchFeatureDataset, VariableVideoTextDataset, VideoTextDataset 2 | from .utils import get_transforms_image, get_transforms_video, is_img, is_vid, save_sample 3 | -------------------------------------------------------------------------------- /opensora/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .dit import * 2 | from .latte import * 3 | from .pixart import * 4 | from .stdit import * 5 | from .text_encoder import * 6 | from .vae import * 7 | -------------------------------------------------------------------------------- /opensora/models/dit/__init__.py: -------------------------------------------------------------------------------- 1 | from .dit import DiT, DiT_XL_2, DiT_XL_2x2 2 | -------------------------------------------------------------------------------- /opensora/models/latte/__init__.py: -------------------------------------------------------------------------------- 1 | from .latte import Latte, Latte_XL_2, Latte_XL_2x2 2 | -------------------------------------------------------------------------------- /opensora/models/layers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/opensora/models/layers/__init__.py -------------------------------------------------------------------------------- /opensora/models/pixart/__init__.py: -------------------------------------------------------------------------------- 1 | from .pixart import PixArt, PixArt_1B_2, PixArt_XL_2 2 | from .pixart_sigma import PixArt_Sigma_XL_2 3 | -------------------------------------------------------------------------------- /opensora/models/stdit/__init__.py: 
-------------------------------------------------------------------------------- 1 | from .stdit import STDiT 2 | from .stdit2 import STDiT2 3 | from .stdit3 import STDiT3 4 | -------------------------------------------------------------------------------- /opensora/models/text_encoder/__init__.py: -------------------------------------------------------------------------------- 1 | from .classes import ClassEncoder 2 | from .clip import ClipEncoder 3 | from .t5 import T5Encoder 4 | -------------------------------------------------------------------------------- /opensora/models/text_encoder/classes.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from opensora.registry import MODELS 4 | 5 | 6 | @MODELS.register_module("classes") 7 | class ClassEncoder: 8 | def __init__(self, num_classes, model_max_length=None, device="cuda", dtype=torch.float): 9 | self.num_classes = num_classes 10 | self.y_embedder = None 11 | 12 | self.model_max_length = model_max_length 13 | self.output_dim = None 14 | self.device = device 15 | 16 | def encode(self, text): 17 | return dict(y=torch.tensor([int(t) for t in text]).to(self.device)) 18 | 19 | def null(self, n): 20 | return torch.tensor([self.num_classes] * n).to(self.device) 21 | -------------------------------------------------------------------------------- /opensora/models/vae/__init__.py: -------------------------------------------------------------------------------- 1 | from .discriminator import DISCRIMINATOR_3D 2 | from .vae import VideoAutoencoderKL, VideoAutoencoderKLTemporalDecoder 3 | from .vae_temporal import VAE_Temporal 4 | -------------------------------------------------------------------------------- /opensora/models/vae/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | """Stripped version of https://github.com/richzhang/PerceptualSimilarity/tree/master/models""" 5 | 6 | 7 | class DiagonalGaussianDistribution(object): 8 | def __init__( 9 | self, 10 | parameters, 11 | deterministic=False, 12 | ): 13 | self.parameters = parameters 14 | self.mean, self.logvar = torch.chunk(parameters, 2, dim=1) 15 | self.logvar = torch.clamp(self.logvar, -30.0, 20.0) 16 | self.deterministic = deterministic 17 | self.std = torch.exp(0.5 * self.logvar) 18 | self.var = torch.exp(self.logvar) 19 | if self.deterministic: 20 | self.var = self.std = torch.zeros_like(self.mean).to(device=self.parameters.device, dtype=self.mean.dtype) 21 | 22 | def sample(self): 23 | # torch.randn: standard normal distribution 24 | x = self.mean + self.std * torch.randn(self.mean.shape).to(device=self.parameters.device, dtype=self.mean.dtype) 25 | return x 26 | 27 | def kl(self, other=None): 28 | if self.deterministic: 29 | return torch.Tensor([0.0]) 30 | else: 31 | if other is None: # SCH: assumes other is a standard normal distribution 32 | return 0.5 * torch.sum(torch.pow(self.mean, 2) + self.var - 1.0 - self.logvar, dim=[1, 2, 3, 4]) 33 | else: 34 | return 0.5 * torch.sum( 35 | torch.pow(self.mean - other.mean, 2) / other.var 36 | + self.var / other.var 37 | - 1.0 38 | - self.logvar 39 | + other.logvar, 40 | dim=[1, 2, 3, 4], 41 | ) 42 | 43 | def nll(self, sample, dims=[1, 2, 3, 4]): 44 | if self.deterministic: 45 | return torch.Tensor([0.0]) 46 | logtwopi = np.log(2.0 * np.pi) 47 | return 0.5 * torch.sum(logtwopi + self.logvar + torch.pow(sample - self.mean, 2) / self.var, dim=dims) 48 | 49 | def mode(self): 50 | return self.mean 51 
| -------------------------------------------------------------------------------- /opensora/registry.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | 3 | import torch.nn as nn 4 | from mmengine.registry import Registry 5 | 6 | 7 | def build_module(module, builder, **kwargs): 8 | """Build module from config or return the module itself. 9 | 10 | Args: 11 | module (Union[dict, nn.Module]): The module to build. 12 | builder (Registry): The registry to build module. 13 | *args, **kwargs: Arguments passed to build function. 14 | 15 | Returns: 16 | Any: The built module. 17 | """ 18 | if module is None: 19 | return None 20 | if isinstance(module, dict): 21 | cfg = deepcopy(module) 22 | for k, v in kwargs.items(): 23 | cfg[k] = v 24 | return builder.build(cfg) 25 | elif isinstance(module, nn.Module): 26 | return module 27 | elif module is None: 28 | return None 29 | else: 30 | raise TypeError(f"Only support dict and nn.Module, but got {type(module)}.") 31 | 32 | 33 | MODELS = Registry( 34 | "model", 35 | locations=["opensora.models"], 36 | ) 37 | 38 | SCHEDULERS = Registry( 39 | "scheduler", 40 | locations=["opensora.schedulers"], 41 | ) 42 | 43 | DATASETS = Registry( 44 | "dataset", 45 | locations=["opensora.datasets"], 46 | ) 47 | -------------------------------------------------------------------------------- /opensora/schedulers/__init__.py: -------------------------------------------------------------------------------- 1 | from .dpms import DPMS 2 | from .iddpm import IDDPM 3 | from .rf import RFLOW 4 | -------------------------------------------------------------------------------- /opensora/schedulers/dpms/__init__.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | 3 | import torch 4 | 5 | from opensora.registry import SCHEDULERS 6 | 7 | from .dpm_solver import DPMS 8 | 9 | 10 | @SCHEDULERS.register_module("dpm-solver") 11 | class DPM_SOLVER: 12 | def __init__(self, num_sampling_steps=None, cfg_scale=4.0): 13 | self.num_sampling_steps = num_sampling_steps 14 | self.cfg_scale = cfg_scale 15 | 16 | def sample( 17 | self, 18 | model, 19 | text_encoder, 20 | z, 21 | prompts, 22 | device, 23 | additional_args=None, 24 | mask=None, 25 | progress=True, 26 | ): 27 | if mask is not None: 28 | print("[WARNING] mask is not supported in dpm-solver, it will be ignored") 29 | n = len(prompts) 30 | model_args = text_encoder.encode(prompts) 31 | y = model_args.pop("y") 32 | null_y = text_encoder.null(n) 33 | if additional_args is not None: 34 | model_args.update(additional_args) 35 | 36 | dpms = DPMS( 37 | partial(forward_with_dpmsolver, model), 38 | condition=y, 39 | uncondition=null_y, 40 | cfg_scale=self.cfg_scale, 41 | model_kwargs=model_args, 42 | ) 43 | samples = dpms.sample( 44 | z, 45 | steps=self.num_sampling_steps, 46 | order=2, 47 | skip_type="time_uniform", 48 | method="multistep", 49 | progress=progress, 50 | ) 51 | return samples 52 | 53 | 54 | def forward_with_dpmsolver(self, x, timestep, y, **kwargs): 55 | """ 56 | dpm solver donnot need variance prediction 57 | """ 58 | # https://github.com/openai/glide-text2im/blob/main/notebooks/text2im.ipynb 59 | model_out = self.forward(x, timestep, y, **kwargs) 60 | return model_out.chunk(2, dim=1)[0] 61 | -------------------------------------------------------------------------------- /opensora/utils/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/opensora/utils/__init__.py -------------------------------------------------------------------------------- /requirements/requirements-cu121.txt: -------------------------------------------------------------------------------- 1 | torch==2.2.2 --index-url https://download.pytorch.org/whl/cu121 2 | torchvision==0.17.2 --index-url https://download.pytorch.org/whl/cu121 3 | xformers==0.0.25.post1 --index-url https://download.pytorch.org/whl/cu121 4 | -------------------------------------------------------------------------------- /requirements/requirements-data.txt: -------------------------------------------------------------------------------- 1 | gdown>=5.2.0 2 | 3 | # [caption llava] 4 | ninja>=1.11.1.1 5 | shortuuid>=1.0.13 6 | markdown2[all] 7 | scikit-learn>=1.4.2 8 | einops-exts>=0.0.4 9 | 10 | # [camera_motion] 11 | decord==0.6.0 12 | ptvsd==4.3.2 13 | imageio-ffmpeg>=0.4.9 14 | 15 | # [datasets] 16 | ffmpeg-python==0.2.0 17 | lingua-language-detector==2.0.2 18 | 19 | # [frame interpolation] 20 | imageio>=2.34.1 21 | 22 | # [aesthetic] 23 | setuptools==68.2.2 24 | clip @ git+https://github.com/openai/CLIP.git 25 | 26 | # [ocr] 27 | mmcv==2.1.0 28 | mmdet==3.1.0 29 | mmocr==1.0.1 30 | detectron2 @ git+https://github.com/facebookresearch/detectron2.git@ff53992 31 | -------------------------------------------------------------------------------- /requirements/requirements-eval.txt: -------------------------------------------------------------------------------- 1 | # [vbench] 2 | detectron2 @ git+https://github.com/facebookresearch/detectron2.git@ff53992 3 | imageio>=2.34.1 4 | pyiqa==0.1.10 5 | scikit-learn>=1.4.2 6 | scikit-image>=0.20.0 7 | lvis==0.5.3 8 | boto3>=1.34.113 9 | easydict>=1.9 10 | fairscale>=0.4.13 11 | 12 | # [vae] 13 | decord==0.6.0 14 | pytorchvideo==0.1.5 15 | lpips==0.1.4 16 | -------------------------------------------------------------------------------- /requirements/requirements-vae.txt: -------------------------------------------------------------------------------- 1 | beartype==0.18.5 2 | einops==0.8.0 3 | einops-exts==0.0.4 4 | opencv-python==4.9.0.80 5 | pillow==10.3.0 6 | -------------------------------------------------------------------------------- /requirements/requirements.txt: -------------------------------------------------------------------------------- 1 | colossalai>=0.4.0 2 | mmengine>=0.10.3 3 | pandas>=2.0.3 4 | timm==0.9.16 5 | rotary_embedding_torch==0.5.3 6 | ftfy>=6.2.0 # for t5 7 | diffusers==0.27.2 # for vae 8 | accelerate==0.29.2 # for t5 9 | av>=12.0.0 # for video loading 10 | numpy<2.0.0 11 | 12 | # [gradio] 13 | gradio>=4.26.0 14 | spaces>=0.28.3 15 | 16 | # [notebook] 17 | ipykernel>=6.29.4 18 | ipywidgets>=8.1.2 19 | 20 | # [training] 21 | wandb>=0.17.0 22 | tensorboard>=2.14.0 23 | pandarallel>=1.6.5 24 | pyarrow>=16.1.0 # for parquet 25 | 26 | # [dev] 27 | pre-commit>=3.5.0 28 | openai 29 | -------------------------------------------------------------------------------- /scripts/clear_cache.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the input file is provided 4 | if [ $# -ne 1 ]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | 10 | input_file="$1" 11 | 12 | # Check if the input file exists 13 | if [ ! -f "$input_file" ]; then 14 | echo "Error: Input file '$input_file' does not exist." 
15 | exit 1 16 | fi 17 | 18 | while IFS= read -r hostname || [ -n "$hostname" ]; do 19 | if [ -n "$hostname" ]; then 20 | ssh "$hostname" "rm -rf /home/ubuntu/.cache/colossalai/" & 21 | fi 22 | done < "$input_file" 23 | 24 | -------------------------------------------------------------------------------- /scripts/misc/launch_extract_feat.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -x 4 | set -e 5 | 6 | START_SPLIT=0 7 | NUM_SPLIT=10 8 | 9 | DATA_PATH=$1 10 | SAVE_PATH=$2 11 | DATA_ARG="--data-path $DATA_PATH" 12 | SAVE_ARG="--save-dir $SAVE_PATH" 13 | 14 | CMD="torchrun --standalone --nproc_per_node 1 scripts/misc/extract_feat.py configs/opensora-v1-2/misc/extract.py $DATA_ARG $SAVE_ARG" 15 | declare -a GPUS=(0 1 2 3 4 5 6 7) 16 | 17 | mkdir -p logs/extract_feat 18 | 19 | for i in "${GPUS[@]}"; do 20 | CUDA_VISIBLE_DEVICES=$i $CMD --start-index $(($START_SPLIT + i * $NUM_SPLIT)) --end-index $(($START_SPLIT + (i + 1) * $NUM_SPLIT)) >logs/extract_feat/$i.log 2>&1 & 21 | done 22 | -------------------------------------------------------------------------------- /scripts/misc/launch_search_bs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -x 4 | set -e 5 | 6 | CMD="torchrun --standalone --nproc_per_node 1 scripts/misc/search_bs.py configs/opensora-v1-2/misc/bs.py" 7 | DATA_PATH="/mnt/nfs-207/sora_data/meta/searchbs.csv" 8 | 9 | LOG_BASE=logs/search_bs 10 | mkdir -p logs/search_bs 11 | echo "Logging to $LOG_BASE" 12 | 13 | CUDA_VISIBLE_DEVICES=0 $CMD --data-path $DATA_PATH --resolution 144p >${LOG_BASE}/144p.log 2>&1 & 14 | CUDA_VISIBLE_DEVICES=1 $CMD --data-path $DATA_PATH --resolution 240p >${LOG_BASE}/240p.log 2>&1 & 15 | CUDA_VISIBLE_DEVICES=2 $CMD --data-path $DATA_PATH --resolution 512 >${LOG_BASE}/512.log 2>&1 & 16 | CUDA_VISIBLE_DEVICES=3 $CMD --data-path $DATA_PATH --resolution 480p >${LOG_BASE}/480p.log 2>&1 & 17 | CUDA_VISIBLE_DEVICES=4 $CMD --data-path $DATA_PATH --resolution 1024 >${LOG_BASE}/1024.log 2>&1 & 18 | CUDA_VISIBLE_DEVICES=5 $CMD --data-path $DATA_PATH --resolution 1080p >${LOG_BASE}/1080p.log 2>&1 & 19 | CUDA_VISIBLE_DEVICES=6 $CMD --data-path $DATA_PATH --resolution 2048 >${LOG_BASE}/2048.log 2>&1 & 20 | -------------------------------------------------------------------------------- /tests/test_attn.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from colossalai.accelerator import get_accelerator 3 | from colossalai.utils import get_current_device 4 | from rotary_embedding_torch import RotaryEmbedding 5 | 6 | from opensora.models.layers.blocks import Attention 7 | 8 | # B, S, H = 7488, 1, 1152 9 | # B, S, H = 32, 234, 1152 10 | B, S, H = 128, 32, 1152 11 | N, D = 16, 72 12 | 13 | 14 | def run_attn(enable_flash_attn: bool): 15 | get_accelerator().reset_peak_memory_stats() 16 | rope = RotaryEmbedding(D).to(device=get_current_device(), dtype=torch.bfloat16) 17 | attn = Attention( 18 | H, 19 | N, 20 | qkv_bias=True, 21 | rope=rope.rotate_queries_or_keys, 22 | enable_flash_attn=enable_flash_attn, 23 | ).to(device=get_current_device(), dtype=torch.bfloat16) 24 | x = torch.randn(B, S, H, device=get_current_device(), dtype=torch.bfloat16).requires_grad_() 25 | y = attn(x) 26 | y.mean().backward() 27 | print(f"Peak memory: {get_accelerator().max_memory_allocated() / 1024**2:.2f} MB") 28 | 29 | 30 | if __name__ == "__main__": 31 | print("Use flashattn") 32 | run_attn(True) 33 | print("No 
flashattn") 34 | run_attn(False) 35 | -------------------------------------------------------------------------------- /tests/test_lr_scheduler.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.optim import Adam 3 | from torchvision.models import resnet50 4 | from tqdm import tqdm 5 | 6 | from opensora.utils.lr_scheduler import LinearWarmupLR 7 | 8 | 9 | def test_lr_scheduler(): 10 | warmup_steps = 200 11 | model = resnet50().cuda() 12 | optimizer = Adam(model.parameters(), lr=0.01) 13 | scheduler = LinearWarmupLR(optimizer, warmup_steps=warmup_steps) 14 | current_lr = scheduler.get_lr()[0] 15 | data = torch.rand(1, 3, 224, 224).cuda() 16 | 17 | for i in tqdm(range(warmup_steps * 2)): 18 | out = model(data) 19 | out.mean().backward() 20 | optimizer.step() 21 | scheduler.step() 22 | 23 | if i >= warmup_steps: 24 | assert scheduler.get_lr()[0] == 0.01 25 | else: 26 | assert scheduler.get_lr()[0] > current_lr, f"{scheduler.get_lr()[0]} <= {current_lr}" 27 | current_lr = scheduler.get_lr()[0] 28 | 29 | 30 | if __name__ == "__main__": 31 | test_lr_scheduler() 32 | -------------------------------------------------------------------------------- /tests/test_pos_emb.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | 4 | from opensora.models.layers.blocks import PositionEmbedding2D, get_2d_sincos_pos_embed 5 | 6 | D = 8 7 | SCALE = 2.0 8 | from torch.testing import assert_close 9 | 10 | 11 | def get_spatial_pos_embed(x, hidden_size, h, w, scale, base_size=None): 12 | pos_embed = get_2d_sincos_pos_embed( 13 | hidden_size, 14 | (h, w), 15 | scale=scale, 16 | base_size=base_size, 17 | ) 18 | pos_embed = torch.from_numpy(pos_embed).float().unsqueeze(0).requires_grad_(False) 19 | return pos_embed.to(device=x.device, dtype=x.dtype) 20 | 21 | 22 | @pytest.mark.parametrize("dtype", [torch.float, torch.float16]) 23 | @pytest.mark.parametrize("device", ["cpu", "cuda"]) 24 | def test_pos_emb(dtype, device): 25 | # just a placeholder to get the device and dtype 26 | x = torch.empty(1, dtype=dtype, device=device) 27 | pos_embedder = PositionEmbedding2D( 28 | D, 29 | max_position_embeddings=8, 30 | scale=SCALE, 31 | ).to(device=device, dtype=dtype) 32 | output = pos_embedder(x, 8, 7) 33 | target = get_spatial_pos_embed(x, D, 8, 7, SCALE) 34 | assert_close(output, target) 35 | output = pos_embedder(x, 15, 16) 36 | target = get_spatial_pos_embed(x, D, 15, 16, SCALE) 37 | assert_close(output, target) 38 | output = pos_embedder(x, 30, 20, base_size=2) 39 | target = get_spatial_pos_embed(x, D, 30, 20, SCALE, base_size=2) 40 | assert_close(output, target) 41 | # test cache 42 | output = pos_embedder(x, 30, 20, base_size=2) 43 | target = get_spatial_pos_embed(x, D, 30, 20, SCALE, base_size=2) 44 | assert_close(output, target) 45 | assert pos_embedder._get_cached_emb.cache_info().hits >= 1 46 | -------------------------------------------------------------------------------- /tests/test_t5_shardformer.py: -------------------------------------------------------------------------------- 1 | import time 2 | from copy import deepcopy 3 | 4 | import colossalai 5 | import torch 6 | from colossalai.shardformer import ShardConfig, ShardFormer 7 | from colossalai.testing import spawn 8 | 9 | from opensora.acceleration.shardformer.policy.t5_encoder import T5EncoderPolicy 10 | from opensora.models.text_encoder.t5 import T5Embedder 11 | 12 | 13 | def run_t5_encoder(rank, world_size, port): 14 | 
colossalai.launch({}, rank=rank, world_size=world_size, port=port, host="localhost") 15 | 16 | # t5 embedder 17 | t5_path = "./pretrained_models/t5_ckpts" 18 | hf_t5 = T5Embedder(device="cuda", local_cache=True, cache_dir=t5_path, torch_dtype=torch.float) 19 | sf_t5 = deepcopy(hf_t5) 20 | 21 | # create huggingface model as normal 22 | shard_config = ShardConfig( 23 | tensor_parallel_process_group=None, 24 | pipeline_stage_manager=None, 25 | enable_tensor_parallelism=False, 26 | enable_fused_normalization=False, 27 | enable_flash_attention=False, 28 | enable_jit_fused=True, 29 | enable_sequence_parallelism=False, 30 | enable_sequence_overlap=False, 31 | ) 32 | shard_former = ShardFormer(shard_config=shard_config) 33 | sharded_model, _ = shard_former.optimize(sf_t5.model, policy=T5EncoderPolicy()) 34 | sf_t5.model = sharded_model 35 | 36 | # test t5 embedder 37 | texts = ["Who is the best player in the history of NBA?", "How to study computer science?"] 38 | for i in range(5): 39 | hf_embs, hf_masks = hf_t5.get_text_embeddings(texts) 40 | sf_embs, sf_masks = sf_t5.get_text_embeddings(texts) 41 | 42 | # check accuracy 43 | assert torch.allclose(hf_embs, sf_embs, rtol=1e-4, atol=1e-5), f"{hf_embs} \nvs\n{sf_embs}" 44 | assert torch.allclose(hf_masks, sf_masks), f"{hf_masks} \nvs\n{sf_masks}" 45 | 46 | # measure perf 47 | torch.cuda.synchronize() 48 | hf_start = time.time() 49 | for i in range(20): 50 | hf_embs, hf_masks = hf_t5.get_text_embeddings(texts) 51 | torch.cuda.synchronize() 52 | hf_end = time.time() 53 | 54 | # convert sf to fp16 55 | hf_t5.model = hf_t5.model.half() 56 | torch.cuda.synchronize() 57 | sf_start = time.time() 58 | for i in range(20): 59 | hf_embs, hf_masks = hf_t5.get_text_embeddings(texts) 60 | torch.cuda.synchronize() 61 | sf_end = time.time() 62 | 63 | print(f"[Performance] native: {hf_end - hf_start}s, shardformer: {sf_end - sf_start} s") 64 | 65 | 66 | def test_t5_encoder(): 67 | spawn(run_t5_encoder) 68 | 69 | 70 | if __name__ == "__main__": 71 | test_t5_encoder() 72 | -------------------------------------------------------------------------------- /tools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/tools/__init__.py -------------------------------------------------------------------------------- /tools/caption/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/tools/caption/__init__.py -------------------------------------------------------------------------------- /tools/caption/acceleration/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/tools/caption/acceleration/__init__.py -------------------------------------------------------------------------------- /tools/caption/acceleration/llava/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/tools/caption/acceleration/llava/__init__.py -------------------------------------------------------------------------------- /tools/caption/acceleration/llava/policies/__init__.py: -------------------------------------------------------------------------------- 1 
| from .llama import LlavaLlamaForCausalLMPolicy 2 | from .mistral import LlavaMistralForCausalLMPolicy 3 | -------------------------------------------------------------------------------- /tools/caption/camera_motion/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/tools/caption/camera_motion/__init__.py -------------------------------------------------------------------------------- /tools/caption/camera_motion/detect.py: -------------------------------------------------------------------------------- 1 | # Originally developed by https://github.com/Vchitect/VBench based on https://github.com/facebookresearch/co-tracker. 2 | 3 | import argparse 4 | from typing import List 5 | 6 | import pandas as pd 7 | 8 | from .camera_motion import compute_camera_motion 9 | 10 | 11 | def process(paths: List[str], threshold: float) -> List[str]: 12 | device = "cuda" 13 | submodules = {"repo": "facebookresearch/co-tracker", "model": "cotracker2"} 14 | camera_motion_types = compute_camera_motion(device, submodules, paths, factor=threshold) 15 | return camera_motion_types 16 | 17 | 18 | def main(args): 19 | output_file = args.input.replace(".csv", "_cmotion.csv") 20 | data = pd.read_csv(args.input) 21 | data["cmotion"] = process(data["path"], args.threshold) 22 | data.to_csv(output_file, index=False) 23 | print(f"Output saved to {output_file}") 24 | 25 | 26 | if __name__ == "__main__": 27 | parser = argparse.ArgumentParser() 28 | parser.add_argument("input", type=str) 29 | parser.add_argument("--threshold", type=float, default=0.25) 30 | args = parser.parse_args() 31 | main(args) 32 | -------------------------------------------------------------------------------- /tools/caption/camera_motion/requirements.txt: -------------------------------------------------------------------------------- 1 | decord 2 | ptvsd 3 | imageio-ffmpeg 4 | -------------------------------------------------------------------------------- /tools/datasets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/tools/datasets/__init__.py -------------------------------------------------------------------------------- /tools/datasets/ffmpeg_check_parallel.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | parallel --progress -j+0 'ffmpeg -v error -i {} -f null - 2>{}.err' < $1 3 | -------------------------------------------------------------------------------- /tools/datasets/ffmpeg_filter_without_errors.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | from concurrent.futures import ProcessPoolExecutor, as_completed 4 | import multiprocessing 5 | from tqdm import tqdm 6 | 7 | def check_error_file(row): 8 | # Construct the path to the .err file 9 | err_file = f"{row['path']}.err" 10 | 11 | # Check if the .err file exists and is empty 12 | if not os.path.exists(err_file): 13 | return row 14 | elif os.path.exists(err_file) and os.path.getsize(err_file) == 0: 15 | return row 16 | return None 17 | 18 | def filter_csv(input_csv): 19 | # Read the CSV into a pandas DataFrame 20 | df = pd.read_csv(input_csv) 21 | 22 | # Initialize tqdm progress bar 23 | progress_bar = tqdm(total=len(df), desc="Processing files") 24 | 25 | # Use 
ProcessPoolExecutor for parallel processing 26 | with ProcessPoolExecutor(max_workers=multiprocessing.cpu_count()) as executor: 27 | # Submit tasks for parallel execution 28 | futures = {executor.submit(check_error_file, row): index for index, row in df.iterrows()} 29 | 30 | # Collect results as they complete 31 | filtered_rows = [] 32 | for future in as_completed(futures): 33 | result = future.result() 34 | if result is not None: 35 | filtered_rows.append(result) 36 | progress_bar.update(1) 37 | 38 | # Close the progress bar 39 | progress_bar.close() 40 | 41 | # Create a new DataFrame from the filtered rows 42 | filtered_df = pd.DataFrame(filtered_rows, columns=df.columns) 43 | 44 | # Generate the output file name 45 | output_csv = input_csv.replace('.csv', '_withouterror.csv') 46 | 47 | # Write the filtered DataFrame to a new CSV file 48 | filtered_df.to_csv(output_csv, index=False) 49 | print(f"Filtered CSV saved as: {output_csv}") 50 | 51 | if __name__ == "__main__": 52 | # Replace 'yourfile.csv' with the actual CSV file you want to process 53 | import sys 54 | input_csv = sys.argv[-1] 55 | filter_csv(input_csv) 56 | 57 | -------------------------------------------------------------------------------- /tools/datasets/filter_large_videos.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import pandas as pd 4 | from tqdm import tqdm 5 | from concurrent.futures import ThreadPoolExecutor 6 | 7 | # Function to get the size of a file 8 | def get_file_size(path): 9 | return os.path.getsize(path) if os.path.isfile(path) else None 10 | 11 | # Load the CSV file 12 | file_path = sys.argv[-2] 13 | size = int(sys.argv[-1]) 14 | assert len(sys.argv) == 3 15 | df = pd.read_csv(file_path) 16 | 17 | # Enable tqdm to monitor progress 18 | tqdm.pandas() 19 | 20 | # Use ThreadPoolExecutor for parallel processing 21 | with ThreadPoolExecutor(max_workers=4) as executor: # Adjust max_workers based on your CPU cores 22 | df['file_size'] = list(tqdm(executor.map(get_file_size, df['path']), total=len(df))) 23 | 24 | # Convert 50 MB to bytes 25 | size_threshold = size * 1024 * 1024 # 50 MB in bytes 26 | 27 | # Drop rows where file size is 50 MB or more 28 | df_filtered = df[df['file_size'] < size_threshold] 29 | 30 | # Save the filtered DataFrame back to the original CSV file 31 | file_path = file_path.replace(".csv", f"_le{size}M.csv") 32 | df_filtered.to_csv(file_path.replace(".csv", f"_le{size}M.csv"), index=False) 33 | print(f"Saved filtered data to {file_path}.") 34 | 35 | -------------------------------------------------------------------------------- /tools/datasets/split.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from typing import List 3 | 4 | import pandas as pd 5 | from mmengine.config import Config 6 | 7 | from opensora.datasets.bucket import Bucket 8 | 9 | 10 | def split_by_bucket( 11 | bucket: Bucket, 12 | input_files: List[str], 13 | output_path: str, 14 | limit: int, 15 | frame_interval: int, 16 | ): 17 | print(f"Split {len(input_files)} files into {len(bucket)} buckets") 18 | total_limit = len(bucket) * limit 19 | bucket_cnt = {} 20 | # get all bucket id 21 | for hw_id, d in bucket.ar_criteria.items(): 22 | for t_id, v in d.items(): 23 | for ar_id in v.keys(): 24 | bucket_id = (hw_id, t_id, ar_id) 25 | bucket_cnt[bucket_id] = 0 26 | output_df = None 27 | # split files 28 | for path in input_files: 29 | df = pd.read_csv(path) 30 | if output_df is None: 31 | 
output_df = pd.DataFrame(columns=df.columns) 32 | for i in range(len(df)): 33 | row = df.iloc[i] 34 | t, h, w = row["num_frames"], row["height"], row["width"] 35 | bucket_id = bucket.get_bucket_id(t, h, w, frame_interval) 36 | if bucket_id is None: 37 | continue 38 | if bucket_cnt[bucket_id] < limit: 39 | bucket_cnt[bucket_id] += 1 40 | output_df = pd.concat([output_df, pd.DataFrame([row])], ignore_index=True) 41 | if len(output_df) >= total_limit: 42 | break 43 | if len(output_df) >= total_limit: 44 | break 45 | assert len(output_df) <= total_limit 46 | if len(output_df) == total_limit: 47 | print(f"All buckets are full ({total_limit} samples)") 48 | else: 49 | print(f"Only {len(output_df)} files are used") 50 | output_df.to_csv(output_path, index=False) 51 | 52 | 53 | if __name__ == "__main__": 54 | parser = argparse.ArgumentParser() 55 | parser.add_argument("input", type=str, nargs="+") 56 | parser.add_argument("-o", "--output", required=True) 57 | parser.add_argument("-c", "--config", required=True) 58 | parser.add_argument("-l", "--limit", default=200, type=int) 59 | args = parser.parse_args() 60 | assert args.limit > 0 61 | 62 | cfg = Config.fromfile(args.config) 63 | bucket_config = cfg.bucket_config 64 | # rewrite bucket_config 65 | for ar, d in bucket_config.items(): 66 | for frames, t in d.items(): 67 | p, bs = t 68 | if p > 0.0: 69 | p = 1.0 70 | d[frames] = (p, bs) 71 | bucket = Bucket(bucket_config) 72 | split_by_bucket(bucket, args.input, args.output, args.limit, cfg.dataset.frame_interval) 73 | -------------------------------------------------------------------------------- /tools/frame_interpolation/README.md: -------------------------------------------------------------------------------- 1 | # Frame Interpolation 2 | 3 | For current version, we sample 1 frame out of 3 frames in the video. Although we are going to use VAE to avoid frame loss, we provide a frame interpolation tool to interpolate the video now. The frame interpolation tool is based on [AMT](https://github.com/MCG-NKU/AMT). 4 | 5 | Interpolation can be useful for scenery videos, but it may not be suitable for videos with fast motion. 6 | 7 | ## Requirement 8 | 9 | Install the required dependancies by following our [installation instructions](../../docs/installation.md)'s "Data Dependencies" and "Frame Interpolation" sections. 10 | 11 | 15 | 16 | ## Model 17 | 18 | We use **AMT** as our frame interpolation model. After sampling, you can use frame interpolation model to interpolate your video smoothly. 19 | 20 | ## Usage 21 | 22 | The ckpt file will be automatically downloaded in user's `.cache` directory. You can use frame interpolation to your video file or a video folder. 23 | 24 | 1. Process a video file 25 | 26 | ```python 27 | python -m tools.frame_interpolation.interpolation your_video.mp4 28 | ``` 29 | 30 | 2. Process all video file in target directory 31 | 32 | ```python 33 | python -m tools.frame_interpolation.interpolation your_video_dir --output_path samples/interpolation 34 | ``` 35 | 36 | The output video will be stored at `output_path` and its duration time is equal `the total number of frames after frame interpolation / the frame rate` 37 | 38 | ### Command Line Arguments 39 | 40 | * `input`: Path of the input video. **Video path** or **Folder path(with --folder)** 41 | * `--ckpt`: Pretrained model of [AMT](https://github.com/MCG-NKU/AMT). Default path: `~/.cache/amt-g.pth`. 42 | * `--niter`: Iterations of interpolation. 
With $m$ input frames, `[N_ITER]` $=n$ corresponds to $2^n\times (m-1)+1$ output frames. 43 | * `--fps`: Frame rate of the input video. (Default: 8) 44 | * `--output_path`: **Folder Path** of the output video. 45 | -------------------------------------------------------------------------------- /tools/frame_interpolation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/tools/frame_interpolation/__init__.py -------------------------------------------------------------------------------- /tools/frame_interpolation/networks/__init__.py: -------------------------------------------------------------------------------- 1 | from .amt_g import Model 2 | -------------------------------------------------------------------------------- /tools/frame_interpolation/networks/blocks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/tools/frame_interpolation/networks/blocks/__init__.py -------------------------------------------------------------------------------- /tools/frame_interpolation/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/tools/frame_interpolation/utils/__init__.py -------------------------------------------------------------------------------- /tools/frame_interpolation/utils/dist_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import torch 4 | 5 | 6 | def get_world_size(): 7 | """Find OMPI world size without calling mpi functions 8 | :rtype: int 9 | """ 10 | if os.environ.get("PMI_SIZE") is not None: 11 | return int(os.environ.get("PMI_SIZE") or 1) 12 | elif os.environ.get("OMPI_COMM_WORLD_SIZE") is not None: 13 | return int(os.environ.get("OMPI_COMM_WORLD_SIZE") or 1) 14 | else: 15 | return torch.cuda.device_count() 16 | 17 | 18 | def get_global_rank(): 19 | """Find OMPI world rank without calling mpi functions 20 | :rtype: int 21 | """ 22 | if os.environ.get("PMI_RANK") is not None: 23 | return int(os.environ.get("PMI_RANK") or 0) 24 | elif os.environ.get("OMPI_COMM_WORLD_RANK") is not None: 25 | return int(os.environ.get("OMPI_COMM_WORLD_RANK") or 0) 26 | else: 27 | return 0 28 | 29 | 30 | def get_local_rank(): 31 | """Find OMPI local rank without calling mpi functions 32 | :rtype: int 33 | """ 34 | if os.environ.get("MPI_LOCALRANKID") is not None: 35 | return int(os.environ.get("MPI_LOCALRANKID") or 0) 36 | elif os.environ.get("OMPI_COMM_WORLD_LOCAL_RANK") is not None: 37 | return int(os.environ.get("OMPI_COMM_WORLD_LOCAL_RANK") or 0) 38 | else: 39 | return 0 40 | 41 | 42 | def get_master_ip(): 43 | if os.environ.get("AZ_BATCH_MASTER_NODE") is not None: 44 | return os.environ.get("AZ_BATCH_MASTER_NODE").split(":")[0] 45 | elif os.environ.get("AZ_BATCHAI_MPI_MASTER_NODE") is not None: 46 | return os.environ.get("AZ_BATCHAI_MPI_MASTER_NODE") 47 | else: 48 | return "127.0.0.1" 49 | -------------------------------------------------------------------------------- /tools/scene_cut/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/tools/scene_cut/__init__.py 
-------------------------------------------------------------------------------- /tools/scene_cut/scene_detect.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | import numpy as np 5 | import pandas as pd 6 | from pandarallel import pandarallel 7 | from scenedetect import AdaptiveDetector, detect 8 | from tqdm import tqdm 9 | 10 | tqdm.pandas() 11 | 12 | 13 | def process_single_row(row): 14 | # windows 15 | # from scenedetect import detect, ContentDetector, AdaptiveDetector 16 | 17 | video_path = row["path"] 18 | 19 | detector = AdaptiveDetector( 20 | adaptive_threshold=3.0, 21 | # luma_only=True, 22 | ) 23 | # detector = ContentDetector() 24 | # TODO: catch error here 25 | try: 26 | scene_list = detect(video_path, detector, start_in_scene=True) 27 | timestamp = [(s.get_timecode(), t.get_timecode()) for s, t in scene_list] 28 | return True, str(timestamp) 29 | except Exception as e: 30 | print(f"Video '{video_path}' with error {e}") 31 | return False, "" 32 | 33 | 34 | def parse_args(): 35 | parser = argparse.ArgumentParser() 36 | parser.add_argument("meta_path", type=str) 37 | parser.add_argument("--num_workers", type=int, default=None, help="#workers for pandarallel") 38 | 39 | args = parser.parse_args() 40 | return args 41 | 42 | 43 | def main(): 44 | args = parse_args() 45 | meta_path = args.meta_path 46 | if not os.path.exists(meta_path): 47 | print(f"Meta file '{meta_path}' not found. Exit.") 48 | exit() 49 | 50 | if args.num_workers is not None: 51 | pandarallel.initialize(progress_bar=True, nb_workers=args.num_workers) 52 | else: 53 | pandarallel.initialize(progress_bar=True) 54 | 55 | meta = pd.read_csv(meta_path) 56 | ret = meta.parallel_apply(process_single_row, axis=1) 57 | 58 | succ, timestamps = list(zip(*ret)) 59 | meta["timestamp"] = timestamps 60 | meta = meta[np.array(succ)] 61 | 62 | wo_ext, ext = os.path.splitext(meta_path) 63 | out_path = f"{wo_ext}_timestamp{ext}" 64 | meta.to_csv(out_path, index=False) 65 | print(f"New meta (shape={meta.shape}) with timestamp saved to '{out_path}'.") 66 | 67 | 68 | if __name__ == "__main__": 69 | main() 70 | -------------------------------------------------------------------------------- /tools/scoring/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/tools/scoring/__init__.py -------------------------------------------------------------------------------- /tools/scoring/aesthetic/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/tools/scoring/aesthetic/__init__.py -------------------------------------------------------------------------------- /tools/scoring/matching/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/tools/scoring/matching/__init__.py -------------------------------------------------------------------------------- /tools/scoring/ocr/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/tools/scoring/ocr/__init__.py 
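Note: `scene_detect.py` above stores each video's detected scene boundaries in a new `timestamp` column as a stringified list of `(start, end)` timecode pairs. A minimal sketch of how that column could be consumed downstream — the CSV filename and the ffmpeg flags are illustrative assumptions, not commands taken from this repository:

```python
# Sketch: parse the "timestamp" column written by scene_detect.py and emit
# one ffmpeg cut command per detected scene (flags are illustrative only).
from ast import literal_eval

import pandas as pd

meta = pd.read_csv("meta_timestamp.csv")  # hypothetical output of scene_detect.py
for _, row in meta.iterrows():
    scenes = literal_eval(row["timestamp"])  # e.g. [("00:00:00.000", "00:00:05.200"), ...]
    for idx, (start, end) in enumerate(scenes):
        out_path = f"{row['path']}_scene{idx:03d}.mp4"
        print(f"ffmpeg -i {row['path']} -ss {start} -to {end} -c copy {out_path}")
```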
-------------------------------------------------------------------------------- /tools/scoring/ocr/dbnetpp.py: -------------------------------------------------------------------------------- 1 | model = dict( 2 | type="DBNet", 3 | backbone=dict( 4 | type="CLIPResNet", 5 | depth=50, 6 | num_stages=4, 7 | out_indices=(0, 1, 2, 3), 8 | frozen_stages=-1, 9 | norm_cfg=dict(type="BN", requires_grad=True), 10 | norm_eval=False, 11 | style="pytorch", 12 | dcn=dict(type="DCNv2", deform_groups=1, fallback_on_stride=False), 13 | # init_cfg=dict( 14 | # type='Pretrained', 15 | # checkpoint='https://download.openmmlab.com/mmocr/backbone/resnet50-oclip-7ba0c533.pth'), 16 | stage_with_dcn=(False, True, True, True), 17 | ), 18 | neck=dict( 19 | type="FPNC", 20 | in_channels=[256, 512, 1024, 2048], 21 | lateral_channels=256, 22 | asf_cfg=dict(attention_type="ScaleChannelSpatial"), 23 | ), 24 | det_head=dict( 25 | type="DBHead", 26 | in_channels=256, 27 | module_loss=dict(type="DBModuleLoss"), 28 | postprocessor=dict( 29 | type="DBPostprocessor", 30 | text_repr_type="quad", 31 | epsilon_ratio=0.002, 32 | ), 33 | ), 34 | data_preprocessor=dict( 35 | type="TextDetDataPreprocessor", 36 | mean=[123.675, 116.28, 103.53], 37 | std=[58.395, 57.12, 57.375], 38 | bgr_to_rgb=True, 39 | pad_size_divisor=32, 40 | ), 41 | init_cfg=dict( 42 | type="Pretrained", 43 | checkpoint="https://download.openmmlab.com/mmocr/textdet/dbnetpp/" 44 | "dbnetpp_resnet50-oclip_fpnc_1200e_icdar2015/" 45 | "dbnetpp_resnet50-oclip_fpnc_1200e_icdar2015_20221101_124139-4ecb39ac.pth", 46 | ), 47 | ) 48 | 49 | test_pipeline = [ 50 | # dict(type='LoadImageFromFile', color_type='color_ignore_orientation'), 51 | dict(type="Resize", scale=(4068, 1024), keep_ratio=True), 52 | dict( 53 | type="PackTextDetInputs", 54 | # meta_keys=('img_path', 'ori_shape', 'img_shape', 'scale_factor'), 55 | meta_keys=("img_shape", "scale_factor"), 56 | ), 57 | ] 58 | 59 | # Visualization 60 | vis_backends = [dict(type="LocalVisBackend")] 61 | visualizer = dict( 62 | type="TextDetLocalVisualizer", 63 | name="visualizer", 64 | vis_backends=vis_backends, 65 | ) 66 | -------------------------------------------------------------------------------- /tools/scoring/optical_flow/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LambdaLabsML/Open-Sora/ee663be7ab16e967d9bd96270008e61c799870d1/tools/scoring/optical_flow/__init__.py -------------------------------------------------------------------------------- /tools/scoring/optical_flow/unimatch/__init__.py: -------------------------------------------------------------------------------- 1 | from .unimatch import UniMatch 2 | -------------------------------------------------------------------------------- /tools/scoring/optical_flow/unimatch/position.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | # https://github.com/facebookresearch/detr/blob/main/models/position_encoding.py 3 | 4 | import math 5 | 6 | import torch 7 | import torch.nn as nn 8 | 9 | 10 | class PositionEmbeddingSine(nn.Module): 11 | """ 12 | This is a more standard version of the position embedding, very similar to the one 13 | used by the Attention is all you need paper, generalized to work on images. 
14 | """ 15 | 16 | def __init__(self, num_pos_feats=64, temperature=10000, normalize=True, scale=None): 17 | super().__init__() 18 | self.num_pos_feats = num_pos_feats 19 | self.temperature = temperature 20 | self.normalize = normalize 21 | if scale is not None and normalize is False: 22 | raise ValueError("normalize should be True if scale is passed") 23 | if scale is None: 24 | scale = 2 * math.pi 25 | self.scale = scale 26 | 27 | def forward(self, x): 28 | # x = tensor_list.tensors # [B, C, H, W] 29 | # mask = tensor_list.mask # [B, H, W], input with padding, valid as 0 30 | b, c, h, w = x.size() 31 | mask = torch.ones((b, h, w), device=x.device) # [B, H, W] 32 | y_embed = mask.cumsum(1, dtype=torch.float32) 33 | x_embed = mask.cumsum(2, dtype=torch.float32) 34 | if self.normalize: 35 | eps = 1e-6 36 | y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale 37 | x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale 38 | 39 | dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) 40 | dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) 41 | 42 | pos_x = x_embed[:, :, :, None] / dim_t 43 | pos_y = y_embed[:, :, :, None] / dim_t 44 | pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3) 45 | pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3) 46 | pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) 47 | return pos 48 | --------------------------------------------------------------------------------
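Note: a minimal usage sketch for `PositionEmbeddingSine` above, checking shapes only — the dummy feature-map size is an arbitrary assumption, and the import path assumes the repository root is on `PYTHONPATH`:

```python
# Sketch: PositionEmbeddingSine maps a [B, C, H, W] feature map to a
# positional encoding of shape [B, 2 * num_pos_feats, H, W].
import torch

from tools.scoring.optical_flow.unimatch.position import PositionEmbeddingSine

pos_enc = PositionEmbeddingSine(num_pos_feats=64)
feat = torch.randn(2, 128, 32, 32)    # dummy [B, C, H, W] features
pos = pos_enc(feat)
assert pos.shape == (2, 128, 32, 32)  # 2 * 64 sin/cos channels, same spatial size
```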