├── Device_conversion ├── README.md ├── configs │ ├── dit │ │ ├── inference │ │ │ ├── 16x256x256.py │ │ │ ├── 1x256x256-class.py │ │ │ └── 1x256x256.py │ │ └── train │ │ │ ├── 16x256x256.py │ │ │ └── 1x256x256.py │ ├── latte │ │ ├── inference │ │ │ ├── 16x256x256-class.py │ │ │ └── 16x256x256.py │ │ └── train │ │ │ └── 16x256x256.py │ ├── opensora-v1-1 │ │ ├── inference │ │ │ ├── sample-ref.py │ │ │ └── sample.py │ │ └── train │ │ │ ├── benchmark.py │ │ │ ├── image.py │ │ │ ├── image_rflow.py │ │ │ ├── stage1.py │ │ │ ├── stage2.py │ │ │ ├── stage3.py │ │ │ └── video.py │ ├── opensora-v1-2 │ │ ├── inference │ │ │ └── sample.py │ │ ├── misc │ │ │ ├── bs.py │ │ │ ├── eval_loss.py │ │ │ ├── extract.py │ │ │ └── feat.py │ │ └── train │ │ │ ├── adapt.py │ │ │ ├── demo_360p.py │ │ │ ├── demo_480p.py │ │ │ ├── stage1.py │ │ │ ├── stage1_feat.py │ │ │ ├── stage2.py │ │ │ └── stage3.py │ ├── opensora │ │ ├── inference │ │ │ ├── 16x256x256.py │ │ │ ├── 16x512x512-rflow.py │ │ │ ├── 16x512x512.py │ │ │ └── 64x512x512.py │ │ └── train │ │ │ ├── 16x256x256-mask.py │ │ │ ├── 16x256x256-spee-rflow.py │ │ │ ├── 16x256x256-spee.py │ │ │ ├── 16x256x256.py │ │ │ ├── 16x512x512.py │ │ │ ├── 360x512x512.py │ │ │ ├── 64x512x512-sp.py │ │ │ └── 64x512x512.py │ ├── pixart │ │ ├── inference │ │ │ ├── 16x256x256.py │ │ │ ├── 1x1024MS.py │ │ │ ├── 1x20481B.py │ │ │ ├── 1x2048MS.py │ │ │ ├── 1x256x256.py │ │ │ ├── 1x512x512-rflow.py │ │ │ └── 1x512x512.py │ │ └── train │ │ │ ├── 16x256x256.py │ │ │ ├── 1x2048x2048.py │ │ │ ├── 1x512x512-rflow.py │ │ │ ├── 1x512x512.py │ │ │ └── 64x512x512.py │ └── vae │ │ ├── inference │ │ ├── image.py │ │ └── video.py │ │ └── train │ │ ├── stage1.py │ │ ├── stage2.py │ │ └── stage3.py ├── coreml-export │ ├── stdit3 │ │ ├── export-stdit3.py │ │ ├── fps.pkl │ │ ├── mask.pkl │ │ └── y.pkl │ ├── t5 │ │ ├── configs │ │ │ └── T5BlockConfig.pkl │ │ ├── quantization.py │ │ ├── t5export.py │ │ └── y_embedding.pkl │ └── vae │ │ ├── export-vae-spatial.py │ │ └── export-vae-temporal.py ├── opensora │ ├── __init__.py │ ├── acceleration │ │ ├── __init__.py │ │ ├── checkpoint.py │ │ ├── communications.py │ │ ├── parallel_states.py │ │ ├── plugin.py │ │ └── shardformer │ │ │ ├── __init__.py │ │ │ ├── modeling │ │ │ ├── __init__.py │ │ │ └── t5.py │ │ │ └── policy │ │ │ ├── __init__.py │ │ │ └── t5_encoder.py │ ├── datasets │ │ ├── __init__.py │ │ ├── aspect.py │ │ ├── bucket.py │ │ ├── dataloader.py │ │ ├── datasets.py │ │ ├── read_video.py │ │ ├── sampler.py │ │ ├── utils.py │ │ └── video_transforms.py │ ├── models │ │ ├── __init__.py │ │ ├── dit │ │ │ ├── __init__.py │ │ │ └── dit.py │ │ ├── latte │ │ │ ├── __init__.py │ │ │ └── latte.py │ │ ├── layers │ │ │ ├── __init__.py │ │ │ ├── blocks.py │ │ │ └── supplementary.py │ │ ├── pixart │ │ │ ├── __init__.py │ │ │ ├── pixart.py │ │ │ └── pixart_sigma.py │ │ ├── stdit │ │ │ ├── __init__.py │ │ │ ├── rotary_embeddings.py │ │ │ ├── stdit.py │ │ │ ├── stdit2.py │ │ │ └── stdit3.py │ │ ├── text_encoder │ │ │ ├── __init__.py │ │ │ ├── classes.py │ │ │ ├── clip.py │ │ │ └── t5.py │ │ └── vae │ │ │ ├── __init__.py │ │ │ ├── autoencoder_kl.py │ │ │ ├── discriminator.py │ │ │ ├── losses.py │ │ │ ├── lpips.py │ │ │ ├── utils.py │ │ │ ├── vae.py │ │ │ ├── vae_spatial.py │ │ │ └── vae_temporal.py │ ├── registry.py │ ├── schedulers │ │ ├── __init__.py │ │ ├── dpms │ │ │ ├── __init__.py │ │ │ └── dpm_solver.py │ │ ├── iddpm │ │ │ ├── __init__.py │ │ │ ├── diffusion_utils.py │ │ │ ├── gaussian_diffusion.py │ │ │ ├── respace.py │ │ │ ├── speed.py │ │ │ └── 
timestep_sampler.py │ │ └── rf │ │ │ ├── __init__.py │ │ │ └── rectified_flow.py │ └── utils │ │ ├── __init__.py │ │ ├── ckpt_utils.py │ │ ├── config_utils.py │ │ ├── inference_utils.py │ │ ├── lr_scheduler.py │ │ ├── misc.py │ │ └── train_utils.py ├── requirements │ ├── requirements-convert.txt │ ├── requirements-cu121.txt │ ├── requirements-data.txt │ ├── requirements-eval.txt │ ├── requirements-pllava.txt │ ├── requirements-vae.txt │ └── requirements.txt └── setup.py ├── Figures ├── On-device-Sora-Example1.gif └── overview.jpg ├── LICENSE ├── Modded_Open_Sora ├── Dockerfile ├── LICENSE ├── README.md ├── assets │ ├── demo │ │ ├── sample_16s_224x448.gif │ │ ├── sample_16s_320x320.gif │ │ ├── sample_16x240x426_9.gif │ │ ├── sample_32x240x426_7.gif │ │ ├── sample_32x480x854_9.gif │ │ ├── sora_16x240x426_26.gif │ │ ├── sora_16x240x426_27.gif │ │ ├── sora_16x240x426_40.gif │ │ ├── sora_16x426x240_24.gif │ │ ├── sora_16x426x240_3.gif │ │ └── v1.2 │ │ │ ├── sample_0002.gif │ │ │ ├── sample_0004.gif │ │ │ ├── sample_0011.gif │ │ │ ├── sample_0013.gif │ │ │ ├── sample_0052.gif │ │ │ ├── sample_0061.gif │ │ │ ├── sample_0087.gif │ │ │ ├── sample_1718.gif │ │ │ └── sample_1719.gif │ ├── images │ │ ├── condition │ │ │ ├── cactus-happy.png │ │ │ ├── cactus-sad.png │ │ │ ├── cliff.png │ │ │ ├── ship.png │ │ │ ├── sunset1.png │ │ │ ├── sunset2.png │ │ │ └── wave.png │ │ ├── imagenet │ │ │ ├── train │ │ │ │ └── n01440764 │ │ │ │ │ └── n01440764_10026.JPEG │ │ │ └── val │ │ │ │ └── n01440764 │ │ │ │ └── ILSVRC2012_val_00000293.JPEG │ │ ├── ocr │ │ │ ├── demo_text_det.jpg │ │ │ ├── demo_text_ocr.jpg │ │ │ └── demo_text_recog.jpg │ │ └── watermark │ │ │ └── watermark.png │ ├── readme │ │ ├── colossal_ai.png │ │ ├── gradio_advanced.png │ │ ├── gradio_basic.png │ │ ├── gradio_option.png │ │ ├── icon.png │ │ ├── llava_vs_pllava_sample.gif │ │ ├── report-03_actions_count.png │ │ ├── report-03_objects_count.png │ │ ├── report-03_video_stats.png │ │ ├── report_3d_vae.png │ │ ├── report_arch.jpg │ │ ├── report_arch_comp.png │ │ ├── report_bucket.png │ │ ├── report_caption.png │ │ ├── report_data_pipeline.png │ │ ├── report_image_textlen.png │ │ ├── report_loss_curve_1.png │ │ ├── report_loss_curve_2.png │ │ ├── report_loss_curve_3.png │ │ ├── report_mask.png │ │ ├── report_mask_config.png │ │ ├── report_val_loss.png │ │ ├── report_vbench_score.png │ │ ├── report_vid_val_loss.png │ │ ├── report_video_duration.png │ │ ├── report_video_textlen.png │ │ ├── sample_0.gif │ │ ├── sample_1.gif │ │ ├── sample_2.gif │ │ ├── sample_3.gif │ │ ├── sample_4.gif │ │ ├── sample_5.gif │ │ └── sequence_parallelism.jpeg │ └── texts │ │ ├── VBench │ │ ├── all_category.txt │ │ ├── all_dimension.txt │ │ ├── all_i2v.txt │ │ ├── prompts_per_category │ │ │ ├── animal.txt │ │ │ ├── architecture.txt │ │ │ ├── food.txt │ │ │ ├── human.txt │ │ │ ├── lifestyle.txt │ │ │ ├── plant.txt │ │ │ ├── scenery.txt │ │ │ └── vehicles.txt │ │ └── prompts_per_dimension │ │ │ ├── appearance_style.txt │ │ │ ├── color.txt │ │ │ ├── human_action.txt │ │ │ ├── multiple_objects.txt │ │ │ ├── object_class.txt │ │ │ ├── overall_consistency.txt │ │ │ ├── scene.txt │ │ │ ├── spatial_relationship.txt │ │ │ ├── subject_consistency.txt │ │ │ ├── temporal_flickering.txt │ │ │ └── temporal_style.txt │ │ ├── imagenet_id.txt │ │ ├── imagenet_labels.txt │ │ ├── t2i_samples.txt │ │ ├── t2i_sigma.txt │ │ ├── t2v_car.txt │ │ ├── t2v_latte.txt │ │ ├── t2v_pllava.txt │ │ ├── t2v_ref.txt │ │ ├── t2v_samples.txt │ │ ├── t2v_short.txt │ │ ├── t2v_sora.txt │ │ ├── ucf101_id.txt │ 
│ └── ucf101_labels.txt ├── configs │ ├── dit │ │ ├── inference │ │ │ ├── 16x256x256.py │ │ │ ├── 1x256x256-class.py │ │ │ └── 1x256x256.py │ │ └── train │ │ │ ├── 16x256x256.py │ │ │ └── 1x256x256.py │ ├── latte │ │ ├── inference │ │ │ ├── 16x256x256-class.py │ │ │ └── 16x256x256.py │ │ └── train │ │ │ └── 16x256x256.py │ ├── opensora-v1-1 │ │ ├── inference │ │ │ ├── sample-ref.py │ │ │ └── sample.py │ │ └── train │ │ │ ├── benchmark.py │ │ │ ├── image.py │ │ │ ├── image_rflow.py │ │ │ ├── stage1.py │ │ │ ├── stage2.py │ │ │ ├── stage3.py │ │ │ └── video.py │ ├── opensora-v1-2 │ │ ├── inference │ │ │ ├── sample.py │ │ │ ├── test_config.py │ │ │ └── vae_test.py │ │ ├── misc │ │ │ ├── bs.py │ │ │ ├── eval_loss.py │ │ │ ├── extract.py │ │ │ └── feat.py │ │ └── train │ │ │ ├── adapt.py │ │ │ ├── demo_360p.py │ │ │ ├── demo_480p.py │ │ │ ├── stage1.py │ │ │ ├── stage1_feat.py │ │ │ ├── stage2.py │ │ │ └── stage3.py │ ├── opensora │ │ ├── inference │ │ │ ├── 16x256x256.py │ │ │ ├── 16x512x512-rflow.py │ │ │ ├── 16x512x512.py │ │ │ └── 64x512x512.py │ │ └── train │ │ │ ├── 16x256x256-mask.py │ │ │ ├── 16x256x256-spee-rflow.py │ │ │ ├── 16x256x256-spee.py │ │ │ ├── 16x256x256.py │ │ │ ├── 16x512x512.py │ │ │ ├── 360x512x512.py │ │ │ ├── 64x512x512-sp.py │ │ │ └── 64x512x512.py │ ├── pixart │ │ ├── inference │ │ │ ├── 16x256x256.py │ │ │ ├── 1x1024MS.py │ │ │ ├── 1x20481B.py │ │ │ ├── 1x2048MS.py │ │ │ ├── 1x256x256.py │ │ │ ├── 1x512x512-rflow.py │ │ │ └── 1x512x512.py │ │ └── train │ │ │ ├── 16x256x256.py │ │ │ ├── 1x2048x2048.py │ │ │ ├── 1x512x512-rflow.py │ │ │ ├── 1x512x512.py │ │ │ └── 64x512x512.py │ └── vae │ │ ├── inference │ │ ├── image.py │ │ └── video.py │ │ └── train │ │ ├── stage1.py │ │ ├── stage2.py │ │ └── stage3.py ├── coreml-export │ ├── t5 │ │ ├── model │ │ │ ├── tokenizer.pth │ │ │ └── tokenizer_config.json │ │ ├── t5block.py │ │ ├── t5stack.py │ │ └── tokenizer.py │ └── vae │ │ └── vae.py ├── docs │ ├── acceleration.md │ ├── commands.md │ ├── config.md │ ├── data_processing.md │ ├── datasets.md │ ├── installation.md │ ├── report_01.md │ ├── report_02.md │ ├── report_03.md │ ├── structure.md │ ├── vae.md │ └── zh_CN │ │ ├── README.md │ │ ├── READMEv1.1.md │ │ ├── acceleration.md │ │ ├── commands.md │ │ ├── datasets.md │ │ ├── report_v1.md │ │ ├── report_v2.md │ │ ├── report_v3.md │ │ ├── structure.md │ │ └── vae.md ├── eval │ ├── README.md │ ├── human_eval │ │ ├── generate.sh │ │ └── launch.sh │ ├── loss │ │ ├── eval_loss.py │ │ ├── launch.sh │ │ └── tabulate_rl_loss.py │ ├── sample.sh │ ├── vae │ │ ├── cal_flolpips.py │ │ ├── cal_lpips.py │ │ ├── cal_psnr.py │ │ ├── cal_ssim.py │ │ ├── eval_common_metric.py │ │ ├── flolpips │ │ │ ├── correlation │ │ │ │ └── correlation.py │ │ │ ├── flolpips.py │ │ │ ├── pretrained_networks.py │ │ │ ├── pwcnet.py │ │ │ └── utils.py │ │ └── script │ │ │ └── eval.sh │ ├── vbench │ │ ├── VBench_full_info.json │ │ ├── calc_vbench.py │ │ ├── launch.sh │ │ ├── launch_calc.sh │ │ └── tabulate_vbench_scores.py │ └── vbench_i2v │ │ ├── calc_vbench_i2v.py │ │ ├── json_to_txt.py │ │ ├── launch.sh │ │ ├── launch_calc.sh │ │ ├── tabulate_vbench_i2v_scores.py │ │ └── vbench2_i2v_full_info.json ├── gradio │ ├── README.md │ ├── app.py │ └── requirements.txt ├── notebooks │ ├── inference.ipynb │ └── launch.ipynb ├── opensora │ ├── __init__.py │ ├── acceleration │ │ ├── __init__.py │ │ ├── checkpoint.py │ │ ├── communications.py │ │ ├── parallel_states.py │ │ ├── plugin.py │ │ └── shardformer │ │ │ ├── __init__.py │ │ │ ├── modeling │ │ │ ├── __init__.py │ │ │ 
└── t5.py │ │ │ └── policy │ │ │ ├── __init__.py │ │ │ └── t5_encoder.py │ ├── datasets │ │ ├── __init__.py │ │ ├── aspect.py │ │ ├── bucket.py │ │ ├── dataloader.py │ │ ├── datasets.py │ │ ├── read_video.py │ │ ├── sampler.py │ │ ├── utils.py │ │ └── video_transforms.py │ ├── models │ │ ├── __init__.py │ │ ├── dit │ │ │ ├── __init__.py │ │ │ └── dit.py │ │ ├── latte │ │ │ ├── __init__.py │ │ │ └── latte.py │ │ ├── layers │ │ │ ├── __init__.py │ │ │ ├── blocks.py │ │ │ └── supplementary.py │ │ ├── pixart │ │ │ ├── __init__.py │ │ │ ├── pixart.py │ │ │ └── pixart_sigma.py │ │ ├── stdit │ │ │ ├── __init__.py │ │ │ ├── merge.py │ │ │ ├── rotary_embeddings.py │ │ │ ├── stdit.py │ │ │ ├── stdit2.py │ │ │ ├── stdit3.py │ │ │ └── utils.py │ │ ├── stdit_origin │ │ │ ├── __init__.py │ │ │ ├── rotary_embeddings.py │ │ │ ├── stdit.py │ │ │ ├── stdit2.py │ │ │ └── stdit3.py │ │ ├── text_encoder │ │ │ ├── __init__.py │ │ │ ├── classes.py │ │ │ ├── clip.py │ │ │ └── t5.py │ │ └── vae │ │ │ ├── __init__.py │ │ │ ├── discriminator.py │ │ │ ├── losses.py │ │ │ ├── lpips.py │ │ │ ├── utils.py │ │ │ ├── vae.py │ │ │ └── vae_temporal.py │ ├── registry.py │ ├── schedulers │ │ ├── __init__.py │ │ ├── dpms │ │ │ ├── __init__.py │ │ │ └── dpm_solver.py │ │ ├── iddpm │ │ │ ├── __init__.py │ │ │ ├── diffusion_utils.py │ │ │ ├── gaussian_diffusion.py │ │ │ ├── respace.py │ │ │ ├── speed.py │ │ │ └── timestep_sampler.py │ │ └── rf │ │ │ ├── __init__.py │ │ │ └── rectified_flow.py │ └── utils │ │ ├── __init__.py │ │ ├── ckpt_utils.py │ │ ├── config_utils.py │ │ ├── inference_utils.py │ │ ├── lr_scheduler.py │ │ ├── misc.py │ │ └── train_utils.py ├── requirements │ ├── requirements-cu121.txt │ ├── requirements-data.txt │ ├── requirements-eval.txt │ ├── requirements-pllava.txt │ ├── requirements-vae.txt │ └── requirements.txt ├── run.sh ├── scripts │ ├── inference.py │ ├── inference_vae.py │ ├── misc │ │ ├── extract_feat.py │ │ ├── launch_extract_feat.sh │ │ ├── launch_search_bs.sh │ │ ├── profile_train.py │ │ └── search_bs.py │ ├── train.py │ ├── train_vae.py │ └── vae_check.py ├── setup.py ├── tests │ ├── test_attn.py │ ├── test_lr_scheduler.py │ ├── test_np_torch.py │ ├── test_pos_emb.py │ ├── test_seq_parallel_attention.py │ ├── test_stdit3_sequence_parallelism.py │ └── test_t5_shardformer.py └── tools │ ├── __init__.py │ ├── caption │ ├── README.md │ ├── __init__.py │ ├── acceleration │ │ ├── __init__.py │ │ └── llava │ │ │ ├── __init__.py │ │ │ └── policies │ │ │ ├── __init__.py │ │ │ ├── llama.py │ │ │ └── mistral.py │ ├── camera_motion │ │ ├── __init__.py │ │ ├── camera_motion.py │ │ ├── detect.py │ │ ├── requirements.txt │ │ ├── utils.py │ │ └── visualizer.py │ ├── camera_motion_detect.py │ ├── caption_gpt4.py │ ├── caption_llama3.py │ ├── caption_llava.py │ ├── pllava_dir │ │ └── caption_pllava.py │ └── utils.py │ ├── datasets │ ├── README.md │ ├── __init__.py │ ├── analyze.py │ ├── convert.py │ ├── datautil.py │ ├── filter_panda10m.py │ ├── split.py │ ├── transform.py │ └── utils.py │ ├── frame_interpolation │ ├── README.md │ ├── __init__.py │ ├── interpolation.py │ ├── networks │ │ ├── __init__.py │ │ ├── amt_g.py │ │ └── blocks │ │ │ ├── __init__.py │ │ │ ├── feat_enc.py │ │ │ ├── ifrnet.py │ │ │ ├── multi_flow.py │ │ │ └── raft.py │ └── utils │ │ ├── __init__.py │ │ ├── dist_utils.py │ │ ├── flow_utils.py │ │ └── utils.py │ ├── scene_cut │ ├── README.md │ ├── __init__.py │ ├── convert_id_to_path.py │ ├── cut.py │ └── scene_detect.py │ └── scoring │ ├── README.md │ ├── __init__.py │ ├── aesthetic │ ├── 
__init__.py │ └── inference.py │ ├── matching │ ├── __init__.py │ └── inference.py │ ├── ocr │ ├── __init__.py │ ├── dbnetpp.py │ └── inference.py │ └── optical_flow │ ├── __init__.py │ ├── inference.py │ └── unimatch │ ├── __init__.py │ ├── attention.py │ ├── backbone.py │ ├── geometry.py │ ├── matching.py │ ├── position.py │ ├── reg_refine.py │ ├── transformer.py │ ├── trident_conv.py │ ├── unimatch.py │ └── utils.py ├── On-device ├── On-device-Sora.xcodeproj │ ├── project.pbxproj │ └── project.xcworkspace │ │ ├── contents.xcworkspacedata │ │ └── xcshareddata │ │ └── swiftpm │ │ └── Package.resolved ├── On-device-Sora │ ├── Assets.xcassets │ │ ├── AccentColor.colorset │ │ │ └── Contents.json │ │ ├── AppIcon.appiconset │ │ │ └── Contents.json │ │ └── Contents.json │ ├── ContentView.swift │ ├── ManagedMLModel.swift │ ├── MemoryInfo.swift │ ├── On_device_SoraApp.swift │ ├── Preview Content │ │ └── Preview Assets.xcassets │ │ │ └── Contents.json │ ├── RFLOW.swift │ ├── RFlowScheduler.swift │ ├── STDiT.swift │ ├── SoraPipeline.swift │ ├── T5Tokenizer.swift │ ├── Tensor2Vid.swift │ ├── TextEncoding.swift │ ├── VAEDecoder.swift │ ├── VideoPlayerView.swift │ ├── tokenizer.json │ └── tokenizer_config.json └── README.md └── README.md /Device_conversion/README.md: -------------------------------------------------------------------------------- 1 | ## How to convert each model to MLPackage for On-device Sora 2 | 3 | ## Package Dependencies 4 | 5 | ### Dependency 6 | ``` 7 | conda create -n convert python=3.10 8 | 9 | conda activate convert 10 | 11 | pip install -r requirements/requirements-convert.txt 12 | 13 | pip install -v . 14 | ``` 15 | 16 | ## Converting 17 | 18 | ### T5 Converting 19 | ``` 20 | cd t5 21 | python3 export-t5.py 22 | ``` 23 | 24 | ### STDiT Converting 25 | ``` 26 | cd stdit3 27 | python3 export-stdit3.py 28 | ``` 29 | 30 | ### VAE Converting 31 | When you run `export-vae-spatial.py`, you may hit a `Fatal Python error: PyEval_SaveThread` crash. 32 | To work around it, run only one conversion block per VAE part at a time and comment out the rest (a minimal sketch of this pattern is shown below, followed by the actual commands).
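The sketch below illustrates the "one block per run" pattern, assuming the usual coremltools trace-and-convert flow; `TinyDecoder`, the tensor shapes, and the output names are placeholders and not the actual contents of `export-vae-spatial.py`.

```
# Minimal sketch (not the real export script): convert one VAE part per process run
# and keep the other conversion calls commented out to avoid the crash.
import torch
import torch.nn as nn
import coremltools as ct

class TinyDecoder(nn.Module):
    """Placeholder standing in for one spatial-VAE part."""
    def __init__(self):
        super().__init__()
        self.net = nn.Conv2d(4, 3, kernel_size=3, padding=1)

    def forward(self, z):
        return self.net(z)

def convert_and_save(module, example, name):
    module.eval()
    traced = torch.jit.trace(module, example)
    mlmodel = ct.convert(
        traced,
        inputs=[ct.TensorType(name="z", shape=example.shape)],
        convert_to="mlprogram",
    )
    mlmodel.save(f"{name}.mlpackage")

# Run exactly one conversion per invocation; re-enable the others one at a time.
convert_and_save(TinyDecoder(), torch.randn(1, 4, 32, 32), "vae-spatial-decoder")
# convert_and_save(TinyEncoder(), torch.randn(1, 3, 256, 256), "vae-spatial-encoder")
```

The real export scripts are invoked with the commands that follow.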
33 | 34 | ``` 35 | cd vae 36 | 37 | # for vae's temporal part 38 | python3 export-vae-temporal.py 39 | 40 | # for vae's spatial part 41 | python3 export-vae-spatial.py 42 | ``` 43 | -------------------------------------------------------------------------------- /Device_conversion/configs/dit/inference/16x256x256.py: -------------------------------------------------------------------------------- 1 | num_frames = 16 2 | fps = 8 3 | image_size = (256, 256) 4 | 5 | # Define model 6 | model = dict( 7 | type="DiT-XL/2", 8 | condition="text", 9 | from_pretrained="PRETRAINED_MODEL", 10 | ) 11 | vae = dict( 12 | type="VideoAutoencoderKL", 13 | from_pretrained="stabilityai/sd-vae-ft-ema", 14 | ) 15 | text_encoder = dict( 16 | type="clip", 17 | from_pretrained="openai/clip-vit-base-patch32", 18 | model_max_length=77, 19 | ) 20 | scheduler = dict( 21 | type="dpm-solver", 22 | num_sampling_steps=20, 23 | cfg_scale=4.0, 24 | ) 25 | dtype = "bf16" 26 | 27 | # Others 28 | batch_size = 2 29 | seed = 42 30 | prompt_path = "./assets/texts/ucf101_labels.txt" 31 | save_dir = "./samples/samples/" 32 | -------------------------------------------------------------------------------- /Device_conversion/configs/dit/inference/1x256x256-class.py: -------------------------------------------------------------------------------- 1 | num_frames = 1 2 | fps = 1 3 | image_size = (256, 256) 4 | 5 | # Define model 6 | model = dict( 7 | type="DiT-XL/2", 8 | no_temporal_pos_emb=True, 9 | condition="label_1000", 10 | from_pretrained="DiT-XL-2-256x256.pt", 11 | ) 12 | vae = dict( 13 | type="VideoAutoencoderKL", 14 | from_pretrained="stabilityai/sd-vae-ft-ema", 15 | ) 16 | text_encoder = dict( 17 | type="classes", 18 | num_classes=1000, 19 | ) 20 | scheduler = dict( 21 | type="dpm-solver", 22 | num_sampling_steps=20, 23 | cfg_scale=4.0, 24 | ) 25 | dtype = "bf16" 26 | 27 | # Others 28 | batch_size = 2 29 | seed = 42 30 | prompt_path = "./assets/texts/imagenet_id.txt" 31 | save_dir = "./samples/samples/" 32 | -------------------------------------------------------------------------------- /Device_conversion/configs/dit/inference/1x256x256.py: -------------------------------------------------------------------------------- 1 | num_frames = 1 2 | fps = 1 3 | image_size = (256, 256) 4 | 5 | # Define model 6 | model = dict( 7 | type="DiT-XL/2", 8 | no_temporal_pos_emb=True, 9 | condition="text", 10 | from_pretrained="PRETRAINED_MODEL", 11 | ) 12 | vae = dict( 13 | type="VideoAutoencoderKL", 14 | from_pretrained="stabilityai/sd-vae-ft-ema", 15 | ) 16 | text_encoder = dict( 17 | type="clip", 18 | from_pretrained="openai/clip-vit-base-patch32", 19 | model_max_length=77, 20 | ) 21 | scheduler = dict( 22 | type="dpm-solver", 23 | num_sampling_steps=20, 24 | cfg_scale=4.0, 25 | ) 26 | dtype = "bf16" 27 | 28 | # Others 29 | batch_size = 2 30 | seed = 42 31 | prompt_path = "./assets/texts/imagenet_labels.txt" 32 | save_dir = "./samples/samples/" 33 | -------------------------------------------------------------------------------- /Device_conversion/configs/dit/train/16x256x256.py: -------------------------------------------------------------------------------- 1 | # Define dataset 2 | dataset = dict( 3 | type="VideoTextDataset", 4 | data_path=None, 5 | num_frames=16, 6 | frame_interval=3, 7 | image_size=(256, 256), 8 | ) 9 | 10 | # Define acceleration 11 | num_workers = 4 12 | dtype = "bf16" 13 | grad_checkpoint = True 14 | plugin = "zero2" 15 | sp_size = 1 16 | 17 | # Define model 18 | model = dict( 19 | type="DiT-XL/2", 20 | 
from_pretrained="DiT-XL-2-256x256.pt", 21 | enable_flash_attn=True, 22 | enable_layernorm_kernel=True, 23 | ) 24 | vae = dict( 25 | type="VideoAutoencoderKL", 26 | from_pretrained="stabilityai/sd-vae-ft-ema", 27 | ) 28 | text_encoder = dict( 29 | type="clip", 30 | from_pretrained="openai/clip-vit-base-patch32", 31 | model_max_length=77, 32 | ) 33 | scheduler = dict( 34 | type="iddpm", 35 | timestep_respacing="", 36 | ) 37 | 38 | # Others 39 | seed = 42 40 | outputs = "outputs" 41 | wandb = False 42 | 43 | epochs = 1000 44 | log_every = 10 45 | ckpt_every = 1000 46 | load = None 47 | 48 | batch_size = 8 49 | lr = 2e-5 50 | grad_clip = 1.0 51 | -------------------------------------------------------------------------------- /Device_conversion/configs/dit/train/1x256x256.py: -------------------------------------------------------------------------------- 1 | # Define dataset 2 | dataset = dict( 3 | type="VideoTextDataset", 4 | data_path=None, 5 | num_frames=1, 6 | frame_interval=1, 7 | image_size=(256, 256), 8 | transform_name="center", 9 | ) 10 | 11 | # Define acceleration 12 | num_workers = 4 13 | dtype = "bf16" 14 | grad_checkpoint = False 15 | plugin = "zero2" 16 | sp_size = 1 17 | 18 | # Define model 19 | model = dict( 20 | type="DiT-XL/2", 21 | no_temporal_pos_emb=True, 22 | enable_flash_attn=True, 23 | enable_layernorm_kernel=True, 24 | ) 25 | vae = dict( 26 | type="VideoAutoencoderKL", 27 | from_pretrained="stabilityai/sd-vae-ft-ema", 28 | ) 29 | text_encoder = dict( 30 | type="clip", 31 | from_pretrained="openai/clip-vit-base-patch32", 32 | model_max_length=77, 33 | ) 34 | scheduler = dict( 35 | type="iddpm", 36 | timestep_respacing="", 37 | ) 38 | 39 | # Others 40 | seed = 42 41 | outputs = "outputs" 42 | wandb = False 43 | 44 | epochs = 1000 45 | log_every = 10 46 | ckpt_every = 1000 47 | load = None 48 | 49 | batch_size = 128 50 | lr = 1e-4 # according to DiT repo 51 | grad_clip = 1.0 52 | -------------------------------------------------------------------------------- /Device_conversion/configs/latte/inference/16x256x256-class.py: -------------------------------------------------------------------------------- 1 | num_frames = 16 2 | fps = 8 3 | image_size = (256, 256) 4 | 5 | # Define model 6 | model = dict( 7 | type="Latte-XL/2", 8 | condition="label_101", 9 | from_pretrained="Latte-XL-2-256x256-ucf101.pt", 10 | ) 11 | vae = dict( 12 | type="VideoAutoencoderKL", 13 | from_pretrained="stabilityai/sd-vae-ft-ema", 14 | ) 15 | text_encoder = dict( 16 | type="classes", 17 | num_classes=101, 18 | ) 19 | scheduler = dict( 20 | type="dpm-solver", 21 | num_sampling_steps=20, 22 | cfg_scale=4.0, 23 | ) 24 | dtype = "bf16" 25 | 26 | # Others 27 | batch_size = 2 28 | seed = 42 29 | prompt_path = "./assets/texts/ucf101_id.txt" 30 | save_dir = "./samples/samples/" 31 | -------------------------------------------------------------------------------- /Device_conversion/configs/latte/inference/16x256x256.py: -------------------------------------------------------------------------------- 1 | num_frames = 16 2 | fps = 8 3 | image_size = (256, 256) 4 | 5 | # Define model 6 | model = dict( 7 | type="Latte-XL/2", 8 | condition="text", 9 | from_pretrained="PRETRAINED_MODEL", 10 | ) 11 | vae = dict( 12 | type="VideoAutoencoderKL", 13 | from_pretrained="stabilityai/sd-vae-ft-ema", 14 | ) 15 | text_encoder = dict( 16 | type="clip", 17 | from_pretrained="openai/clip-vit-base-patch32", 18 | model_max_length=77, 19 | ) 20 | scheduler = dict( 21 | type="dpm-solver", 22 | num_sampling_steps=20, 23 | 
cfg_scale=4.0, 24 | ) 25 | dtype = "bf16" 26 | 27 | # Others 28 | batch_size = 2 29 | seed = 42 30 | prompt_path = "./assets/texts/ucf101_labels.txt" 31 | save_dir = "./samples/samples/" 32 | -------------------------------------------------------------------------------- /Device_conversion/configs/latte/train/16x256x256.py: -------------------------------------------------------------------------------- 1 | # Define dataset 2 | dataset = dict( 3 | type="VideoTextDataset", 4 | data_path=None, 5 | num_frames=16, 6 | frame_interval=3, 7 | image_size=(256, 256), 8 | ) 9 | 10 | # Define acceleration 11 | num_workers = 4 12 | dtype = "bf16" 13 | grad_checkpoint = True 14 | plugin = "zero2" 15 | sp_size = 1 16 | 17 | # Define model 18 | model = dict( 19 | type="Latte-XL/2", 20 | enable_flash_attn=True, 21 | enable_layernorm_kernel=True, 22 | ) 23 | vae = dict( 24 | type="VideoAutoencoderKL", 25 | from_pretrained="stabilityai/sd-vae-ft-ema", 26 | ) 27 | text_encoder = dict( 28 | type="clip", 29 | from_pretrained="openai/clip-vit-base-patch32", 30 | model_max_length=77, 31 | ) 32 | scheduler = dict( 33 | type="iddpm", 34 | timestep_respacing="", 35 | ) 36 | 37 | # Others 38 | seed = 42 39 | outputs = "outputs" 40 | wandb = False 41 | 42 | epochs = 1000 43 | log_every = 10 44 | ckpt_every = 1000 45 | load = None 46 | 47 | batch_size = 8 48 | lr = 2e-5 49 | grad_clip = 1.0 50 | -------------------------------------------------------------------------------- /Device_conversion/configs/opensora-v1-1/inference/sample.py: -------------------------------------------------------------------------------- 1 | num_frames = 16 2 | frame_interval = 3 3 | fps = 24 4 | image_size = (240, 426) 5 | multi_resolution = "STDiT2" 6 | 7 | # Define model 8 | model = dict( 9 | type="STDiT2-XL/2", 10 | from_pretrained="hpcai-tech/OpenSora-STDiT-v2-stage3", 11 | input_sq_size=512, 12 | qk_norm=True, 13 | qk_norm_legacy=True, 14 | enable_flash_attn=True, 15 | enable_layernorm_kernel=True, 16 | ) 17 | vae = dict( 18 | type="VideoAutoencoderKL", 19 | from_pretrained="stabilityai/sd-vae-ft-ema", 20 | cache_dir=None, # "/mnt/hdd/cached_models", 21 | micro_batch_size=4, 22 | ) 23 | text_encoder = dict( 24 | type="t5", 25 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 26 | cache_dir=None, # "/mnt/hdd/cached_models", 27 | model_max_length=200, 28 | ) 29 | scheduler = dict( 30 | type="iddpm", 31 | num_sampling_steps=100, 32 | cfg_scale=7.0, 33 | cfg_channel=3, # or None 34 | ) 35 | dtype = "bf16" 36 | 37 | # Condition 38 | prompt_path = "./assets/texts/t2v_samples.txt" 39 | prompt = None # prompt has higher priority than prompt_path 40 | 41 | # Others 42 | batch_size = 1 43 | seed = 42 44 | save_dir = "./samples/samples/" 45 | -------------------------------------------------------------------------------- /Device_conversion/configs/opensora-v1-1/train/image.py: -------------------------------------------------------------------------------- 1 | # Define dataset 2 | dataset = dict( 3 | type="VariableVideoTextDataset", 4 | data_path=None, 5 | num_frames=None, 6 | frame_interval=3, 7 | image_size=(None, None), 8 | transform_name="resize_crop", 9 | ) 10 | bucket_config = { # 6s/it 11 | "256": {1: (1.0, 256)}, 12 | "512": {1: (1.0, 80)}, 13 | "480p": {1: (1.0, 52)}, 14 | "1024": {1: (1.0, 20)}, 15 | "1080p": {1: (1.0, 8)}, 16 | } 17 | 18 | # Define acceleration 19 | num_workers = 4 20 | num_bucket_build_workers = 16 21 | dtype = "bf16" 22 | grad_checkpoint = True 23 | plugin = "zero2" 24 | sp_size = 1 25 | 26 | # Define model 27 | model = 
dict( 28 | type="STDiT2-XL/2", 29 | from_pretrained=None, 30 | input_sq_size=512, # pretrained model is trained on 512x512 31 | qk_norm=True, 32 | qk_norm_legacy=True, 33 | enable_flash_attn=True, 34 | enable_layernorm_kernel=True, 35 | ) 36 | vae = dict( 37 | type="VideoAutoencoderKL", 38 | from_pretrained="stabilityai/sd-vae-ft-ema", 39 | micro_batch_size=4, 40 | local_files_only=True, 41 | ) 42 | text_encoder = dict( 43 | type="t5", 44 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 45 | model_max_length=200, 46 | shardformer=True, 47 | local_files_only=True, 48 | ) 49 | scheduler = dict( 50 | type="iddpm", 51 | timestep_respacing="", 52 | ) 53 | 54 | # Others 55 | seed = 42 56 | outputs = "outputs" 57 | wandb = False 58 | 59 | epochs = 1000 60 | log_every = 10 61 | ckpt_every = 500 62 | load = None 63 | 64 | batch_size = 10 # only for logging 65 | lr = 2e-5 66 | grad_clip = 1.0 67 | -------------------------------------------------------------------------------- /Device_conversion/configs/opensora-v1-1/train/video.py: -------------------------------------------------------------------------------- 1 | # Define dataset 2 | dataset = dict( 3 | type="VariableVideoTextDataset", 4 | data_path=None, 5 | num_frames=None, 6 | frame_interval=3, 7 | image_size=(None, None), 8 | transform_name="resize_crop", 9 | ) 10 | bucket_config = { # 6s/it 11 | "240p": {16: (1.0, 16), 32: (1.0, 8), 64: (1.0, 4), 128: (1.0, 2)}, 12 | "256": {1: (1.0, 256)}, 13 | "512": {1: (0.5, 80)}, 14 | "480p": {1: (0.4, 52), 16: (0.4, 4), 32: (0.0, None)}, 15 | "720p": {16: (0.1, 2), 32: (0.0, None)}, # No examples now 16 | "1024": {1: (0.3, 20)}, 17 | "1080p": {1: (0.3, 8)}, 18 | } 19 | 20 | # Define acceleration 21 | num_workers = 4 22 | num_bucket_build_workers = 16 23 | dtype = "bf16" 24 | grad_checkpoint = True 25 | plugin = "zero2" 26 | sp_size = 1 27 | 28 | # Define model 29 | model = dict( 30 | type="STDiT2-XL/2", 31 | from_pretrained=None, 32 | input_sq_size=512, # pretrained model is trained on 512x512 33 | qk_norm=True, 34 | qk_norm_legacy=True, 35 | enable_flash_attn=True, 36 | enable_layernorm_kernel=True, 37 | ) 38 | vae = dict( 39 | type="VideoAutoencoderKL", 40 | from_pretrained="stabilityai/sd-vae-ft-ema", 41 | micro_batch_size=4, 42 | local_files_only=True, 43 | ) 44 | text_encoder = dict( 45 | type="t5", 46 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 47 | model_max_length=200, 48 | shardformer=True, 49 | local_files_only=True, 50 | ) 51 | scheduler = dict( 52 | type="iddpm", 53 | timestep_respacing="", 54 | ) 55 | 56 | # Others 57 | seed = 42 58 | outputs = "outputs" 59 | wandb = False 60 | 61 | epochs = 1000 62 | log_every = 10 63 | ckpt_every = 500 64 | load = None 65 | 66 | batch_size = 10 # only for logging 67 | lr = 2e-5 68 | grad_clip = 1.0 69 | -------------------------------------------------------------------------------- /Device_conversion/configs/opensora-v1-2/inference/sample.py: -------------------------------------------------------------------------------- 1 | resolution = "240p" 2 | aspect_ratio = "9:16" 3 | num_frames = 51 4 | fps = 24 5 | frame_interval = 1 6 | save_fps = 24 7 | 8 | save_dir = "./samples/samples/" 9 | seed = 42 10 | batch_size = 1 11 | multi_resolution = "STDiT2" 12 | dtype = "bf16" 13 | condition_frame_length = 5 14 | align = 5 15 | 16 | model = dict( 17 | type="STDiT3-XL/2", 18 | from_pretrained="hpcai-tech/OpenSora-STDiT-v3", 19 | qk_norm=True, 20 | enable_flash_attn=True, 21 | enable_layernorm_kernel=True, 22 | force_huggingface=True, 23 | ) 24 | vae = dict( 25 | 
type="OpenSoraVAE_V1_2", 26 | from_pretrained="hpcai-tech/OpenSora-VAE-v1.2", 27 | micro_frame_size=17, 28 | micro_batch_size=4, 29 | force_huggingface=True, 30 | ) 31 | text_encoder = dict( 32 | type="t5", 33 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 34 | model_max_length=300, 35 | ) 36 | scheduler = dict( 37 | type="rflow", 38 | use_timestep_transform=True, 39 | num_sampling_steps=30, 40 | cfg_scale=7.0, 41 | ) 42 | 43 | aes = 6.5 44 | flow = None 45 | -------------------------------------------------------------------------------- /Device_conversion/configs/opensora-v1-2/misc/eval_loss.py: -------------------------------------------------------------------------------- 1 | num_workers = 8 2 | dtype = "bf16" 3 | seed = 42 4 | num_eval_timesteps = 10 5 | 6 | # Dataset settings 7 | dataset = dict( 8 | type="VariableVideoTextDataset", 9 | transform_name="resize_crop", 10 | ) 11 | 12 | bucket_config = { 13 | "144p": {1: (None, 100), 51: (None, 30), 102: (None, 20), 204: (None, 8), 408: (None, 4)}, 14 | # --- 15 | "240p": {1: (None, 100), 51: (None, 24), 102: (None, 12), 204: (None, 4), 408: (None, 2)}, 16 | # --- 17 | "360p": {1: (None, 60), 51: (None, 12), 102: (None, 6), 204: (None, 2), 408: (None, 1)}, 18 | # --- 19 | "480p": {1: (None, 40), 51: (None, 6), 102: (None, 3), 204: (None, 1)}, 20 | # --- 21 | "720p": {1: (None, 20), 51: (None, 2), 102: (None, 1)}, 22 | # --- 23 | "1080p": {1: (None, 10)}, 24 | # --- 25 | "2048": {1: (None, 5)}, 26 | } 27 | 28 | # Model settings 29 | model = dict( 30 | type="STDiT3-XL/2", 31 | from_pretrained=None, 32 | qk_norm=True, 33 | enable_flash_attn=True, 34 | enable_layernorm_kernel=True, 35 | ) 36 | vae = dict( 37 | type="OpenSoraVAE_V1_2", 38 | from_pretrained="hpcai-tech/OpenSora-VAE-v1.2", 39 | micro_frame_size=17, 40 | micro_batch_size=4, 41 | local_files_only=True, 42 | ) 43 | text_encoder = dict( 44 | type="t5", 45 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 46 | model_max_length=300, 47 | local_files_only=True, 48 | ) 49 | scheduler = dict(type="rflow") 50 | -------------------------------------------------------------------------------- /Device_conversion/configs/opensora-v1-2/misc/extract.py: -------------------------------------------------------------------------------- 1 | # Dataset settings 2 | dataset = dict( 3 | type="VariableVideoTextDataset", 4 | transform_name="resize_crop", 5 | ) 6 | 7 | # webvid 8 | bucket_config = { # 12s/it 9 | "144p": {1: (1.0, 475), 51: (1.0, 51), 102: ((1.0, 0.33), 27), 204: ((1.0, 0.1), 13), 408: ((1.0, 0.1), 6)}, 10 | # --- 11 | "256": {1: (0.4, 297), 51: (0.5, 20), 102: ((0.5, 0.33), 10), 204: ((0.5, 0.1), 5), 408: ((0.5, 0.1), 2)}, 12 | "240p": {1: (0.3, 297), 51: (0.4, 20), 102: ((0.4, 0.33), 10), 204: ((0.4, 0.1), 5), 408: ((0.4, 0.1), 2)}, 13 | # --- 14 | "360p": {1: (0.2, 141), 51: (0.15, 8), 102: ((0.15, 0.33), 4), 204: ((0.15, 0.1), 2), 408: ((0.15, 0.1), 1)}, 15 | "512": {1: (0.1, 141)}, 16 | # --- 17 | "480p": {1: (0.1, 89)}, 18 | # --- 19 | "720p": {1: (0.05, 36)}, 20 | "1024": {1: (0.05, 36)}, 21 | # --- 22 | "1080p": {1: (0.1, 5)}, 23 | # --- 24 | "2048": {1: (0.1, 5)}, 25 | } 26 | 27 | # Acceleration settings 28 | num_workers = 8 29 | num_bucket_build_workers = 16 30 | dtype = "bf16" 31 | seed = 42 32 | outputs = "outputs" 33 | wandb = False 34 | 35 | 36 | # Model settings 37 | model = dict( 38 | type="STDiT3-XL/2", 39 | from_pretrained="hpcai-tech/OpenSora-STDiT-v3", 40 | qk_norm=True, 41 | enable_flash_attn=True, 42 | enable_layernorm_kernel=True, 43 | ) 44 | vae = dict( 45 | 
type="OpenSoraVAE_V1_2", 46 | from_pretrained="hpcai-tech/OpenSora-VAE-v1.2", 47 | micro_frame_size=17, 48 | micro_batch_size=32, 49 | ) 50 | text_encoder = dict( 51 | type="t5", 52 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 53 | model_max_length=300, 54 | shardformer=True, 55 | local_files_only=True, 56 | ) 57 | 58 | # feature extraction settings 59 | save_text_features = True 60 | save_compressed_text_features = True 61 | bin_size = 250 # 1GB, 4195 bins 62 | log_time = False 63 | -------------------------------------------------------------------------------- /Device_conversion/configs/opensora-v1-2/train/demo_360p.py: -------------------------------------------------------------------------------- 1 | # Dataset settings 2 | dataset = dict( 3 | type="VariableVideoTextDataset", 4 | transform_name="resize_crop", 5 | ) 6 | 7 | # webvid 8 | bucket_config = {"360p": {102: (1.0, 5)}} 9 | grad_checkpoint = True 10 | 11 | # Acceleration settings 12 | num_workers = 8 13 | num_bucket_build_workers = 16 14 | dtype = "bf16" 15 | plugin = "zero2" 16 | 17 | # Model settings 18 | model = dict( 19 | type="STDiT3-XL/2", 20 | from_pretrained=None, 21 | qk_norm=True, 22 | enable_flash_attn=True, 23 | enable_layernorm_kernel=True, 24 | freeze_y_embedder=True, 25 | ) 26 | vae = dict( 27 | type="OpenSoraVAE_V1_2", 28 | from_pretrained="hpcai-tech/OpenSora-VAE-v1.2", 29 | micro_frame_size=17, 30 | micro_batch_size=4, 31 | ) 32 | text_encoder = dict( 33 | type="t5", 34 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 35 | model_max_length=300, 36 | shardformer=True, 37 | ) 38 | scheduler = dict( 39 | type="rflow", 40 | use_timestep_transform=True, 41 | sample_method="logit-normal", 42 | ) 43 | 44 | # Log settings 45 | seed = 42 46 | outputs = "outputs" 47 | wandb = False 48 | epochs = 1000 49 | log_every = 10 50 | ckpt_every = 200 51 | 52 | # optimization settings 53 | load = None 54 | grad_clip = 1.0 55 | lr = 1e-4 56 | ema_decay = 0.99 57 | adam_eps = 1e-15 58 | warmup_steps = 1000 59 | -------------------------------------------------------------------------------- /Device_conversion/configs/opensora-v1-2/train/demo_480p.py: -------------------------------------------------------------------------------- 1 | # Dataset settings 2 | dataset = dict( 3 | type="VariableVideoTextDataset", 4 | transform_name="resize_crop", 5 | ) 6 | 7 | # webvid 8 | bucket_config = {"480p": {51: (0.5, 5)}} 9 | grad_checkpoint = True 10 | 11 | # Acceleration settings 12 | num_workers = 8 13 | num_bucket_build_workers = 16 14 | dtype = "bf16" 15 | plugin = "zero2" 16 | 17 | # Model settings 18 | model = dict( 19 | type="STDiT3-XL/2", 20 | from_pretrained=None, 21 | qk_norm=True, 22 | enable_flash_attn=True, 23 | enable_layernorm_kernel=True, 24 | freeze_y_embedder=True, 25 | ) 26 | vae = dict( 27 | type="OpenSoraVAE_V1_2", 28 | from_pretrained="hpcai-tech/OpenSora-VAE-v1.2", 29 | micro_frame_size=17, 30 | micro_batch_size=4, 31 | ) 32 | text_encoder = dict( 33 | type="t5", 34 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 35 | model_max_length=300, 36 | shardformer=True, 37 | ) 38 | scheduler = dict( 39 | type="rflow", 40 | use_timestep_transform=True, 41 | sample_method="logit-normal", 42 | ) 43 | 44 | # Log settings 45 | seed = 42 46 | outputs = "outputs" 47 | wandb = False 48 | epochs = 1000 49 | log_every = 10 50 | ckpt_every = 200 51 | 52 | # optimization settings 53 | load = None 54 | grad_clip = 1.0 55 | lr = 1e-4 56 | ema_decay = 0.99 57 | adam_eps = 1e-15 58 | warmup_steps = 1000 59 | 
-------------------------------------------------------------------------------- /Device_conversion/configs/opensora-v1-2/train/stage1_feat.py: -------------------------------------------------------------------------------- 1 | # Dataset settings 2 | dataset = dict(type="BatchFeatureDataset") 3 | grad_checkpoint = True 4 | num_workers = 4 5 | 6 | # Acceleration settings 7 | dtype = "bf16" 8 | plugin = "zero2" 9 | 10 | # Model settings 11 | model = dict( 12 | type="STDiT3-XL/2", 13 | from_pretrained=None, 14 | qk_norm=True, 15 | enable_flash_attn=True, 16 | enable_layernorm_kernel=True, 17 | freeze_y_embedder=True, 18 | skip_y_embedder=True, 19 | ) 20 | scheduler = dict( 21 | type="rflow", 22 | use_timestep_transform=True, 23 | sample_method="logit-normal", 24 | ) 25 | 26 | vae_out_channels = 4 27 | model_max_length = 300 28 | text_encoder_output_dim = 4096 29 | load_video_features = True 30 | load_text_features = True 31 | 32 | # Mask settings 33 | mask_ratios = { 34 | "random": 0.2, 35 | "intepolate": 0.01, 36 | "quarter_random": 0.01, 37 | "quarter_head": 0.01, 38 | "quarter_tail": 0.01, 39 | "quarter_head_tail": 0.01, 40 | "image_random": 0.05, 41 | "image_head": 0.1, 42 | "image_tail": 0.05, 43 | "image_head_tail": 0.05, 44 | } 45 | 46 | # Log settings 47 | seed = 42 48 | outputs = "outputs" 49 | wandb = False 50 | epochs = 1000 51 | log_every = 10 52 | ckpt_every = 500 53 | 54 | # optimization settings 55 | load = None 56 | grad_clip = 1.0 57 | lr = 2e-4 58 | ema_decay = 0.99 59 | adam_eps = 1e-15 60 | -------------------------------------------------------------------------------- /Device_conversion/configs/opensora/inference/16x256x256.py: -------------------------------------------------------------------------------- 1 | num_frames = 16 2 | fps = 24 // 3 3 | image_size = (256, 256) 4 | 5 | # Define model 6 | model = dict( 7 | type="STDiT-XL/2", 8 | space_scale=0.5, 9 | time_scale=1.0, 10 | enable_flash_attn=True, 11 | enable_layernorm_kernel=True, 12 | from_pretrained="PRETRAINED_MODEL", 13 | ) 14 | vae = dict( 15 | type="VideoAutoencoderKL", 16 | from_pretrained="stabilityai/sd-vae-ft-ema", 17 | micro_batch_size=4, 18 | ) 19 | text_encoder = dict( 20 | type="t5", 21 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 22 | model_max_length=120, 23 | ) 24 | scheduler = dict( 25 | type="iddpm", 26 | num_sampling_steps=100, 27 | cfg_scale=7.0, 28 | cfg_channel=3, # or None 29 | ) 30 | dtype = "bf16" 31 | 32 | # Condition 33 | prompt_path = "./assets/texts/t2v_samples.txt" 34 | prompt = None # prompt has higher priority than prompt_path 35 | 36 | # Others 37 | batch_size = 1 38 | seed = 42 39 | save_dir = "./samples/samples/" 40 | -------------------------------------------------------------------------------- /Device_conversion/configs/opensora/inference/16x512x512-rflow.py: -------------------------------------------------------------------------------- 1 | num_frames = 16 2 | fps = 24 // 3 3 | image_size = (512, 512) 4 | 5 | # Define model 6 | model = dict( 7 | type="STDiT-XL/2", 8 | space_scale=1.0, 9 | time_scale=1.0, 10 | enable_flash_attn=True, 11 | enable_layernorm_kernel=True, 12 | from_pretrained="PRETRAINED_MODEL", 13 | ) 14 | vae = dict( 15 | type="VideoAutoencoderKL", 16 | from_pretrained="stabilityai/sd-vae-ft-ema", 17 | micro_batch_size=2, 18 | ) 19 | text_encoder = dict( 20 | type="t5", 21 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 22 | model_max_length=120, 23 | ) 24 | scheduler = dict( 25 | type="rflow", 26 | num_sampling_steps=10, 27 | cfg_scale=7.0, 28 | ) 29 | dtype = 
"bf16" 30 | 31 | # Others 32 | batch_size = 2 33 | seed = 42 34 | prompt_path = "./assets/texts/t2v_samples.txt" 35 | save_dir = "./outputs/samples/" 36 | -------------------------------------------------------------------------------- /Device_conversion/configs/opensora/inference/16x512x512.py: -------------------------------------------------------------------------------- 1 | num_frames = 16 2 | fps = 24 // 3 3 | image_size = (512, 512) 4 | 5 | # Define model 6 | model = dict( 7 | type="STDiT-XL/2", 8 | space_scale=1.0, 9 | time_scale=1.0, 10 | enable_flash_attn=True, 11 | enable_layernorm_kernel=True, 12 | from_pretrained="PRETRAINED_MODEL", 13 | ) 14 | vae = dict( 15 | type="VideoAutoencoderKL", 16 | from_pretrained="stabilityai/sd-vae-ft-ema", 17 | micro_batch_size=2, 18 | ) 19 | text_encoder = dict( 20 | type="t5", 21 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 22 | model_max_length=120, 23 | ) 24 | scheduler = dict( 25 | type="iddpm", 26 | num_sampling_steps=100, 27 | cfg_scale=7.0, 28 | ) 29 | dtype = "bf16" 30 | 31 | # Others 32 | batch_size = 2 33 | seed = 42 34 | prompt_path = "./assets/texts/t2v_samples.txt" 35 | save_dir = "./samples/samples/" 36 | -------------------------------------------------------------------------------- /Device_conversion/configs/opensora/inference/64x512x512.py: -------------------------------------------------------------------------------- 1 | num_frames = 64 2 | fps = 24 // 2 3 | image_size = (512, 512) 4 | 5 | # Define model 6 | model = dict( 7 | type="STDiT-XL/2", 8 | space_scale=1.0, 9 | time_scale=2 / 3, 10 | enable_flash_attn=True, 11 | enable_layernorm_kernel=True, 12 | from_pretrained="PRETRAINED_MODEL", 13 | ) 14 | vae = dict( 15 | type="VideoAutoencoderKL", 16 | from_pretrained="stabilityai/sd-vae-ft-ema", 17 | micro_batch_size=128, 18 | ) 19 | text_encoder = dict( 20 | type="t5", 21 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 22 | model_max_length=120, 23 | ) 24 | scheduler = dict( 25 | type="iddpm", 26 | num_sampling_steps=100, 27 | cfg_scale=7.0, 28 | ) 29 | dtype = "bf16" 30 | 31 | # Others 32 | batch_size = 1 33 | seed = 42 34 | prompt_path = "./assets/texts/t2v_samples.txt" 35 | save_dir = "./samples/samples/" 36 | -------------------------------------------------------------------------------- /Device_conversion/configs/opensora/train/16x256x256-mask.py: -------------------------------------------------------------------------------- 1 | # Define dataset 2 | dataset = dict( 3 | type="VideoTextDataset", 4 | data_path=None, 5 | num_frames=16, 6 | frame_interval=3, 7 | image_size=(256, 256), 8 | ) 9 | 10 | # Define acceleration 11 | num_workers = 4 12 | dtype = "bf16" 13 | grad_checkpoint = True 14 | plugin = "zero2" 15 | sp_size = 1 16 | 17 | # Define model 18 | model = dict( 19 | type="STDiT-XL/2", 20 | space_scale=0.5, 21 | time_scale=1.0, 22 | from_pretrained="PixArt-XL-2-512x512.pth", 23 | enable_flash_attn=True, 24 | enable_layernorm_kernel=True, 25 | ) 26 | mask_ratios = { 27 | "identity": 0.7, 28 | "random": 0.15, 29 | "mask_head": 0.05, 30 | "mask_tail": 0.05, 31 | "mask_head_tail": 0.05, 32 | } 33 | vae = dict( 34 | type="VideoAutoencoderKL", 35 | from_pretrained="stabilityai/sd-vae-ft-ema", 36 | ) 37 | text_encoder = dict( 38 | type="t5", 39 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 40 | model_max_length=120, 41 | shardformer=True, 42 | ) 43 | scheduler = dict( 44 | type="iddpm", 45 | timestep_respacing="", 46 | ) 47 | 48 | # Others 49 | seed = 42 50 | outputs = "outputs" 51 | wandb = False 52 | 53 | epochs = 1000 54 | 
log_every = 10 55 | ckpt_every = 1000 56 | load = None 57 | 58 | batch_size = 8 59 | lr = 2e-5 60 | grad_clip = 1.0 61 | -------------------------------------------------------------------------------- /Device_conversion/configs/opensora/train/16x256x256-spee-rflow.py: -------------------------------------------------------------------------------- 1 | # Define dataset 2 | dataset = dict( 3 | type="VideoTextDataset", 4 | data_path=None, 5 | num_frames=16, 6 | frame_interval=3, 7 | image_size=(256, 256), 8 | ) 9 | 10 | # Define acceleration 11 | num_workers = 4 12 | dtype = "bf16" 13 | grad_checkpoint = True 14 | plugin = "zero2" 15 | sp_size = 1 16 | 17 | # Define model 18 | model = dict( 19 | type="STDiT-XL/2", 20 | space_scale=0.5, 21 | time_scale=1.0, 22 | # from_pretrained="PixArt-XL-2-512x512.pth", 23 | # from_pretrained = "/home/zhaowangbo/wangbo/PixArt-alpha/pretrained_models/OpenSora-v1-HQ-16x512x512.pth", 24 | # from_pretrained = "OpenSora-v1-HQ-16x512x512.pth", 25 | from_pretrained="PRETRAINED_MODEL", 26 | enable_flash_attn=True, 27 | enable_layernorm_kernel=True, 28 | ) 29 | # mask_ratios = [0.5, 0.29, 0.07, 0.07, 0.07] 30 | # mask_ratios = { 31 | # "identity": 0.9, 32 | # "random": 0.06, 33 | # "mask_head": 0.01, 34 | # "mask_tail": 0.01, 35 | # "mask_head_tail": 0.02, 36 | # } 37 | vae = dict( 38 | type="VideoAutoencoderKL", 39 | from_pretrained="stabilityai/sd-vae-ft-ema", 40 | ) 41 | text_encoder = dict( 42 | type="t5", 43 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 44 | model_max_length=120, 45 | shardformer=True, 46 | ) 47 | scheduler = dict( 48 | type="rflow", 49 | # timestep_respacing="", 50 | ) 51 | 52 | # Others 53 | seed = 42 54 | outputs = "outputs" 55 | wandb = True 56 | 57 | epochs = 1 58 | log_every = 10 59 | ckpt_every = 1000 60 | load = None 61 | 62 | batch_size = 16 63 | lr = 2e-5 64 | grad_clip = 1.0 65 | -------------------------------------------------------------------------------- /Device_conversion/configs/opensora/train/16x256x256-spee.py: -------------------------------------------------------------------------------- 1 | # Define dataset 2 | dataset = dict( 3 | type="VideoTextDataset", 4 | data_path=None, 5 | num_frames=16, 6 | frame_interval=3, 7 | image_size=(256, 256), 8 | ) 9 | 10 | # Define acceleration 11 | num_workers = 4 12 | dtype = "bf16" 13 | grad_checkpoint = True 14 | plugin = "zero2" 15 | sp_size = 1 16 | 17 | # Define model 18 | model = dict( 19 | type="STDiT-XL/2", 20 | space_scale=0.5, 21 | time_scale=1.0, 22 | from_pretrained="PixArt-XL-2-512x512.pth", 23 | enable_flash_attn=True, 24 | enable_layernorm_kernel=True, 25 | ) 26 | mask_ratios = { 27 | "identity": 0.5, 28 | "random": 0.29, 29 | "mask_head": 0.07, 30 | "mask_tail": 0.07, 31 | "mask_head_tail": 0.07, 32 | } 33 | vae = dict( 34 | type="VideoAutoencoderKL", 35 | from_pretrained="stabilityai/sd-vae-ft-ema", 36 | ) 37 | text_encoder = dict( 38 | type="t5", 39 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 40 | model_max_length=120, 41 | shardformer=True, 42 | ) 43 | scheduler = dict( 44 | type="iddpm-speed", 45 | timestep_respacing="", 46 | ) 47 | 48 | # Others 49 | seed = 42 50 | outputs = "outputs" 51 | wandb = False 52 | 53 | epochs = 1000 54 | log_every = 10 55 | ckpt_every = 1000 56 | load = None 57 | 58 | batch_size = 8 59 | lr = 2e-5 60 | grad_clip = 1.0 61 | -------------------------------------------------------------------------------- /Device_conversion/configs/opensora/train/16x256x256.py: -------------------------------------------------------------------------------- 1 
| # Define dataset 2 | dataset = dict( 3 | type="VideoTextDataset", 4 | data_path=None, 5 | num_frames=16, 6 | frame_interval=3, 7 | image_size=(256, 256), 8 | ) 9 | 10 | # Define acceleration 11 | num_workers = 0 12 | dtype = "bf16" 13 | grad_checkpoint = True 14 | plugin = "zero2" 15 | sp_size = 1 16 | 17 | # Define model 18 | model = dict( 19 | type="STDiT-XL/2", 20 | space_scale=0.5, 21 | time_scale=1.0, 22 | from_pretrained="PixArt-XL-2-512x512.pth", 23 | enable_flash_attn=True, 24 | enable_layernorm_kernel=True, 25 | ) 26 | vae = dict( 27 | type="VideoAutoencoderKL", 28 | from_pretrained="stabilityai/sd-vae-ft-ema", 29 | ) 30 | text_encoder = dict( 31 | type="t5", 32 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 33 | model_max_length=120, 34 | shardformer=True, 35 | ) 36 | scheduler = dict( 37 | type="iddpm", 38 | timestep_respacing="", 39 | ) 40 | 41 | # Others 42 | seed = 42 43 | outputs = "outputs" 44 | wandb = False 45 | 46 | epochs = 1000 47 | log_every = 10 48 | ckpt_every = 1000 49 | load = None 50 | 51 | batch_size = 8 52 | lr = 2e-5 53 | grad_clip = 1.0 54 | -------------------------------------------------------------------------------- /Device_conversion/configs/opensora/train/16x512x512.py: -------------------------------------------------------------------------------- 1 | # Define dataset 2 | dataset = dict( 3 | type="VideoTextDataset", 4 | data_path=None, 5 | num_frames=16, 6 | frame_interval=3, 7 | image_size=(512, 512), 8 | ) 9 | 10 | # Define acceleration 11 | num_workers = 4 12 | dtype = "bf16" 13 | grad_checkpoint = True 14 | plugin = "zero2" 15 | sp_size = 1 16 | 17 | # Define model 18 | model = dict( 19 | type="STDiT-XL/2", 20 | space_scale=1.0, 21 | time_scale=1.0, 22 | from_pretrained=None, 23 | enable_flash_attn=True, 24 | enable_layernorm_kernel=True, 25 | ) 26 | vae = dict( 27 | type="VideoAutoencoderKL", 28 | from_pretrained="stabilityai/sd-vae-ft-ema", 29 | micro_batch_size=128, 30 | ) 31 | text_encoder = dict( 32 | type="t5", 33 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 34 | model_max_length=120, 35 | shardformer=True, 36 | ) 37 | scheduler = dict( 38 | type="iddpm", 39 | timestep_respacing="", 40 | ) 41 | 42 | # Others 43 | seed = 42 44 | outputs = "outputs" 45 | wandb = False 46 | 47 | epochs = 1000 48 | log_every = 10 49 | ckpt_every = 500 50 | load = None 51 | 52 | batch_size = 8 53 | lr = 2e-5 54 | grad_clip = 1.0 55 | -------------------------------------------------------------------------------- /Device_conversion/configs/opensora/train/360x512x512.py: -------------------------------------------------------------------------------- 1 | # Define dataset 2 | dataset = dict( 3 | type="VideoTextDataset", 4 | data_path=None, 5 | num_frames=360, 6 | frame_interval=3, 7 | image_size=(512, 512), 8 | ) 9 | 10 | # Define acceleration 11 | num_workers = 4 12 | dtype = "bf16" 13 | grad_checkpoint = True 14 | plugin = "zero2" 15 | sp_size = 1 16 | 17 | # Define acceleration 18 | dtype = "bf16" 19 | grad_checkpoint = True 20 | plugin = "zero2-seq" 21 | sp_size = 2 22 | 23 | # Define model 24 | model = dict( 25 | type="STDiT-XL/2", 26 | space_scale=1.0, 27 | time_scale=2 / 3, 28 | from_pretrained=None, 29 | enable_flash_attn=True, 30 | enable_layernorm_kernel=True, 31 | enable_sequence_parallelism=True, # enable sq here 32 | ) 33 | vae = dict( 34 | type="VideoAutoencoderKL", 35 | from_pretrained="stabilityai/sd-vae-ft-ema", 36 | micro_batch_size=128, 37 | ) 38 | text_encoder = dict( 39 | type="t5", 40 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 41 | 
model_max_length=120, 42 | shardformer=True, 43 | ) 44 | scheduler = dict( 45 | type="iddpm", 46 | timestep_respacing="", 47 | ) 48 | 49 | # Others 50 | seed = 42 51 | outputs = "outputs" 52 | wandb = False 53 | 54 | epochs = 1000 55 | log_every = 10 56 | ckpt_every = 250 57 | load = None 58 | 59 | batch_size = 1 60 | lr = 2e-5 61 | grad_clip = 1.0 62 | -------------------------------------------------------------------------------- /Device_conversion/configs/opensora/train/64x512x512-sp.py: -------------------------------------------------------------------------------- 1 | # Define dataset 2 | dataset = dict( 3 | type="VideoTextDataset", 4 | data_path=None, 5 | num_frames=16, 6 | frame_interval=3, 7 | image_size=(512, 512), 8 | ) 9 | 10 | # Define acceleration 11 | num_workers = 4 12 | dtype = "bf16" 13 | grad_checkpoint = True 14 | plugin = "zero2" 15 | sp_size = 2 16 | 17 | # Define model 18 | model = dict( 19 | type="STDiT-XL/2", 20 | space_scale=1.0, 21 | time_scale=2 / 3, 22 | from_pretrained=None, 23 | enable_flash_attn=True, 24 | enable_layernorm_kernel=True, 25 | enable_sequence_parallelism=True, # enable sq here 26 | ) 27 | vae = dict( 28 | type="VideoAutoencoderKL", 29 | from_pretrained="stabilityai/sd-vae-ft-ema", 30 | ) 31 | text_encoder = dict( 32 | type="t5", 33 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 34 | model_max_length=120, 35 | shardformer=True, 36 | ) 37 | scheduler = dict( 38 | type="iddpm", 39 | timestep_respacing="", 40 | ) 41 | 42 | # Others 43 | seed = 42 44 | outputs = "outputs" 45 | wandb = False 46 | 47 | epochs = 1000 48 | log_every = 10 49 | ckpt_every = 1000 50 | load = None 51 | 52 | batch_size = 1 53 | lr = 2e-5 54 | grad_clip = 1.0 55 | -------------------------------------------------------------------------------- /Device_conversion/configs/opensora/train/64x512x512.py: -------------------------------------------------------------------------------- 1 | # Define dataset 2 | dataset = dict( 3 | type="VideoTextDataset", 4 | data_path=None, 5 | num_frames=64, 6 | frame_interval=3, 7 | image_size=(512, 512), 8 | ) 9 | 10 | # Define acceleration 11 | num_workers = 4 12 | dtype = "bf16" 13 | grad_checkpoint = True 14 | plugin = "zero2" 15 | sp_size = 1 16 | 17 | # Define model 18 | model = dict( 19 | type="STDiT-XL/2", 20 | space_scale=1.0, 21 | time_scale=2 / 3, 22 | from_pretrained=None, 23 | enable_flash_attn=True, 24 | enable_layernorm_kernel=True, 25 | ) 26 | vae = dict( 27 | type="VideoAutoencoderKL", 28 | from_pretrained="stabilityai/sd-vae-ft-ema", 29 | micro_batch_size=64, 30 | ) 31 | text_encoder = dict( 32 | type="t5", 33 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 34 | model_max_length=120, 35 | shardformer=True, 36 | ) 37 | scheduler = dict( 38 | type="iddpm", 39 | timestep_respacing="", 40 | ) 41 | 42 | # Others 43 | seed = 42 44 | outputs = "outputs" 45 | wandb = False 46 | 47 | epochs = 1000 48 | log_every = 10 49 | ckpt_every = 250 50 | load = None 51 | 52 | batch_size = 4 53 | lr = 2e-5 54 | grad_clip = 1.0 55 | -------------------------------------------------------------------------------- /Device_conversion/configs/pixart/inference/16x256x256.py: -------------------------------------------------------------------------------- 1 | num_frames = 16 2 | fps = 8 3 | image_size = (256, 256) 4 | 5 | # Define model 6 | model = dict( 7 | type="PixArt-XL/2", 8 | space_scale=0.5, 9 | time_scale=1.0, 10 | from_pretrained="outputs/098-F16S3-PixArt-XL-2/epoch7-global_step30000/model_ckpt.pt", 11 | ) 12 | vae = dict( 13 | type="VideoAutoencoderKL", 
14 | from_pretrained="stabilityai/sd-vae-ft-ema", 15 | ) 16 | text_encoder = dict( 17 | type="t5", 18 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 19 | model_max_length=120, 20 | ) 21 | scheduler = dict( 22 | type="dpm-solver", 23 | num_sampling_steps=20, 24 | cfg_scale=7.0, 25 | ) 26 | dtype = "bf16" 27 | 28 | # Others 29 | batch_size = 2 30 | seed = 42 31 | prompt_path = "./assets/texts/t2v_samples.txt" 32 | save_dir = "./samples/samples/" 33 | -------------------------------------------------------------------------------- /Device_conversion/configs/pixart/inference/1x1024MS.py: -------------------------------------------------------------------------------- 1 | num_frames = 1 2 | fps = 1 3 | image_size = (1920, 512) 4 | multi_resolution = "PixArtMS" 5 | 6 | # Define model 7 | model = dict( 8 | type="PixArtMS-XL/2", 9 | space_scale=2.0, 10 | time_scale=1.0, 11 | no_temporal_pos_emb=True, 12 | from_pretrained="PixArt-XL-2-1024-MS.pth", 13 | ) 14 | vae = dict( 15 | type="VideoAutoencoderKL", 16 | from_pretrained="stabilityai/sd-vae-ft-ema", 17 | ) 18 | text_encoder = dict( 19 | type="t5", 20 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 21 | model_max_length=120, 22 | ) 23 | scheduler = dict( 24 | type="dpm-solver", 25 | num_sampling_steps=20, 26 | cfg_scale=7.0, 27 | ) 28 | dtype = "bf16" 29 | 30 | # Others 31 | batch_size = 2 32 | seed = 42 33 | prompt_path = "./assets/texts/t2i_samples.txt" 34 | save_dir = "./samples/samples/" 35 | -------------------------------------------------------------------------------- /Device_conversion/configs/pixart/inference/1x20481B.py: -------------------------------------------------------------------------------- 1 | num_frames = 1 2 | fps = 1 3 | image_size = (2560, 1536) 4 | # image_size = (2048, 2048) 5 | 6 | model = dict( 7 | type="PixArt-1B/2", 8 | from_pretrained="PixArt-1B-2.pth", 9 | space_scale=4, 10 | no_temporal_pos_emb=True, 11 | enable_flash_attn=True, 12 | enable_layernorm_kernel=True, 13 | base_size=2048 // 8, 14 | ) 15 | vae = dict( 16 | type="VideoAutoencoderKL", 17 | from_pretrained="PixArt-alpha/pixart_sigma_sdxlvae_T5_diffusers", 18 | subfolder="vae", 19 | ) 20 | text_encoder = dict( 21 | type="t5", 22 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 23 | model_max_length=300, 24 | ) 25 | scheduler = dict( 26 | type="dpm-solver", 27 | num_sampling_steps=14, 28 | cfg_scale=4.5, 29 | ) 30 | dtype = "bf16" 31 | 32 | # Others 33 | batch_size = 1 34 | seed = 42 35 | prompt_path = "./assets/texts/t2i_sigma.txt" 36 | save_dir = "./samples/samples/" 37 | -------------------------------------------------------------------------------- /Device_conversion/configs/pixart/inference/1x2048MS.py: -------------------------------------------------------------------------------- 1 | num_frames = 1 2 | fps = 1 3 | image_size = (2560, 1536) 4 | # image_size = (2048, 2048) 5 | 6 | model = dict( 7 | type="PixArt-XL/2", 8 | from_pretrained="PixArt-Sigma-XL-2-2K-MS.pth", 9 | space_scale=4, 10 | no_temporal_pos_emb=True, 11 | enable_flash_attn=True, 12 | enable_layernorm_kernel=True, 13 | base_size=2048 // 8, 14 | ) 15 | vae = dict( 16 | type="VideoAutoencoderKL", 17 | from_pretrained="PixArt-alpha/pixart_sigma_sdxlvae_T5_diffusers", 18 | subfolder="vae", 19 | ) 20 | text_encoder = dict( 21 | type="t5", 22 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 23 | model_max_length=300, 24 | ) 25 | scheduler = dict( 26 | type="dpm-solver", 27 | num_sampling_steps=14, 28 | cfg_scale=4.5, 29 | ) 30 | dtype = "bf16" 31 | 32 | # Others 33 | batch_size = 1 34 | seed = 42 35 | 
prompt_path = "./assets/texts/t2i_sigma.txt" 36 | save_dir = "./samples/samples/" 37 | -------------------------------------------------------------------------------- /Device_conversion/configs/pixart/inference/1x256x256.py: -------------------------------------------------------------------------------- 1 | num_frames = 1 2 | fps = 1 3 | image_size = (256, 256) 4 | 5 | # Define model 6 | model = dict( 7 | type="PixArt-XL/2", 8 | space_scale=1.0, 9 | time_scale=1.0, 10 | no_temporal_pos_emb=True, 11 | from_pretrained="PixArt-XL-2-256x256.pth", 12 | ) 13 | vae = dict( 14 | type="VideoAutoencoderKL", 15 | from_pretrained="stabilityai/sd-vae-ft-ema", 16 | ) 17 | text_encoder = dict( 18 | type="t5", 19 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 20 | model_max_length=120, 21 | ) 22 | scheduler = dict( 23 | type="dpm-solver", 24 | num_sampling_steps=20, 25 | cfg_scale=7.0, 26 | ) 27 | dtype = "bf16" 28 | 29 | # Others 30 | batch_size = 2 31 | seed = 42 32 | prompt_path = "./assets/texts/t2i_samples.txt" 33 | save_dir = "./samples/samples/" 34 | -------------------------------------------------------------------------------- /Device_conversion/configs/pixart/inference/1x512x512-rflow.py: -------------------------------------------------------------------------------- 1 | num_frames = 1 2 | fps = 1 3 | image_size = (512, 512) 4 | 5 | # Define model 6 | model = dict( 7 | type="PixArt-XL/2", 8 | space_scale=1.0, 9 | time_scale=1.0, 10 | no_temporal_pos_emb=True, 11 | from_pretrained="PRETRAINED_MODEL", 12 | ) 13 | vae = dict( 14 | type="VideoAutoencoderKL", 15 | from_pretrained="stabilityai/sd-vae-ft-ema", 16 | ) 17 | text_encoder = dict( 18 | type="t5", 19 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 20 | model_max_length=120, 21 | ) 22 | scheduler = dict( 23 | type="rflow", 24 | num_sampling_steps=20, 25 | cfg_scale=7.0, 26 | ) 27 | dtype = "bf16" 28 | 29 | # prompt_path = "./assets/texts/t2i_samples.txt" 30 | prompt = [ 31 | "Pirate ship trapped in a cosmic maelstrom nebula.", 32 | "A small cactus with a happy face in the Sahara desert.", 33 | "A small cactus with a sad face in the Sahara desert.", 34 | ] 35 | 36 | # Others 37 | batch_size = 2 38 | seed = 42 39 | save_dir = "./outputs/samples2/" 40 | -------------------------------------------------------------------------------- /Device_conversion/configs/pixart/inference/1x512x512.py: -------------------------------------------------------------------------------- 1 | num_frames = 1 2 | fps = 1 3 | image_size = (512, 512) 4 | 5 | # Define model 6 | model = dict( 7 | type="PixArt-XL/2", 8 | space_scale=1.0, 9 | time_scale=1.0, 10 | no_temporal_pos_emb=True, 11 | from_pretrained="PixArt-XL-2-512x512.pth", 12 | ) 13 | vae = dict( 14 | type="VideoAutoencoderKL", 15 | from_pretrained="stabilityai/sd-vae-ft-ema", 16 | ) 17 | text_encoder = dict( 18 | type="t5", 19 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 20 | model_max_length=120, 21 | ) 22 | scheduler = dict( 23 | type="dpm-solver", 24 | num_sampling_steps=20, 25 | cfg_scale=7.0, 26 | ) 27 | dtype = "bf16" 28 | 29 | # prompt_path = "./assets/texts/t2i_samples.txt" 30 | prompt = [ 31 | "Pirate ship trapped in a cosmic maelstrom nebula.", 32 | "A small cactus with a happy face in the Sahara desert.", 33 | "A small cactus with a sad face in the Sahara desert.", 34 | ] 35 | 36 | # Others 37 | batch_size = 2 38 | seed = 42 39 | save_dir = "./samples/samples/" 40 | -------------------------------------------------------------------------------- /Device_conversion/configs/pixart/train/16x256x256.py: 
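The PixArt inference configs above are plain Python modules with no driver code in this Device_conversion tree. The sketch below is one hypothetical way they could be consumed, assuming the mmengine-style registries defined further down in opensora/registry.py and the dpm-solver wrapper in opensora/schedulers/dpms/__init__.py; the file path, the latent shape, and the extra arguments (device=..., the decode() call) are illustrative assumptions, not APIs confirmed by this repository.

import torch
from mmengine.config import Config

from opensora.registry import MODELS, SCHEDULERS, build_module

cfg = Config.fromfile("Device_conversion/configs/pixart/inference/1x256x256.py")
device = "cuda" if torch.cuda.is_available() else "cpu"

vae = build_module(cfg.vae, MODELS).to(device).eval()
text_encoder = build_module(cfg.text_encoder, MODELS, device=device)
model = build_module(cfg.model, MODELS).to(device).eval()
scheduler = build_module(cfg.scheduler, SCHEDULERS)  # resolves type="dpm-solver"

prompts = ["A small cactus with a happy face in the Sahara desert."]
# Illustrative latent shape: 4 channels, num_frames, spatial size / 8 for sd-vae-ft-ema.
z = torch.randn(len(prompts), 4, cfg.num_frames, cfg.image_size[0] // 8, cfg.image_size[1] // 8, device=device)
samples = scheduler.sample(model, text_encoder, z, prompts, device)
videos = vae.decode(samples)  # assumes the VAE wrapper exposes decode() for latents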
-------------------------------------------------------------------------------- 1 | # Define dataset 2 | dataset = dict( 3 | type="VideoTextDataset", 4 | data_path=None, 5 | num_frames=16, 6 | frame_interval=3, 7 | image_size=(256, 256), 8 | ) 9 | 10 | # Define acceleration 11 | num_workers = 4 12 | dtype = "bf16" 13 | grad_checkpoint = True 14 | plugin = "zero2" 15 | sp_size = 1 16 | 17 | # Define model 18 | model = dict( 19 | type="PixArt-XL/2", 20 | space_scale=0.5, 21 | time_scale=1.0, 22 | from_pretrained="PixArt-XL-2-512x512.pth", 23 | enable_flash_attn=True, 24 | enable_layernorm_kernel=True, 25 | ) 26 | vae = dict( 27 | type="VideoAutoencoderKL", 28 | from_pretrained="stabilityai/sd-vae-ft-ema", 29 | ) 30 | text_encoder = dict( 31 | type="t5", 32 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 33 | model_max_length=120, 34 | shardformer=True, 35 | ) 36 | scheduler = dict( 37 | type="iddpm", 38 | timestep_respacing="", 39 | ) 40 | 41 | # Others 42 | seed = 42 43 | outputs = "outputs" 44 | wandb = False 45 | 46 | epochs = 1000 47 | log_every = 10 48 | ckpt_every = 1000 49 | load = None 50 | 51 | batch_size = 8 52 | lr = 2e-5 53 | grad_clip = 1.0 54 | -------------------------------------------------------------------------------- /Device_conversion/configs/pixart/train/1x2048x2048.py: -------------------------------------------------------------------------------- 1 | # Define dataset 2 | dataset = dict( 3 | type="VideoTextDataset", 4 | data_path="/home/zhaowangbo/data/csv/image-v1_1_ext_noempty_rcp_clean_info.csv", 5 | num_frames=1, 6 | frame_interval=3, 7 | image_size=(2048, 2048), 8 | ) 9 | 10 | # Define acceleration 11 | num_workers = 4 12 | dtype = "bf16" 13 | grad_checkpoint = True 14 | plugin = "zero2" 15 | sp_size = 1 16 | 17 | # Define model 18 | model = dict( 19 | type="PixArt-1B/2", 20 | space_scale=4.0, 21 | no_temporal_pos_emb=True, 22 | from_pretrained="PixArt-1B-2.pth", 23 | enable_flash_attn=True, 24 | enable_layernorm_kernel=True, 25 | ) 26 | 27 | vae = dict( 28 | type="VideoAutoencoderKL", 29 | from_pretrained="PixArt-alpha/pixart_sigma_sdxlvae_T5_diffusers", 30 | subfolder="vae", 31 | ) 32 | text_encoder = dict( 33 | type="t5", 34 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 35 | model_max_length=300, 36 | ) 37 | scheduler = dict( 38 | type="iddpm", 39 | timestep_respacing="", 40 | ) 41 | 42 | # Others 43 | seed = 42 44 | outputs = "outputs" 45 | wandb = False 46 | 47 | epochs = 1000 48 | log_every = 10 49 | ckpt_every = 1000 50 | load = None 51 | 52 | batch_size = 4 53 | lr = 2e-5 54 | grad_clip = 1.0 55 | -------------------------------------------------------------------------------- /Device_conversion/configs/pixart/train/1x512x512-rflow.py: -------------------------------------------------------------------------------- 1 | # Define dataset 2 | dataset = dict( 3 | type="VideoTextDataset", 4 | data_path=None, 5 | num_frames=1, 6 | frame_interval=3, 7 | image_size=(512, 512), 8 | ) 9 | 10 | # Define acceleration 11 | num_workers = 4 12 | dtype = "bf16" 13 | grad_checkpoint = True 14 | plugin = "zero2" 15 | sp_size = 1 16 | 17 | # Define model 18 | model = dict( 19 | type="PixArt-XL/2", 20 | space_scale=1.0, 21 | time_scale=1.0, 22 | no_temporal_pos_emb=True, 23 | # from_pretrained="PixArt-XL-2-512x512.pth", 24 | from_pretrained="PRETRAINED_MODEL", 25 | enable_flash_attn=True, 26 | enable_layernorm_kernel=True, 27 | ) 28 | vae = dict( 29 | type="VideoAutoencoderKL", 30 | from_pretrained="stabilityai/sd-vae-ft-ema", 31 | ) 32 | text_encoder = dict( 33 | type="t5", 34 | 
from_pretrained="DeepFloyd/t5-v1_1-xxl", 35 | model_max_length=120, 36 | shardformer=True, 37 | ) 38 | scheduler = dict( 39 | type="rflow", 40 | # timestep_respacing="", 41 | ) 42 | 43 | # Others 44 | seed = 42 45 | outputs = "outputs" 46 | wandb = True 47 | 48 | epochs = 2 49 | log_every = 10 50 | ckpt_every = 1000 51 | load = None 52 | 53 | batch_size = 64 54 | lr = 2e-5 55 | grad_clip = 1.0 56 | -------------------------------------------------------------------------------- /Device_conversion/configs/pixart/train/1x512x512.py: -------------------------------------------------------------------------------- 1 | # Define dataset 2 | dataset = dict( 3 | type="VideoTextDataset", 4 | data_path=None, 5 | num_frames=1, 6 | frame_interval=3, 7 | image_size=(512, 512), 8 | ) 9 | 10 | # Define acceleration 11 | num_workers = 4 12 | dtype = "bf16" 13 | grad_checkpoint = True 14 | plugin = "zero2" 15 | sp_size = 1 16 | 17 | # Define model 18 | model = dict( 19 | type="PixArt-XL/2", 20 | space_scale=1.0, 21 | time_scale=1.0, 22 | no_temporal_pos_emb=True, 23 | from_pretrained="PixArt-XL-2-512x512.pth", 24 | enable_flash_attn=True, 25 | enable_layernorm_kernel=True, 26 | ) 27 | vae = dict( 28 | type="VideoAutoencoderKL", 29 | from_pretrained="stabilityai/sd-vae-ft-ema", 30 | ) 31 | text_encoder = dict( 32 | type="t5", 33 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 34 | model_max_length=120, 35 | shardformer=True, 36 | ) 37 | scheduler = dict( 38 | type="iddpm", 39 | timestep_respacing="", 40 | ) 41 | 42 | # Others 43 | seed = 42 44 | outputs = "outputs" 45 | wandb = False 46 | 47 | epochs = 1000 48 | log_every = 10 49 | ckpt_every = 1000 50 | load = None 51 | 52 | batch_size = 32 53 | lr = 2e-5 54 | grad_clip = 1.0 55 | -------------------------------------------------------------------------------- /Device_conversion/configs/pixart/train/64x512x512.py: -------------------------------------------------------------------------------- 1 | # Define dataset 2 | dataset = dict( 3 | type="VideoTextDataset", 4 | data_path=None, 5 | num_frames=64, 6 | frame_interval=3, 7 | image_size=(256, 256), 8 | ) 9 | 10 | # Define acceleration 11 | num_workers = 4 12 | dtype = "bf16" 13 | grad_checkpoint = True 14 | plugin = "zero2" 15 | sp_size = 1 16 | 17 | 18 | # Define model 19 | model = dict( 20 | type="PixArt-XL/2", 21 | space_scale=1.0, 22 | time_scale=2 / 3, 23 | from_pretrained=None, 24 | enable_flash_attn=True, 25 | enable_layernorm_kernel=True, 26 | ) 27 | vae = dict( 28 | type="VideoAutoencoderKL", 29 | from_pretrained="stabilityai/sd-vae-ft-ema", 30 | micro_batch_size=128, 31 | ) 32 | text_encoder = dict( 33 | type="t5", 34 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 35 | model_max_length=120, 36 | shardformer=True, 37 | ) 38 | scheduler = dict( 39 | type="iddpm", 40 | timestep_respacing="", 41 | ) 42 | 43 | # Others 44 | seed = 42 45 | outputs = "outputs" 46 | wandb = False 47 | 48 | epochs = 1000 49 | log_every = 10 50 | ckpt_every = 250 51 | load = None 52 | 53 | batch_size = 4 54 | lr = 2e-5 55 | grad_clip = 1.0 56 | -------------------------------------------------------------------------------- /Device_conversion/configs/vae/inference/image.py: -------------------------------------------------------------------------------- 1 | image_size = (256, 256) 2 | num_frames = 1 3 | 4 | dtype = "bf16" 5 | batch_size = 1 6 | seed = 42 7 | save_dir = "samples/vae_video" 8 | cal_stats = True 9 | log_stats_every = 100 10 | 11 | # Define dataset 12 | dataset = dict( 13 | type="VideoTextDataset", 14 | data_path=None, 
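# NOTE: data_path is left as None throughout these configs; before any of them
# can actually run, it presumably has to point at a CSV listing the video/image
# samples, in the same spirit as the hard-coded CSV path in
# configs/pixart/train/1x2048x2048.py above.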
15 | num_frames=num_frames, 16 | image_size=image_size, 17 | ) 18 | num_samples = 100 19 | num_workers = 4 20 | 21 | # Define model 22 | model = dict( 23 | type="OpenSoraVAE_V1_2", 24 | from_pretrained="hpcai-tech/OpenSora-VAE-v1.2", 25 | micro_frame_size=None, 26 | micro_batch_size=4, 27 | cal_loss=True, 28 | ) 29 | 30 | # loss weights 31 | perceptual_loss_weight = 0.1 # perceptual (VGG) loss is used when this is not None and greater than 0 32 | kl_loss_weight = 1e-6 33 | -------------------------------------------------------------------------------- /Device_conversion/configs/vae/inference/video.py: -------------------------------------------------------------------------------- 1 | image_size = (256, 256) 2 | num_frames = 51 3 | 4 | dtype = "bf16" 5 | batch_size = 1 6 | seed = 42 7 | save_dir = "samples/vae_video" 8 | cal_stats = True 9 | log_stats_every = 100 10 | 11 | # Define dataset 12 | dataset = dict( 13 | type="VideoTextDataset", 14 | data_path=None, 15 | num_frames=num_frames, 16 | image_size=image_size, 17 | ) 18 | num_samples = 100 19 | num_workers = 4 20 | 21 | # Define model 22 | model = dict( 23 | type="OpenSoraVAE_V1_2", 24 | from_pretrained="hpcai-tech/OpenSora-VAE-v1.2", 25 | micro_frame_size=None, 26 | micro_batch_size=4, 27 | cal_loss=True, 28 | ) 29 | 30 | # loss weights 31 | perceptual_loss_weight = 0.1 # perceptual (VGG) loss is used when this is not None and greater than 0 32 | kl_loss_weight = 1e-6 33 | -------------------------------------------------------------------------------- /Device_conversion/configs/vae/train/stage1.py: -------------------------------------------------------------------------------- 1 | num_frames = 17 2 | image_size = (256, 256) 3 | 4 | # Define dataset 5 | dataset = dict( 6 | type="VideoTextDataset", 7 | data_path=None, 8 | num_frames=num_frames, 9 | frame_interval=1, 10 | image_size=image_size, 11 | ) 12 | 13 | # Define acceleration 14 | num_workers = 16 15 | dtype = "bf16" 16 | grad_checkpoint = True 17 | plugin = "zero2" 18 | 19 | # Define model 20 | model = dict( 21 | type="OpenSoraVAE_V1_2", 22 | freeze_vae_2d=True, 23 | from_pretrained=None, 24 | cal_loss=True, 25 | ) 26 | 27 | # loss weights 28 | perceptual_loss_weight = 0.1 # perceptual (VGG) loss is used when this is not None and greater than 0 29 | kl_loss_weight = 1e-6 30 | 31 | mixed_strategy = "mixed_video_image" 32 | mixed_image_ratio = 0.2 33 | use_real_rec_loss = False 34 | use_z_rec_loss = True 35 | use_image_identity_loss = True 36 | 37 | # Others 38 | seed = 42 39 | outputs = "outputs/vae_stage1" 40 | wandb = False 41 | 42 | epochs = 100 # NOTE: adjust accordingly w.r.t dataset size 43 | log_every = 1 44 | ckpt_every = 1000 45 | load = None 46 | 47 | batch_size = 1 48 | lr = 1e-5 49 | grad_clip = 1.0 50 | -------------------------------------------------------------------------------- /Device_conversion/configs/vae/train/stage2.py: -------------------------------------------------------------------------------- 1 | num_frames = 17 2 | image_size = (256, 256) 3 | 4 | # Define dataset 5 | dataset = dict( 6 | type="VideoTextDataset", 7 | data_path=None, 8 | num_frames=num_frames, 9 | frame_interval=1, 10 | image_size=image_size, 11 | ) 12 | 13 | # Define acceleration 14 | num_workers = 16 15 | dtype = "bf16" 16 | grad_checkpoint = True 17 | plugin = "zero2" 18 | 19 | # Define model 20 | model = dict( 21 | type="OpenSoraVAE_V1_2", 22 | freeze_vae_2d=False, 23 | from_pretrained="outputs/vae_stage1", 24 | cal_loss=True, 25 | ) 26 | 27 | # loss weights 28 | perceptual_loss_weight = 0.1 # perceptual (VGG) loss is used when this is not None and greater than 0 29 | kl_loss_weight = 1e-6 30 | 31 | mixed_strategy =
"mixed_video_image" 32 | mixed_image_ratio = 0.2 33 | use_real_rec_loss = False 34 | use_z_rec_loss = True 35 | use_image_identity_loss = False 36 | 37 | # Others 38 | seed = 42 39 | outputs = "outputs/vae_stage2" 40 | wandb = False 41 | 42 | epochs = 100 # NOTE: adjust accordingly w.r.t dataset size 43 | log_every = 1 44 | ckpt_every = 1000 45 | load = None 46 | 47 | batch_size = 1 48 | lr = 1e-5 49 | grad_clip = 1.0 50 | -------------------------------------------------------------------------------- /Device_conversion/configs/vae/train/stage3.py: -------------------------------------------------------------------------------- 1 | num_frames = 33 2 | image_size = (256, 256) 3 | 4 | # Define dataset 5 | dataset = dict( 6 | type="VideoTextDataset", 7 | data_path=None, 8 | num_frames=num_frames, 9 | frame_interval=1, 10 | image_size=image_size, 11 | ) 12 | 13 | # Define acceleration 14 | num_workers = 16 15 | dtype = "bf16" 16 | grad_checkpoint = True 17 | plugin = "zero2" 18 | 19 | # Define model 20 | model = dict( 21 | type="OpenSoraVAE_V1_2", 22 | freeze_vae_2d=False, 23 | from_pretrained="outputs/vae_stage2", 24 | cal_loss=True, 25 | ) 26 | 27 | # loss weights 28 | perceptual_loss_weight = 0.1 # perceptual (VGG) loss is used when this is not None and greater than 0 29 | kl_loss_weight = 1e-6 30 | 31 | mixed_strategy = "mixed_video_random" 32 | use_real_rec_loss = True 33 | use_z_rec_loss = False 34 | use_image_identity_loss = False 35 | 36 | # Others 37 | seed = 42 38 | outputs = "outputs/vae_stage3" 39 | wandb = False 40 | 41 | epochs = 100 # NOTE: adjust accordingly w.r.t dataset size 42 | log_every = 1 43 | ckpt_every = 1000 44 | load = None 45 | 46 | batch_size = 1 47 | lr = 1e-5 48 | grad_clip = 1.0 49 | -------------------------------------------------------------------------------- /Device_conversion/coreml-export/stdit3/fps.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Device_conversion/coreml-export/stdit3/fps.pkl -------------------------------------------------------------------------------- /Device_conversion/coreml-export/stdit3/mask.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Device_conversion/coreml-export/stdit3/mask.pkl -------------------------------------------------------------------------------- /Device_conversion/coreml-export/stdit3/y.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Device_conversion/coreml-export/stdit3/y.pkl -------------------------------------------------------------------------------- /Device_conversion/coreml-export/t5/configs/T5BlockConfig.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Device_conversion/coreml-export/t5/configs/T5BlockConfig.pkl -------------------------------------------------------------------------------- /Device_conversion/coreml-export/t5/quantization.py: -------------------------------------------------------------------------------- 1 | import coremltools as ct 2 | from coremltools.models.neural_network import quantization_utils 3 | 4 | mlmodel = ct.models.MLModel("stdit3.mlpackage",
compute_units=ct.ComputeUnit.CPU_ONLY) 5 | 6 | op_config = ct.optimize.coreml.OpPalettizerConfig( 7 | mode="kmeans", 8 | nbits=8, 9 | ) 10 | 11 | config = ct.optimize.coreml.OptimizationConfig( 12 | global_config=op_config, 13 | op_type_configs={ 14 | "gather": None # avoid quantizing the embedding table 15 | } 16 | ) 17 | 18 | model = ct.optimize.coreml.palettize_weights(mlmodel, config=config).save("quantize/stdit3.mlpackage") -------------------------------------------------------------------------------- /Device_conversion/coreml-export/t5/y_embedding.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Device_conversion/coreml-export/t5/y_embedding.pkl -------------------------------------------------------------------------------- /Device_conversion/opensora/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Device_conversion/opensora/__init__.py -------------------------------------------------------------------------------- /Device_conversion/opensora/acceleration/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Device_conversion/opensora/acceleration/__init__.py -------------------------------------------------------------------------------- /Device_conversion/opensora/acceleration/checkpoint.py: -------------------------------------------------------------------------------- 1 | from collections.abc import Iterable 2 | 3 | import torch.nn as nn 4 | from torch.utils.checkpoint import checkpoint, checkpoint_sequential 5 | 6 | 7 | def set_grad_checkpoint(model, use_fp32_attention=False, gc_step=1): 8 | assert isinstance(model, nn.Module) 9 | 10 | def set_attr(module): 11 | module.grad_checkpointing = True 12 | module.fp32_attention = use_fp32_attention 13 | module.grad_checkpointing_step = gc_step 14 | 15 | model.apply(set_attr) 16 | 17 | 18 | def auto_grad_checkpoint(module, *args, **kwargs): 19 | if getattr(module, "grad_checkpointing", False): 20 | if not isinstance(module, Iterable): 21 | return checkpoint(module, *args, use_reentrant=False, **kwargs) 22 | gc_step = module[0].grad_checkpointing_step 23 | return checkpoint_sequential(module, gc_step, *args, use_reentrant=False, **kwargs) 24 | return module(*args, **kwargs) 25 | -------------------------------------------------------------------------------- /Device_conversion/opensora/acceleration/parallel_states.py: -------------------------------------------------------------------------------- 1 | import torch.distributed as dist 2 | 3 | _GLOBAL_PARALLEL_GROUPS = dict() 4 | 5 | 6 | def set_data_parallel_group(group: dist.ProcessGroup): 7 | _GLOBAL_PARALLEL_GROUPS["data"] = group 8 | 9 | 10 | def get_data_parallel_group(): 11 | return _GLOBAL_PARALLEL_GROUPS.get("data", dist.group.WORLD) 12 | 13 | 14 | def set_sequence_parallel_group(group: dist.ProcessGroup): 15 | _GLOBAL_PARALLEL_GROUPS["sequence"] = group 16 | 17 | 18 | def get_sequence_parallel_group(): 19 | return _GLOBAL_PARALLEL_GROUPS.get("sequence", None) 20 | -------------------------------------------------------------------------------- /Device_conversion/opensora/acceleration/shardformer/__init__.py: 
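A minimal usage sketch for the activation-checkpoint helpers in opensora/acceleration/checkpoint.py above. TinyBlockStack is a hypothetical stand-in for a real block stack (for example the STDiT blocks), not code from this repository; the tensor sizes are arbitrary.

import torch
from torch import nn

from opensora.acceleration.checkpoint import auto_grad_checkpoint, set_grad_checkpoint


class TinyBlockStack(nn.Module):
    """Hypothetical stand-in for a stack of transformer blocks."""

    def __init__(self):
        super().__init__()
        self.blocks = nn.ModuleList(nn.Linear(64, 64) for _ in range(4))

    def forward(self, x):
        for block in self.blocks:
            # Plain call when grad checkpointing is off; torch.utils.checkpoint otherwise.
            x = auto_grad_checkpoint(block, x)
        return x


model = TinyBlockStack()
set_grad_checkpoint(model)  # marks every submodule with grad_checkpointing = True

x = torch.randn(2, 64, requires_grad=True)
model(x).sum().backward()  # block activations are recomputed during the backward pass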
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Device_conversion/opensora/acceleration/shardformer/__init__.py -------------------------------------------------------------------------------- /Device_conversion/opensora/acceleration/shardformer/modeling/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Device_conversion/opensora/acceleration/shardformer/modeling/__init__.py -------------------------------------------------------------------------------- /Device_conversion/opensora/acceleration/shardformer/policy/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Device_conversion/opensora/acceleration/shardformer/policy/__init__.py -------------------------------------------------------------------------------- /Device_conversion/opensora/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from .datasets import IMG_FPS, BatchFeatureDataset, VariableVideoTextDataset, VideoTextDataset 2 | from .utils import get_transforms_image, get_transforms_video, is_img, is_vid, save_sample 3 | -------------------------------------------------------------------------------- /Device_conversion/opensora/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .dit import * 2 | from .latte import * 3 | from .pixart import * 4 | from .stdit import * 5 | from .text_encoder import * 6 | from .vae import * 7 | -------------------------------------------------------------------------------- /Device_conversion/opensora/models/dit/__init__.py: -------------------------------------------------------------------------------- 1 | from .dit import DiT, DiT_XL_2, DiT_XL_2x2 2 | -------------------------------------------------------------------------------- /Device_conversion/opensora/models/latte/__init__.py: -------------------------------------------------------------------------------- 1 | from .latte import Latte, Latte_XL_2, Latte_XL_2x2 2 | -------------------------------------------------------------------------------- /Device_conversion/opensora/models/layers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Device_conversion/opensora/models/layers/__init__.py -------------------------------------------------------------------------------- /Device_conversion/opensora/models/pixart/__init__.py: -------------------------------------------------------------------------------- 1 | from .pixart import PixArt, PixArt_1B_2, PixArt_XL_2 2 | from .pixart_sigma import PixArt_Sigma_XL_2 3 | -------------------------------------------------------------------------------- /Device_conversion/opensora/models/stdit/__init__.py: -------------------------------------------------------------------------------- 1 | from .stdit import STDiT 2 | from .stdit2 import STDiT2 3 | from .stdit3 import STDiT3 4 | -------------------------------------------------------------------------------- /Device_conversion/opensora/models/text_encoder/__init__.py: 
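The ClassEncoder registered in opensora/models/text_encoder/classes.py just below is small enough to exercise directly. A short sketch follows; num_classes=1000 (ImageNet-style labels) and device="cpu" are illustrative choices, not values fixed by the repository.

from opensora.models.text_encoder.classes import ClassEncoder

encoder = ClassEncoder(num_classes=1000, device="cpu")
batch = encoder.encode(["207", "360"])  # -> {"y": tensor([207, 360])}
null_y = encoder.null(2)                # -> tensor([1000, 1000]), an out-of-range "null" class

The extra class index returned by null() is what class-conditional samplers typically feed as the unconditional branch for classifier-free guidance.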
-------------------------------------------------------------------------------- 1 | from .classes import ClassEncoder 2 | from .clip import ClipEncoder 3 | from .t5 import T5Encoder 4 | -------------------------------------------------------------------------------- /Device_conversion/opensora/models/text_encoder/classes.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from opensora.registry import MODELS 4 | 5 | 6 | @MODELS.register_module("classes") 7 | class ClassEncoder: 8 | def __init__(self, num_classes, model_max_length=None, device="cuda", dtype=torch.float): 9 | self.num_classes = num_classes 10 | self.y_embedder = None 11 | 12 | self.model_max_length = model_max_length 13 | self.output_dim = None 14 | self.device = device 15 | 16 | def encode(self, text): 17 | return dict(y=torch.tensor([int(t) for t in text]).to(self.device)) 18 | 19 | def null(self, n): 20 | return torch.tensor([self.num_classes] * n).to(self.device) 21 | -------------------------------------------------------------------------------- /Device_conversion/opensora/models/vae/__init__.py: -------------------------------------------------------------------------------- 1 | from .discriminator import DISCRIMINATOR_3D 2 | from .vae import VideoAutoencoderKL, VideoAutoencoderKLTemporalDecoder 3 | from .vae_temporal import VAE_Temporal 4 | -------------------------------------------------------------------------------- /Device_conversion/opensora/registry.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | 3 | import torch.nn as nn 4 | from mmengine.registry import Registry 5 | 6 | 7 | def build_module(module, builder, **kwargs): 8 | """Build module from config or return the module itself. 9 | 10 | Args: 11 | module (Union[dict, nn.Module]): The module to build. 12 | builder (Registry): The registry to build module. 13 | *args, **kwargs: Arguments passed to build function. 14 | 15 | Returns: 16 | Any: The built module. 
17 | """ 18 | if module is None: 19 | return None 20 | if isinstance(module, dict): 21 | cfg = deepcopy(module) 22 | for k, v in kwargs.items(): 23 | cfg[k] = v 24 | return builder.build(cfg) 25 | elif isinstance(module, nn.Module): 26 | return module 27 | elif module is None: 28 | return None 29 | else: 30 | raise TypeError(f"Only support dict and nn.Module, but got {type(module)}.") 31 | 32 | 33 | MODELS = Registry( 34 | "model", 35 | locations=["opensora.models"], 36 | ) 37 | 38 | SCHEDULERS = Registry( 39 | "scheduler", 40 | locations=["opensora.schedulers"], 41 | ) 42 | 43 | DATASETS = Registry( 44 | "dataset", 45 | locations=["opensora.datasets"], 46 | ) 47 | -------------------------------------------------------------------------------- /Device_conversion/opensora/schedulers/__init__.py: -------------------------------------------------------------------------------- 1 | from .dpms import DPMS 2 | from .iddpm import IDDPM 3 | from .rf import RFLOW 4 | -------------------------------------------------------------------------------- /Device_conversion/opensora/schedulers/dpms/__init__.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | 3 | import torch 4 | 5 | from opensora.registry import SCHEDULERS 6 | 7 | from .dpm_solver import DPMS 8 | 9 | 10 | @SCHEDULERS.register_module("dpm-solver") 11 | class DPM_SOLVER: 12 | def __init__(self, num_sampling_steps=None, cfg_scale=4.0): 13 | self.num_sampling_steps = num_sampling_steps 14 | self.cfg_scale = cfg_scale 15 | 16 | def sample( 17 | self, 18 | model, 19 | text_encoder, 20 | z, 21 | prompts, 22 | device, 23 | additional_args=None, 24 | mask=None, 25 | progress=True, 26 | ): 27 | assert mask is None, "mask is not supported in dpm-solver" 28 | n = len(prompts) 29 | model_args = text_encoder.encode(prompts) 30 | y = model_args.pop("y") 31 | null_y = text_encoder.null(n) 32 | if additional_args is not None: 33 | model_args.update(additional_args) 34 | 35 | dpms = DPMS( 36 | partial(forward_with_dpmsolver, model), 37 | condition=y, 38 | uncondition=null_y, 39 | cfg_scale=self.cfg_scale, 40 | model_kwargs=model_args, 41 | ) 42 | samples = dpms.sample( 43 | z, 44 | steps=self.num_sampling_steps, 45 | order=2, 46 | skip_type="time_uniform", 47 | method="multistep", 48 | progress=progress, 49 | ) 50 | return samples 51 | 52 | 53 | def forward_with_dpmsolver(self, x, timestep, y, **kwargs): 54 | """ 55 | DPM-Solver does not need the variance prediction, so only the predicted mean is returned 56 | """ 57 | # https://github.com/openai/glide-text2im/blob/main/notebooks/text2im.ipynb 58 | model_out = self.forward(x, timestep, y, **kwargs) 59 | return model_out.chunk(2, dim=1)[0] 60 | -------------------------------------------------------------------------------- /Device_conversion/opensora/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Device_conversion/opensora/utils/__init__.py -------------------------------------------------------------------------------- /Device_conversion/opensora/utils/lr_scheduler.py: -------------------------------------------------------------------------------- 1 | from torch.optim.lr_scheduler import _LRScheduler 2 | 3 | 4 | class LinearWarmupLR(_LRScheduler): 5 | """Linearly warm up the learning rate and then keep it constant. 6 | 7 | Args: 8 | optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
9 | warmup_steps (int, optional): Number of warmup steps, defaults to 0. 10 | last_epoch (int, optional): The index of the last epoch, defaults to -1. When last_epoch=-1, 11 | the schedule starts from the beginning and the initial learning rate is the optimizer's lr. 12 | """ 13 | 14 | def __init__(self, optimizer, warmup_steps: int = 0, last_epoch: int = -1): 15 | self.warmup_steps = warmup_steps 16 | super().__init__(optimizer, last_epoch=last_epoch) 17 | 18 | def get_lr(self): 19 | if self.last_epoch < self.warmup_steps: 20 | return [(self.last_epoch + 1) / (self.warmup_steps + 1) * lr for lr in self.base_lrs] 21 | else: 22 | return self.base_lrs 23 | -------------------------------------------------------------------------------- /Device_conversion/requirements/requirements-cu121.txt: -------------------------------------------------------------------------------- 1 | torch==2.3.0 --index-url https://download.pytorch.org/whl/cu121 2 | torchvision==0.18.0 --index-url https://download.pytorch.org/whl/cu121 3 | -------------------------------------------------------------------------------- /Device_conversion/requirements/requirements-data.txt: -------------------------------------------------------------------------------- 1 | gdown>=5.2.0 2 | 3 | # [caption llava] 4 | ninja>=1.11.1.1 5 | shortuuid>=1.0.13 6 | markdown2[all] 7 | scikit-learn>=1.4.2 8 | einops-exts>=0.0.4 9 | 10 | # [camera_motion] 11 | decord==0.6.0 12 | ptvsd==4.3.2 13 | imageio-ffmpeg>=0.4.9 14 | 15 | # [datasets] 16 | ffmpeg-python==0.2.0 17 | lingua-language-detector==2.0.2 18 | 19 | # [frame interpolation] 20 | imageio>=2.34.1 21 | 22 | # [aesthetic] 23 | setuptools==68.2.2 24 | clip @ git+https://github.com/openai/CLIP.git 25 | 26 | # [ocr] 27 | mmcv==2.1.0 28 | mmdet==3.1.0 29 | mmocr==1.0.1 30 | detectron2 @ git+https://github.com/facebookresearch/detectron2.git@ff53992 31 | -------------------------------------------------------------------------------- /Device_conversion/requirements/requirements-eval.txt: -------------------------------------------------------------------------------- 1 | # [vbench] 2 | detectron2 @ git+https://github.com/facebookresearch/detectron2.git@ff53992 3 | imageio>=2.34.1 4 | pyiqa==0.1.10 5 | scikit-learn>=1.4.2 6 | scikit-image>=0.20.0 7 | lvis==0.5.3 8 | boto3>=1.34.113 9 | easydict>=1.9 10 | fairscale>=0.4.13 11 | 12 | # [vae] 13 | decord==0.6.0 14 | pytorchvideo==0.1.5 15 | lpips==0.1.4 16 | -------------------------------------------------------------------------------- /Device_conversion/requirements/requirements-vae.txt: -------------------------------------------------------------------------------- 1 | beartype==0.18.5 2 | einops==0.8.0 3 | einops-exts==0.0.4 4 | opencv-python==4.9.0.80 5 | pillow==10.3.0 6 | -------------------------------------------------------------------------------- /Device_conversion/requirements/requirements.txt: -------------------------------------------------------------------------------- 1 | colossalai==0.3.7 2 | mmengine>=0.10.3 3 | pandas>=2.0.3 4 | timm==0.9.16 5 | rotary_embedding_torch==0.5.3 6 | ftfy>=6.2.0 # for t5 7 | diffusers==0.27.2 # for vae 8 | accelerate==0.29.2 # for t5 9 | av>=12.0.0 # for video loading 10 | 11 | # [gradio] 12 | gradio>=4.26.0 13 | spaces>=0.28.3 14 | 15 | # [notebook] 16 | ipykernel>=6.29.4 17 | ipywidgets>=8.1.2 18 | 19 | # [training] 20 | wandb>=0.17.0 21 | tensorboard>=2.14.0 22 | pandarallel>=1.6.5 23 | pyarrow>=16.1.0 # for parquet 24 | 25 | # [dev] 26 | pre-commit>=3.5.0 27 | openai 28 |
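A quick sketch of the LinearWarmupLR scheduler from opensora/utils/lr_scheduler.py above; the optimizer, parameter, and step counts are arbitrary, with lr=2e-5 chosen only to match the training configs earlier in this tree.

import torch

from opensora.utils.lr_scheduler import LinearWarmupLR

param = torch.nn.Parameter(torch.zeros(8))
optimizer = torch.optim.AdamW([param], lr=2e-5)
scheduler = LinearWarmupLR(optimizer, warmup_steps=1000)

for _ in range(1200):
    optimizer.step()
    scheduler.step()
# The learning rate ramps linearly from ~0 to 2e-5 over the first 1000 steps,
# then stays at 2e-5 (this class does not decay after warmup).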
-------------------------------------------------------------------------------- /Figures/On-device-Sora-Example1.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Figures/On-device-Sora-Example1.gif -------------------------------------------------------------------------------- /Figures/overview.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Figures/overview.jpg -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 EAI LAB at UNIST 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Modded_Open_Sora/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM hpcaitech/pytorch-cuda:2.1.0-12.1.0 2 | 3 | # metainformation 4 | LABEL org.opencontainers.image.source = "https://github.com/hpcaitech/Open-Sora" 5 | LABEL org.opencontainers.image.licenses = "Apache License 2.0" 6 | LABEL org.opencontainers.image.base.name = "docker.io/library/hpcaitech/pytorch-cuda:2.1.0-12.1.0" 7 | 8 | # Set the working directory 9 | WORKDIR /workspace/Open-Sora 10 | # Copy the current directory contents into the container at /workspace/Open-Sora 11 | COPY . . 12 | 13 | # install library dependencies 14 | RUN apt-get update && apt-get install ffmpeg libsm6 libxext6 -y 15 | 16 | # install flash attention 17 | RUN pip install flash-attn --no-build-isolation 18 | 19 | # install apex 20 | RUN pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" git+https://github.com/NVIDIA/apex.git 21 | 22 | # install xformers 23 | RUN pip install xformers --index-url https://download.pytorch.org/whl/cu121 24 | 25 | # install this project 26 | RUN pip install -v .
27 | -------------------------------------------------------------------------------- /Modded_Open_Sora/README.md: -------------------------------------------------------------------------------- 1 | # On-Device-Sora -------------------------------------------------------------------------------- /Modded_Open_Sora/assets/demo/sample_16s_224x448.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Modded_Open_Sora/assets/demo/sample_16s_224x448.gif -------------------------------------------------------------------------------- /Modded_Open_Sora/assets/demo/sample_16s_320x320.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Modded_Open_Sora/assets/demo/sample_16s_320x320.gif -------------------------------------------------------------------------------- /Modded_Open_Sora/assets/demo/sample_16x240x426_9.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Modded_Open_Sora/assets/demo/sample_16x240x426_9.gif -------------------------------------------------------------------------------- /Modded_Open_Sora/assets/demo/sample_32x240x426_7.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Modded_Open_Sora/assets/demo/sample_32x240x426_7.gif -------------------------------------------------------------------------------- /Modded_Open_Sora/assets/demo/sample_32x480x854_9.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Modded_Open_Sora/assets/demo/sample_32x480x854_9.gif -------------------------------------------------------------------------------- /Modded_Open_Sora/assets/demo/sora_16x240x426_26.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Modded_Open_Sora/assets/demo/sora_16x240x426_26.gif -------------------------------------------------------------------------------- /Modded_Open_Sora/assets/demo/sora_16x240x426_27.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Modded_Open_Sora/assets/demo/sora_16x240x426_27.gif -------------------------------------------------------------------------------- /Modded_Open_Sora/assets/demo/sora_16x240x426_40.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Modded_Open_Sora/assets/demo/sora_16x240x426_40.gif -------------------------------------------------------------------------------- /Modded_Open_Sora/assets/demo/sora_16x426x240_24.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Modded_Open_Sora/assets/demo/sora_16x426x240_24.gif 
-------------------------------------------------------------------------------- /Modded_Open_Sora/assets/demo/sora_16x426x240_3.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Modded_Open_Sora/assets/demo/sora_16x426x240_3.gif -------------------------------------------------------------------------------- /Modded_Open_Sora/assets/demo/v1.2/sample_0002.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Modded_Open_Sora/assets/demo/v1.2/sample_0002.gif -------------------------------------------------------------------------------- /Modded_Open_Sora/assets/demo/v1.2/sample_0004.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Modded_Open_Sora/assets/demo/v1.2/sample_0004.gif -------------------------------------------------------------------------------- /Modded_Open_Sora/assets/demo/v1.2/sample_0011.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Modded_Open_Sora/assets/demo/v1.2/sample_0011.gif -------------------------------------------------------------------------------- /Modded_Open_Sora/assets/demo/v1.2/sample_0013.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Modded_Open_Sora/assets/demo/v1.2/sample_0013.gif -------------------------------------------------------------------------------- /Modded_Open_Sora/assets/demo/v1.2/sample_0052.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Modded_Open_Sora/assets/demo/v1.2/sample_0052.gif -------------------------------------------------------------------------------- /Modded_Open_Sora/assets/demo/v1.2/sample_0061.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Modded_Open_Sora/assets/demo/v1.2/sample_0061.gif -------------------------------------------------------------------------------- /Modded_Open_Sora/assets/demo/v1.2/sample_0087.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Modded_Open_Sora/assets/demo/v1.2/sample_0087.gif -------------------------------------------------------------------------------- /Modded_Open_Sora/assets/demo/v1.2/sample_1718.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Modded_Open_Sora/assets/demo/v1.2/sample_1718.gif -------------------------------------------------------------------------------- /Modded_Open_Sora/assets/demo/v1.2/sample_1719.gif: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Modded_Open_Sora/assets/demo/v1.2/sample_1719.gif -------------------------------------------------------------------------------- /Modded_Open_Sora/assets/images/condition/cactus-happy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Modded_Open_Sora/assets/images/condition/cactus-happy.png -------------------------------------------------------------------------------- /Modded_Open_Sora/assets/images/condition/cactus-sad.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Modded_Open_Sora/assets/images/condition/cactus-sad.png -------------------------------------------------------------------------------- /Modded_Open_Sora/assets/images/condition/cliff.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Modded_Open_Sora/assets/images/condition/cliff.png -------------------------------------------------------------------------------- /Modded_Open_Sora/assets/images/condition/ship.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Modded_Open_Sora/assets/images/condition/ship.png -------------------------------------------------------------------------------- /Modded_Open_Sora/assets/images/condition/sunset1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Modded_Open_Sora/assets/images/condition/sunset1.png -------------------------------------------------------------------------------- /Modded_Open_Sora/assets/images/condition/sunset2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Modded_Open_Sora/assets/images/condition/sunset2.png -------------------------------------------------------------------------------- /Modded_Open_Sora/assets/images/condition/wave.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Modded_Open_Sora/assets/images/condition/wave.png -------------------------------------------------------------------------------- /Modded_Open_Sora/assets/images/imagenet/train/n01440764/n01440764_10026.JPEG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Modded_Open_Sora/assets/images/imagenet/train/n01440764/n01440764_10026.JPEG -------------------------------------------------------------------------------- /Modded_Open_Sora/assets/images/imagenet/val/n01440764/ILSVRC2012_val_00000293.JPEG: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Modded_Open_Sora/assets/images/imagenet/val/n01440764/ILSVRC2012_val_00000293.JPEG -------------------------------------------------------------------------------- /Modded_Open_Sora/assets/images/ocr/demo_text_det.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Modded_Open_Sora/assets/images/ocr/demo_text_det.jpg -------------------------------------------------------------------------------- /Modded_Open_Sora/assets/images/ocr/demo_text_ocr.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Modded_Open_Sora/assets/images/ocr/demo_text_ocr.jpg -------------------------------------------------------------------------------- /Modded_Open_Sora/assets/images/ocr/demo_text_recog.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Modded_Open_Sora/assets/images/ocr/demo_text_recog.jpg -------------------------------------------------------------------------------- /Modded_Open_Sora/assets/images/watermark/watermark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Modded_Open_Sora/assets/images/watermark/watermark.png -------------------------------------------------------------------------------- /Modded_Open_Sora/assets/readme/colossal_ai.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Modded_Open_Sora/assets/readme/colossal_ai.png -------------------------------------------------------------------------------- /Modded_Open_Sora/assets/readme/gradio_advanced.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Modded_Open_Sora/assets/readme/gradio_advanced.png -------------------------------------------------------------------------------- /Modded_Open_Sora/assets/readme/gradio_basic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Modded_Open_Sora/assets/readme/gradio_basic.png -------------------------------------------------------------------------------- /Modded_Open_Sora/assets/readme/gradio_option.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Modded_Open_Sora/assets/readme/gradio_option.png -------------------------------------------------------------------------------- /Modded_Open_Sora/assets/readme/icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Modded_Open_Sora/assets/readme/icon.png -------------------------------------------------------------------------------- 
/Modded_Open_Sora/assets/readme/llava_vs_pllava_sample.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Modded_Open_Sora/assets/readme/llava_vs_pllava_sample.gif -------------------------------------------------------------------------------- /Modded_Open_Sora/assets/readme/report-03_actions_count.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Modded_Open_Sora/assets/readme/report-03_actions_count.png -------------------------------------------------------------------------------- /Modded_Open_Sora/assets/readme/report-03_objects_count.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Modded_Open_Sora/assets/readme/report-03_objects_count.png -------------------------------------------------------------------------------- /Modded_Open_Sora/assets/readme/report-03_video_stats.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Modded_Open_Sora/assets/readme/report-03_video_stats.png -------------------------------------------------------------------------------- /Modded_Open_Sora/assets/readme/report_3d_vae.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Modded_Open_Sora/assets/readme/report_3d_vae.png -------------------------------------------------------------------------------- /Modded_Open_Sora/assets/readme/report_arch.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Modded_Open_Sora/assets/readme/report_arch.jpg -------------------------------------------------------------------------------- /Modded_Open_Sora/assets/readme/report_arch_comp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Modded_Open_Sora/assets/readme/report_arch_comp.png -------------------------------------------------------------------------------- /Modded_Open_Sora/assets/readme/report_bucket.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Modded_Open_Sora/assets/readme/report_bucket.png -------------------------------------------------------------------------------- /Modded_Open_Sora/assets/readme/report_caption.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Modded_Open_Sora/assets/readme/report_caption.png -------------------------------------------------------------------------------- /Modded_Open_Sora/assets/readme/report_data_pipeline.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Modded_Open_Sora/assets/readme/report_data_pipeline.png -------------------------------------------------------------------------------- /Modded_Open_Sora/assets/readme/report_image_textlen.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Modded_Open_Sora/assets/readme/report_image_textlen.png -------------------------------------------------------------------------------- /Modded_Open_Sora/assets/readme/report_loss_curve_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Modded_Open_Sora/assets/readme/report_loss_curve_1.png -------------------------------------------------------------------------------- /Modded_Open_Sora/assets/readme/report_loss_curve_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Modded_Open_Sora/assets/readme/report_loss_curve_2.png -------------------------------------------------------------------------------- /Modded_Open_Sora/assets/readme/report_loss_curve_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Modded_Open_Sora/assets/readme/report_loss_curve_3.png -------------------------------------------------------------------------------- /Modded_Open_Sora/assets/readme/report_mask.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Modded_Open_Sora/assets/readme/report_mask.png -------------------------------------------------------------------------------- /Modded_Open_Sora/assets/readme/report_mask_config.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Modded_Open_Sora/assets/readme/report_mask_config.png -------------------------------------------------------------------------------- /Modded_Open_Sora/assets/readme/report_val_loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Modded_Open_Sora/assets/readme/report_val_loss.png -------------------------------------------------------------------------------- /Modded_Open_Sora/assets/readme/report_vbench_score.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Modded_Open_Sora/assets/readme/report_vbench_score.png -------------------------------------------------------------------------------- /Modded_Open_Sora/assets/readme/report_vid_val_loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Modded_Open_Sora/assets/readme/report_vid_val_loss.png 
-------------------------------------------------------------------------------- /Modded_Open_Sora/assets/readme/report_video_duration.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Modded_Open_Sora/assets/readme/report_video_duration.png -------------------------------------------------------------------------------- /Modded_Open_Sora/assets/readme/report_video_textlen.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Modded_Open_Sora/assets/readme/report_video_textlen.png -------------------------------------------------------------------------------- /Modded_Open_Sora/assets/readme/sample_0.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Modded_Open_Sora/assets/readme/sample_0.gif -------------------------------------------------------------------------------- /Modded_Open_Sora/assets/readme/sample_1.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Modded_Open_Sora/assets/readme/sample_1.gif -------------------------------------------------------------------------------- /Modded_Open_Sora/assets/readme/sample_2.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Modded_Open_Sora/assets/readme/sample_2.gif -------------------------------------------------------------------------------- /Modded_Open_Sora/assets/readme/sample_3.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Modded_Open_Sora/assets/readme/sample_3.gif -------------------------------------------------------------------------------- /Modded_Open_Sora/assets/readme/sample_4.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Modded_Open_Sora/assets/readme/sample_4.gif -------------------------------------------------------------------------------- /Modded_Open_Sora/assets/readme/sample_5.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Modded_Open_Sora/assets/readme/sample_5.gif -------------------------------------------------------------------------------- /Modded_Open_Sora/assets/readme/sequence_parallelism.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Modded_Open_Sora/assets/readme/sequence_parallelism.jpeg -------------------------------------------------------------------------------- /Modded_Open_Sora/assets/texts/VBench/prompts_per_dimension/color.txt: -------------------------------------------------------------------------------- 1 | a red bicycle 2 | a green bicycle 3 | a blue bicycle 4 | a yellow bicycle 5 | an orange 
bicycle 6 | a purple bicycle 7 | a pink bicycle 8 | a black bicycle 9 | a white bicycle 10 | a red car 11 | a green car 12 | a blue car 13 | a yellow car 14 | an orange car 15 | a purple car 16 | a pink car 17 | a black car 18 | a white car 19 | a red bird 20 | a green bird 21 | a blue bird 22 | a yellow bird 23 | an orange bird 24 | a purple bird 25 | a pink bird 26 | a black bird 27 | a white bird 28 | a black cat 29 | a white cat 30 | an orange cat 31 | a yellow cat 32 | a red umbrella 33 | a green umbrella 34 | a blue umbrella 35 | a yellow umbrella 36 | an orange umbrella 37 | a purple umbrella 38 | a pink umbrella 39 | a black umbrella 40 | a white umbrella 41 | a red suitcase 42 | a green suitcase 43 | a blue suitcase 44 | a yellow suitcase 45 | an orange suitcase 46 | a purple suitcase 47 | a pink suitcase 48 | a black suitcase 49 | a white suitcase 50 | a red bowl 51 | a green bowl 52 | a blue bowl 53 | a yellow bowl 54 | an orange bowl 55 | a purple bowl 56 | a pink bowl 57 | a black bowl 58 | a white bowl 59 | a red chair 60 | a green chair 61 | a blue chair 62 | a yellow chair 63 | an orange chair 64 | a purple chair 65 | a pink chair 66 | a black chair 67 | a white chair 68 | a red clock 69 | a green clock 70 | a blue clock 71 | a yellow clock 72 | an orange clock 73 | a purple clock 74 | a pink clock 75 | a black clock 76 | a white clock 77 | a red vase 78 | a green vase 79 | a blue vase 80 | a yellow vase 81 | an orange vase 82 | a purple vase 83 | a pink vase 84 | a black vase 85 | a white vase 86 | -------------------------------------------------------------------------------- /Modded_Open_Sora/assets/texts/VBench/prompts_per_dimension/object_class.txt: -------------------------------------------------------------------------------- 1 | a person 2 | a bicycle 3 | a car 4 | a motorcycle 5 | an airplane 6 | a bus 7 | a train 8 | a truck 9 | a boat 10 | a traffic light 11 | a fire hydrant 12 | a stop sign 13 | a parking meter 14 | a bench 15 | a bird 16 | a cat 17 | a dog 18 | a horse 19 | a sheep 20 | a cow 21 | an elephant 22 | a bear 23 | a zebra 24 | a giraffe 25 | a backpack 26 | an umbrella 27 | a handbag 28 | a tie 29 | a suitcase 30 | a frisbee 31 | skis 32 | a snowboard 33 | a sports ball 34 | a kite 35 | a baseball bat 36 | a baseball glove 37 | a skateboard 38 | a surfboard 39 | a tennis racket 40 | a bottle 41 | a wine glass 42 | a cup 43 | a fork 44 | a knife 45 | a spoon 46 | a bowl 47 | a banana 48 | an apple 49 | a sandwich 50 | an orange 51 | broccoli 52 | a carrot 53 | a hot dog 54 | a pizza 55 | a donut 56 | a cake 57 | a chair 58 | a couch 59 | a potted plant 60 | a bed 61 | a dining table 62 | a toilet 63 | a tv 64 | a laptop 65 | a remote 66 | a keyboard 67 | a cell phone 68 | a microwave 69 | an oven 70 | a toaster 71 | a sink 72 | a refrigerator 73 | a book 74 | a clock 75 | a vase 76 | scissors 77 | a teddy bear 78 | a hair drier 79 | a toothbrush 80 | -------------------------------------------------------------------------------- /Modded_Open_Sora/assets/texts/VBench/prompts_per_dimension/scene.txt: -------------------------------------------------------------------------------- 1 | alley 2 | amusement park 3 | aquarium 4 | arch 5 | art gallery 6 | bathroom 7 | bakery shop 8 | ballroom 9 | bar 10 | barn 11 | basement 12 | beach 13 | bedroom 14 | bridge 15 | botanical garden 16 | cafeteria 17 | campsite 18 | campus 19 | carrousel 20 | castle 21 | cemetery 22 | classroom 23 | cliff 24 | crosswalk 25 | construction site 26 | corridor 27 | courtyard 28 
| desert 29 | downtown 30 | driveway 31 | farm 32 | food court 33 | football field 34 | forest road 35 | fountain 36 | gas station 37 | glacier 38 | golf course 39 | indoor gymnasium 40 | harbor 41 | highway 42 | hospital 43 | house 44 | iceberg 45 | industrial area 46 | jail cell 47 | junkyard 48 | kitchen 49 | indoor library 50 | lighthouse 51 | laboratory 52 | mansion 53 | marsh 54 | mountain 55 | indoor movie theater 56 | indoor museum 57 | music studio 58 | nursery 59 | ocean 60 | office 61 | palace 62 | parking lot 63 | pharmacy 64 | phone booth 65 | raceway 66 | restaurant 67 | river 68 | science museum 69 | shower 70 | ski slope 71 | sky 72 | skyscraper 73 | baseball stadium 74 | staircase 75 | street 76 | supermarket 77 | indoor swimming pool 78 | tower 79 | outdoor track 80 | train railway 81 | train station platform 82 | underwater coral reef 83 | valley 84 | volcano 85 | waterfall 86 | windmill 87 | -------------------------------------------------------------------------------- /Modded_Open_Sora/assets/texts/imagenet_id.txt: -------------------------------------------------------------------------------- 1 | 207 2 | 360 3 | 387 4 | 974 5 | 88 6 | 979 7 | 417 8 | 279 9 | -------------------------------------------------------------------------------- /Modded_Open_Sora/assets/texts/imagenet_labels.txt: -------------------------------------------------------------------------------- 1 | golden retriever 2 | otter 3 | lesser panda 4 | geyser 5 | macaw 6 | valley 7 | balloon 8 | golden panda 9 | -------------------------------------------------------------------------------- /Modded_Open_Sora/assets/texts/t2i_samples.txt: -------------------------------------------------------------------------------- 1 | A small cactus with a happy face in the Sahara desert. 2 | Bright scene, aerial view,ancient city, fantasy, gorgeous light, mirror reflection, high detail, wide angle lens. 3 | Nature vs human nature, surreal, UHD, 8k, hyper details, rich colors, photograph. 4 | Poster of a mechanical cat, techical Schematics viewed from front. 5 | Luffy from ONEPIECE, handsome face, fantasy. 6 | Real beautiful woman. 7 | A alpaca made of colorful building blocks, cyberpunk. 8 | artistic 9 | -------------------------------------------------------------------------------- /Modded_Open_Sora/assets/texts/t2i_sigma.txt: -------------------------------------------------------------------------------- 1 | Eiffel Tower was Made up of more than 2 million translucent straws to look like a cloud, with the bell tower at the top of the building, Michel installed huge foam-making machines in the forest to blow huge amounts of unpredictable wet clouds in the building's classic architecture. 2 | A gorgeously rendered papercraft world of a coral reef, rife with colorful fish and sea creatures. 3 | Full body shot, a French woman, Photography, French Streets background, backlighting, rim light, Fujifilm. 4 | Close-up photos of models, hazy light and shadow, laser metal hair accessories, soft and beautiful, light gold pupils, white eyelashes, low saturation, real skin details, clear pores and fine lines, light reflection and refraction, ultra-clear, cinematography, award-winning works. 5 | A litter of golden retriever puppies playing in the snow. Their heads pop out of the snow, covered in. 
6 | Lego model, future rocket station, intricate details, high resolution, unreal engine, UHD 7 | One giant, sharp, metal square mirror in the center of the frame, four young people on the foreground, background sunny palm oil planation, tropical, realistic style, photography, nostalgic, green tone, mysterious, dreamy, bright color. 8 | Modern luxury contemporary luxury home interiors house, in the style of mimicking ruined materials, ray tracing, haunting houses, and stone, capture the essence of nature, gray and bronze, dynamic outdoor shots. 9 | Over the shoulder game perspective, game screen of Diablo 4, Inside the gorgeous palace is the wet ground, The necromancer knelt before the king, and a horde of skeletons he summoned stood at his side, cinematic light. 10 | A curvy timber house near a sea, designed by Zaha Hadid, represent the image of a cold, modern architecture, at night, white lighting, highly detailed. 11 | -------------------------------------------------------------------------------- /Modded_Open_Sora/assets/texts/t2v_car.txt: -------------------------------------------------------------------------------- 1 | |0|A car driving on the in forest.|2|A car driving in the desert.|4|A car driving near the coast.|6|A car driving in the city.|8|A car driving near a mountain.|10|A car driving on the surface of a river.|12|A car driving on the surface of the earch.|14|A car driving in the universe.{"reference_path": "https://cdn.openai.com/tmp/s/interp/d0.mp4", "mask_strategy": "0,0,0,0,16,0.4"} 2 | -------------------------------------------------------------------------------- /Modded_Open_Sora/assets/texts/t2v_latte.txt: -------------------------------------------------------------------------------- 1 | Yellow and black tropical fish dart through the sea. 2 | An epic tornado attacking above aglowing city at night. 3 | Slow pan upward of blazing oak fire in an indoor fireplace. 4 | a cat wearing sunglasses and working as a lifeguard at pool. 5 | Sunset over the sea. 6 | A dog in astronaut suit and sunglasses floating in space. 7 | A astronaut in flying in space, 4k, high resolution 8 | -------------------------------------------------------------------------------- /Modded_Open_Sora/assets/texts/t2v_ref.txt: -------------------------------------------------------------------------------- 1 | Drone view of waves crashing against the rugged cliffs along Big Sur’s garay point beach. The crashing blue waters create white-tipped waves, while the golden light of the setting sun illuminates the rocky shore. A small island with a lighthouse sits in the distance, and green shrubbery covers the cliff’s edge. The steep drop from the road down to the beach is a dramatic feat, with the cliff's edges jutting out over the sea. This is a view that captures the raw beauty of the coast and the rugged landscape of the Pacific Coast Highway. 2 | In an ornate, historical hall, a massive tidal wave peaks and begins to crash. Two surfers, seizing the moment, skillfully navigate the face of the wave. 3 | Pirate ship in a cosmic maelstrom nebula. 4 | Drone view of waves crashing against the rugged cliffs along Big Sur’s garay point beach. The crashing blue waters create white-tipped waves, while the golden light of the setting sun illuminates the rocky shore. A small island with a lighthouse sits in the distance, and green shrubbery covers the cliff’s edge. The steep drop from the road down to the beach is a dramatic feat, with the cliff's edges jutting out over the sea. 
This is a view that captures the raw beauty of the coast and the rugged landscape of the Pacific Coast Highway. 5 | A sad small cactus with in the Sahara desert becomes happy. 6 | A car driving on a road in the middle of a desert. 7 | -------------------------------------------------------------------------------- /Modded_Open_Sora/assets/texts/t2v_short.txt: -------------------------------------------------------------------------------- 1 | A fat rabbit wearing a purple robe walking through a fantasy landscape 2 | Waves crashing against a lone lighthouse, ominous lighting 3 | A mystical forest showcasing the adventures of travelers who enter 4 | A blue-haired mage singing 5 | A surreal landscape with floating islands and waterfalls in the sky craft 6 | A blue bird standing in water 7 | A young man walks alone by the seaside 8 | Pink rose on a glass surface with droplets, close-up 9 | Drove viewpoint, a subway train coming out of a tunnel 10 | Space with all planets green and pink color with background of bright white stars 11 | A city floating in an astral space, with stars and nebulae 12 | Sunrise on top of a high-rise building 13 | Pink and cyan powder explosions 14 | Deers in the woods gaze into the camera under the sunlight 15 | In a flash of lightning, a wizard appeared from thin air, his long robes billowing in the wind 16 | A futuristic cyberpunk cityscape at night with towering neon-lit skyscrapers 17 | A scene where the trees, flowers, and animals come together to create a symphony of nature 18 | A ghostly ship sailing through the clouds, navigating through a sea under a moonlit sky 19 | A sunset with beautiful beach 20 | A young man walking alone in the forest 21 | -------------------------------------------------------------------------------- /Modded_Open_Sora/assets/texts/ucf101_id.txt: -------------------------------------------------------------------------------- 1 | 0 2 | 1 3 | 2 4 | 3 5 | 4 6 | 5 7 | -------------------------------------------------------------------------------- /Modded_Open_Sora/assets/texts/ucf101_labels.txt: -------------------------------------------------------------------------------- 1 | Apply Eye Makeup 2 | Apply Lipstick 3 | Archery 4 | Baby Crawling 5 | Balance Beam 6 | Band Marching 7 | -------------------------------------------------------------------------------- /Modded_Open_Sora/configs/dit/inference/16x256x256.py: -------------------------------------------------------------------------------- 1 | num_frames = 16 2 | fps = 8 3 | image_size = (256, 256) 4 | 5 | # Define model 6 | model = dict( 7 | type="DiT-XL/2", 8 | condition="text", 9 | from_pretrained="PRETRAINED_MODEL", 10 | ) 11 | vae = dict( 12 | type="VideoAutoencoderKL", 13 | from_pretrained="stabilityai/sd-vae-ft-ema", 14 | ) 15 | text_encoder = dict( 16 | type="clip", 17 | from_pretrained="openai/clip-vit-base-patch32", 18 | model_max_length=77, 19 | ) 20 | scheduler = dict( 21 | type="dpm-solver", 22 | num_sampling_steps=20, 23 | cfg_scale=4.0, 24 | ) 25 | dtype = "bf16" 26 | 27 | # Others 28 | batch_size = 2 29 | seed = 42 30 | prompt_path = "./assets/texts/ucf101_labels.txt" 31 | save_dir = "./samples/samples/" 32 | -------------------------------------------------------------------------------- /Modded_Open_Sora/configs/dit/inference/1x256x256-class.py: -------------------------------------------------------------------------------- 1 | num_frames = 1 2 | fps = 1 3 | image_size = (256, 256) 4 | 5 | # Define model 6 | model = dict( 7 | type="DiT-XL/2", 8 | 
no_temporal_pos_emb=True, 9 | condition="label_1000", 10 | from_pretrained="DiT-XL-2-256x256.pt", 11 | ) 12 | vae = dict( 13 | type="VideoAutoencoderKL", 14 | from_pretrained="stabilityai/sd-vae-ft-ema", 15 | ) 16 | text_encoder = dict( 17 | type="classes", 18 | num_classes=1000, 19 | ) 20 | scheduler = dict( 21 | type="dpm-solver", 22 | num_sampling_steps=20, 23 | cfg_scale=4.0, 24 | ) 25 | dtype = "bf16" 26 | 27 | # Others 28 | batch_size = 2 29 | seed = 42 30 | prompt_path = "./assets/texts/imagenet_id.txt" 31 | save_dir = "./samples/samples/" 32 | -------------------------------------------------------------------------------- /Modded_Open_Sora/configs/dit/inference/1x256x256.py: -------------------------------------------------------------------------------- 1 | num_frames = 1 2 | fps = 1 3 | image_size = (256, 256) 4 | 5 | # Define model 6 | model = dict( 7 | type="DiT-XL/2", 8 | no_temporal_pos_emb=True, 9 | condition="text", 10 | from_pretrained="PRETRAINED_MODEL", 11 | ) 12 | vae = dict( 13 | type="VideoAutoencoderKL", 14 | from_pretrained="stabilityai/sd-vae-ft-ema", 15 | ) 16 | text_encoder = dict( 17 | type="clip", 18 | from_pretrained="openai/clip-vit-base-patch32", 19 | model_max_length=77, 20 | ) 21 | scheduler = dict( 22 | type="dpm-solver", 23 | num_sampling_steps=20, 24 | cfg_scale=4.0, 25 | ) 26 | dtype = "bf16" 27 | 28 | # Others 29 | batch_size = 2 30 | seed = 42 31 | prompt_path = "./assets/texts/imagenet_labels.txt" 32 | save_dir = "./samples/samples/" 33 | -------------------------------------------------------------------------------- /Modded_Open_Sora/configs/dit/train/16x256x256.py: -------------------------------------------------------------------------------- 1 | # Define dataset 2 | dataset = dict( 3 | type="VideoTextDataset", 4 | data_path=None, 5 | num_frames=16, 6 | frame_interval=3, 7 | image_size=(256, 256), 8 | ) 9 | 10 | # Define acceleration 11 | num_workers = 4 12 | dtype = "bf16" 13 | grad_checkpoint = True 14 | plugin = "zero2" 15 | sp_size = 1 16 | 17 | # Define model 18 | model = dict( 19 | type="DiT-XL/2", 20 | from_pretrained="DiT-XL-2-256x256.pt", 21 | enable_flash_attn=True, 22 | enable_layernorm_kernel=True, 23 | ) 24 | vae = dict( 25 | type="VideoAutoencoderKL", 26 | from_pretrained="stabilityai/sd-vae-ft-ema", 27 | ) 28 | text_encoder = dict( 29 | type="clip", 30 | from_pretrained="openai/clip-vit-base-patch32", 31 | model_max_length=77, 32 | ) 33 | scheduler = dict( 34 | type="iddpm", 35 | timestep_respacing="", 36 | ) 37 | 38 | # Others 39 | seed = 42 40 | outputs = "outputs" 41 | wandb = False 42 | 43 | epochs = 1000 44 | log_every = 10 45 | ckpt_every = 1000 46 | load = None 47 | 48 | batch_size = 8 49 | lr = 2e-5 50 | grad_clip = 1.0 51 | -------------------------------------------------------------------------------- /Modded_Open_Sora/configs/dit/train/1x256x256.py: -------------------------------------------------------------------------------- 1 | # Define dataset 2 | dataset = dict( 3 | type="VideoTextDataset", 4 | data_path=None, 5 | num_frames=1, 6 | frame_interval=1, 7 | image_size=(256, 256), 8 | transform_name="center", 9 | ) 10 | 11 | # Define acceleration 12 | num_workers = 4 13 | dtype = "bf16" 14 | grad_checkpoint = False 15 | plugin = "zero2" 16 | sp_size = 1 17 | 18 | # Define model 19 | model = dict( 20 | type="DiT-XL/2", 21 | no_temporal_pos_emb=True, 22 | enable_flash_attn=True, 23 | enable_layernorm_kernel=True, 24 | ) 25 | vae = dict( 26 | type="VideoAutoencoderKL", 27 | from_pretrained="stabilityai/sd-vae-ft-ema", 
28 | ) 29 | text_encoder = dict( 30 | type="clip", 31 | from_pretrained="openai/clip-vit-base-patch32", 32 | model_max_length=77, 33 | ) 34 | scheduler = dict( 35 | type="iddpm", 36 | timestep_respacing="", 37 | ) 38 | 39 | # Others 40 | seed = 42 41 | outputs = "outputs" 42 | wandb = False 43 | 44 | epochs = 1000 45 | log_every = 10 46 | ckpt_every = 1000 47 | load = None 48 | 49 | batch_size = 128 50 | lr = 1e-4 # according to DiT repo 51 | grad_clip = 1.0 52 | -------------------------------------------------------------------------------- /Modded_Open_Sora/configs/latte/inference/16x256x256-class.py: -------------------------------------------------------------------------------- 1 | num_frames = 16 2 | fps = 8 3 | image_size = (256, 256) 4 | 5 | # Define model 6 | model = dict( 7 | type="Latte-XL/2", 8 | condition="label_101", 9 | from_pretrained="Latte-XL-2-256x256-ucf101.pt", 10 | ) 11 | vae = dict( 12 | type="VideoAutoencoderKL", 13 | from_pretrained="stabilityai/sd-vae-ft-ema", 14 | ) 15 | text_encoder = dict( 16 | type="classes", 17 | num_classes=101, 18 | ) 19 | scheduler = dict( 20 | type="dpm-solver", 21 | num_sampling_steps=20, 22 | cfg_scale=4.0, 23 | ) 24 | dtype = "bf16" 25 | 26 | # Others 27 | batch_size = 2 28 | seed = 42 29 | prompt_path = "./assets/texts/ucf101_id.txt" 30 | save_dir = "./samples/samples/" 31 | -------------------------------------------------------------------------------- /Modded_Open_Sora/configs/latte/inference/16x256x256.py: -------------------------------------------------------------------------------- 1 | num_frames = 16 2 | fps = 8 3 | image_size = (256, 256) 4 | 5 | # Define model 6 | model = dict( 7 | type="Latte-XL/2", 8 | condition="text", 9 | from_pretrained="PRETRAINED_MODEL", 10 | ) 11 | vae = dict( 12 | type="VideoAutoencoderKL", 13 | from_pretrained="stabilityai/sd-vae-ft-ema", 14 | ) 15 | text_encoder = dict( 16 | type="clip", 17 | from_pretrained="openai/clip-vit-base-patch32", 18 | model_max_length=77, 19 | ) 20 | scheduler = dict( 21 | type="dpm-solver", 22 | num_sampling_steps=20, 23 | cfg_scale=4.0, 24 | ) 25 | dtype = "bf16" 26 | 27 | # Others 28 | batch_size = 2 29 | seed = 42 30 | prompt_path = "./assets/texts/ucf101_labels.txt" 31 | save_dir = "./samples/samples/" 32 | -------------------------------------------------------------------------------- /Modded_Open_Sora/configs/latte/train/16x256x256.py: -------------------------------------------------------------------------------- 1 | # Define dataset 2 | dataset = dict( 3 | type="VideoTextDataset", 4 | data_path=None, 5 | num_frames=16, 6 | frame_interval=3, 7 | image_size=(256, 256), 8 | ) 9 | 10 | # Define acceleration 11 | num_workers = 4 12 | dtype = "bf16" 13 | grad_checkpoint = True 14 | plugin = "zero2" 15 | sp_size = 1 16 | 17 | # Define model 18 | model = dict( 19 | type="Latte-XL/2", 20 | enable_flash_attn=True, 21 | enable_layernorm_kernel=True, 22 | ) 23 | vae = dict( 24 | type="VideoAutoencoderKL", 25 | from_pretrained="stabilityai/sd-vae-ft-ema", 26 | ) 27 | text_encoder = dict( 28 | type="clip", 29 | from_pretrained="openai/clip-vit-base-patch32", 30 | model_max_length=77, 31 | ) 32 | scheduler = dict( 33 | type="iddpm", 34 | timestep_respacing="", 35 | ) 36 | 37 | # Others 38 | seed = 42 39 | outputs = "outputs" 40 | wandb = False 41 | 42 | epochs = 1000 43 | log_every = 10 44 | ckpt_every = 1000 45 | load = None 46 | 47 | batch_size = 8 48 | lr = 2e-5 49 | grad_clip = 1.0 50 | -------------------------------------------------------------------------------- 
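Note on reading the configs that follow: every `configs/**/*.py` file in this dump is a plain Python module whose top-level assignments (`model`, `vae`, `text_encoder`, `scheduler`, `batch_size`, ...) define a run; the repository's actual loader lives in `opensora/utils/config_utils.py`. The snippet below is only a simplified, stdlib-only sketch — the `load_config` helper and the relative path are illustrative stand-ins, not part of the repo — showing how such a file can be executed into a dict, and how the `bucket_config` entries used by the variable-resolution training configs further down decode as resolution -> {num_frames: (keep probability, batch size)}; entries like `(0.0, None)` marked "No examples now" disable a bucket.

# Minimal sketch; not the project's loader (see opensora/utils/config_utils.py).
import runpy

def load_config(path: str) -> dict:
    """Execute a config .py file and keep its public top-level variables."""
    ns = runpy.run_path(path)
    return {k: v for k, v in ns.items() if not k.startswith("_")}

if __name__ == "__main__":
    # Illustrative path, relative to Modded_Open_Sora/.
    cfg = load_config("configs/opensora-v1-2/inference/sample.py")
    print(cfg["model"]["type"], cfg["scheduler"]["num_sampling_steps"])

    # bucket_config maps resolution -> {num_frames: (keep probability, batch size)}.
    bucket_config = {"240p": {16: (1.0, 16), 32: (1.0, 8)}, "480p": {16: (0.4, 4)}}
    for res, frames in bucket_config.items():
        for num_frames, (keep_prob, batch_size) in frames.items():
            print(f"{res}/{num_frames}f: keep {keep_prob:.0%}, batch size {batch_size}")

--------------------------------------------------------------------------------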
/Modded_Open_Sora/configs/opensora-v1-1/inference/sample.py: -------------------------------------------------------------------------------- 1 | num_frames = 16 2 | frame_interval = 3 3 | fps = 24 4 | image_size = (240, 426) 5 | multi_resolution = "STDiT2" 6 | 7 | # Define model 8 | model = dict( 9 | type="STDiT2-XL/2", 10 | from_pretrained="hpcai-tech/OpenSora-STDiT-v2-stage3", 11 | input_sq_size=512, 12 | qk_norm=True, 13 | qk_norm_legacy=True, 14 | enable_flash_attn=True, 15 | enable_layernorm_kernel=True, 16 | ) 17 | vae = dict( 18 | type="VideoAutoencoderKL", 19 | from_pretrained="stabilityai/sd-vae-ft-ema", 20 | cache_dir=None, # "/mnt/hdd/cached_models", 21 | micro_batch_size=4, 22 | ) 23 | text_encoder = dict( 24 | type="t5", 25 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 26 | cache_dir=None, # "/mnt/hdd/cached_models", 27 | model_max_length=200, 28 | ) 29 | scheduler = dict( 30 | type="iddpm", 31 | num_sampling_steps=100, 32 | cfg_scale=7.0, 33 | cfg_channel=3, # or None 34 | ) 35 | dtype = "bf16" 36 | 37 | # Condition 38 | prompt_path = "./assets/texts/t2v_samples.txt" 39 | prompt = None # prompt has higher priority than prompt_path 40 | 41 | # Others 42 | batch_size = 1 43 | seed = 42 44 | save_dir = "./samples/samples/" 45 | -------------------------------------------------------------------------------- /Modded_Open_Sora/configs/opensora-v1-1/train/image.py: -------------------------------------------------------------------------------- 1 | # Define dataset 2 | dataset = dict( 3 | type="VariableVideoTextDataset", 4 | data_path=None, 5 | num_frames=None, 6 | frame_interval=3, 7 | image_size=(None, None), 8 | transform_name="resize_crop", 9 | ) 10 | bucket_config = { # 6s/it 11 | "256": {1: (1.0, 256)}, 12 | "512": {1: (1.0, 80)}, 13 | "480p": {1: (1.0, 52)}, 14 | "1024": {1: (1.0, 20)}, 15 | "1080p": {1: (1.0, 8)}, 16 | } 17 | 18 | # Define acceleration 19 | num_workers = 4 20 | num_bucket_build_workers = 16 21 | dtype = "bf16" 22 | grad_checkpoint = True 23 | plugin = "zero2" 24 | sp_size = 1 25 | 26 | # Define model 27 | model = dict( 28 | type="STDiT2-XL/2", 29 | from_pretrained=None, 30 | input_sq_size=512, # pretrained model is trained on 512x512 31 | qk_norm=True, 32 | qk_norm_legacy=True, 33 | enable_flash_attn=True, 34 | enable_layernorm_kernel=True, 35 | ) 36 | vae = dict( 37 | type="VideoAutoencoderKL", 38 | from_pretrained="stabilityai/sd-vae-ft-ema", 39 | micro_batch_size=4, 40 | local_files_only=True, 41 | ) 42 | text_encoder = dict( 43 | type="t5", 44 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 45 | model_max_length=200, 46 | shardformer=True, 47 | local_files_only=True, 48 | ) 49 | scheduler = dict( 50 | type="iddpm", 51 | timestep_respacing="", 52 | ) 53 | 54 | # Others 55 | seed = 42 56 | outputs = "outputs" 57 | wandb = False 58 | 59 | epochs = 1000 60 | log_every = 10 61 | ckpt_every = 500 62 | load = None 63 | 64 | batch_size = 10 # only for logging 65 | lr = 2e-5 66 | grad_clip = 1.0 67 | -------------------------------------------------------------------------------- /Modded_Open_Sora/configs/opensora-v1-1/train/video.py: -------------------------------------------------------------------------------- 1 | # Define dataset 2 | dataset = dict( 3 | type="VariableVideoTextDataset", 4 | data_path=None, 5 | num_frames=None, 6 | frame_interval=3, 7 | image_size=(None, None), 8 | transform_name="resize_crop", 9 | ) 10 | bucket_config = { # 6s/it 11 | "240p": {16: (1.0, 16), 32: (1.0, 8), 64: (1.0, 4), 128: (1.0, 2)}, 12 | "256": {1: (1.0, 256)}, 13 | "512": {1: 
(0.5, 80)}, 14 | "480p": {1: (0.4, 52), 16: (0.4, 4), 32: (0.0, None)}, 15 | "720p": {16: (0.1, 2), 32: (0.0, None)}, # No examples now 16 | "1024": {1: (0.3, 20)}, 17 | "1080p": {1: (0.3, 8)}, 18 | } 19 | 20 | # Define acceleration 21 | num_workers = 4 22 | num_bucket_build_workers = 16 23 | dtype = "bf16" 24 | grad_checkpoint = True 25 | plugin = "zero2" 26 | sp_size = 1 27 | 28 | # Define model 29 | model = dict( 30 | type="STDiT2-XL/2", 31 | from_pretrained=None, 32 | input_sq_size=512, # pretrained model is trained on 512x512 33 | qk_norm=True, 34 | qk_norm_legacy=True, 35 | enable_flash_attn=True, 36 | enable_layernorm_kernel=True, 37 | ) 38 | vae = dict( 39 | type="VideoAutoencoderKL", 40 | from_pretrained="stabilityai/sd-vae-ft-ema", 41 | micro_batch_size=4, 42 | local_files_only=True, 43 | ) 44 | text_encoder = dict( 45 | type="t5", 46 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 47 | model_max_length=200, 48 | shardformer=True, 49 | local_files_only=True, 50 | ) 51 | scheduler = dict( 52 | type="iddpm", 53 | timestep_respacing="", 54 | ) 55 | 56 | # Others 57 | seed = 42 58 | outputs = "outputs" 59 | wandb = False 60 | 61 | epochs = 1000 62 | log_every = 10 63 | ckpt_every = 500 64 | load = None 65 | 66 | batch_size = 10 # only for logging 67 | lr = 2e-5 68 | grad_clip = 1.0 69 | -------------------------------------------------------------------------------- /Modded_Open_Sora/configs/opensora-v1-2/inference/sample.py: -------------------------------------------------------------------------------- 1 | resolution = "240p" 2 | aspect_ratio = "9:16" 3 | num_frames = 51 4 | fps = 24 5 | frame_interval = 1 6 | save_fps = 24 7 | 8 | save_dir = "./samples/samples/" 9 | seed = 42 10 | batch_size = 1 11 | multi_resolution = "STDiT2" 12 | dtype = "bf16" 13 | condition_frame_length = 5 14 | align = 5 15 | 16 | # prompt="a beautiful waterfall" 17 | 18 | model = dict( 19 | type="STDiT3-XL/2", 20 | from_pretrained="hpcai-tech/OpenSora-STDiT-v3", 21 | qk_norm=True, 22 | enable_flash_attn=False, 23 | enable_layernorm_kernel=False, 24 | force_huggingface=True, 25 | ) 26 | vae = dict( 27 | type="OpenSoraVAE_V1_2", 28 | from_pretrained="hpcai-tech/OpenSora-VAE-v1.2", 29 | micro_frame_size=17, 30 | micro_batch_size=4, 31 | force_huggingface=True, 32 | ) 33 | text_encoder = dict( 34 | type="t5", 35 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 36 | model_max_length=300, 37 | ) 38 | scheduler = dict( 39 | type="rflow", 40 | use_timestep_transform=True, 41 | num_sampling_steps=30, 42 | cfg_scale=7.0, 43 | ) 44 | 45 | aes = 6.5 46 | flow = None 47 | -------------------------------------------------------------------------------- /Modded_Open_Sora/configs/opensora-v1-2/inference/test_config.py: -------------------------------------------------------------------------------- 1 | resolution = "144p" 2 | aspect_ratio = "3:4" 3 | num_frames = 68 4 | fps = 24 5 | frame_interval = 1 6 | save_fps = 24 7 | verbose=2 8 | save_dir = "./samples/samples/" 9 | seed = 42 10 | batch_size = 1 11 | multi_resolution = "STDiT2" 12 | dtype = "fp16" 13 | condition_frame_length = 5 14 | align = 5 15 | 16 | model = dict( 17 | type="STDiT3-XL/2", 18 | from_pretrained="hpcai-tech/OpenSora-STDiT-v3", 19 | qk_norm=True, 20 | enable_flash_attn=False, 21 | enable_layernorm_kernel=False, 22 | force_huggingface=True, 23 | merge_steps=0 24 | ) 25 | vae = dict( 26 | type="OpenSoraVAE_V1_2", 27 | from_pretrained="hpcai-tech/OpenSora-VAE-v1.2", 28 | micro_frame_size=17, 29 | micro_batch_size=4, 30 | force_huggingface=True, 31 | ) 32 | 
text_encoder = dict( 33 | type="t5", 34 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 35 | model_max_length=300, 36 | ) 37 | scheduler = dict( 38 | type="rflow", 39 | use_timestep_transform=True, 40 | num_sampling_steps=30, 41 | cfg_scale=7.0, 42 | 43 | ) 44 | 45 | aes = 6.5 46 | flow = None 47 | 48 | # for ios debug 49 | flag = False 50 | phone_output = False 51 | 52 | # lpl_settings 53 | lpl_setting = 2 54 | -------------------------------------------------------------------------------- /Modded_Open_Sora/configs/opensora-v1-2/inference/vae_test.py: -------------------------------------------------------------------------------- 1 | resolution = "144p" 2 | aspect_ratio = "3:4" 3 | num_frames = 16 4 | fps = 7 5 | frame_interval = 1 6 | save_fps = 1 7 | 8 | save_dir = "./samples/samples/" 9 | seed = 42 10 | batch_size = 1 11 | multi_resolution = "STDiT2" 12 | dtype = "fp16" 13 | condition_frame_length = 5 14 | align = 5 15 | 16 | model = dict( 17 | type="STDiT3-XL/2", 18 | from_pretrained="hpcai-tech/OpenSora-STDiT-v3", 19 | qk_norm=True, 20 | enable_flash_attn=False, 21 | enable_layernorm_kernel=False, 22 | force_huggingface=True, 23 | ) 24 | vae = dict( 25 | type="OpenSoraVAE_V1_2", 26 | from_pretrained="hpcai-tech/OpenSora-VAE-v1.2", 27 | micro_frame_size=17, 28 | micro_batch_size=4, 29 | force_huggingface=True, 30 | ) 31 | text_encoder = dict( 32 | type="t5", 33 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 34 | model_max_length=300, 35 | ) 36 | scheduler = dict( 37 | type="rflow", 38 | use_timestep_transform=True, 39 | num_sampling_steps=30, 40 | cfg_scale=7.0, 41 | ) 42 | 43 | aes = 6.5 44 | flow = None 45 | -------------------------------------------------------------------------------- /Modded_Open_Sora/configs/opensora-v1-2/misc/eval_loss.py: -------------------------------------------------------------------------------- 1 | num_workers = 8 2 | dtype = "bf16" 3 | seed = 42 4 | num_eval_timesteps = 10 5 | 6 | # Dataset settings 7 | dataset = dict( 8 | type="VariableVideoTextDataset", 9 | transform_name="resize_crop", 10 | ) 11 | 12 | bucket_config = { 13 | "144p": {1: (None, 100), 51: (None, 30), 102: (None, 20), 204: (None, 8), 408: (None, 4)}, 14 | # --- 15 | "240p": {1: (None, 100), 51: (None, 24), 102: (None, 12), 204: (None, 4), 408: (None, 2)}, 16 | # --- 17 | "360p": {1: (None, 60), 51: (None, 12), 102: (None, 6), 204: (None, 2), 408: (None, 1)}, 18 | # --- 19 | "480p": {1: (None, 40), 51: (None, 6), 102: (None, 3), 204: (None, 1)}, 20 | # --- 21 | "720p": {1: (None, 20), 51: (None, 2), 102: (None, 1)}, 22 | # --- 23 | "1080p": {1: (None, 10)}, 24 | # --- 25 | "2048": {1: (None, 5)}, 26 | } 27 | 28 | # Model settings 29 | model = dict( 30 | type="STDiT3-XL/2", 31 | from_pretrained=None, 32 | qk_norm=True, 33 | enable_flash_attn=True, 34 | enable_layernorm_kernel=True, 35 | ) 36 | vae = dict( 37 | type="OpenSoraVAE_V1_2", 38 | from_pretrained="hpcai-tech/OpenSora-VAE-v1.2", 39 | micro_frame_size=17, 40 | micro_batch_size=4, 41 | local_files_only=True, 42 | ) 43 | text_encoder = dict( 44 | type="t5", 45 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 46 | model_max_length=300, 47 | local_files_only=True, 48 | ) 49 | scheduler = dict(type="rflow") 50 | -------------------------------------------------------------------------------- /Modded_Open_Sora/configs/opensora-v1-2/train/demo_360p.py: -------------------------------------------------------------------------------- 1 | # Dataset settings 2 | dataset = dict( 3 | type="VariableVideoTextDataset", 4 | transform_name="resize_crop", 5 | ) 
6 | 7 | # webvid 8 | bucket_config = {"360p": {102: (1.0, 5)}} 9 | grad_checkpoint = True 10 | 11 | # Acceleration settings 12 | num_workers = 8 13 | num_bucket_build_workers = 16 14 | dtype = "bf16" 15 | plugin = "zero2" 16 | 17 | # Model settings 18 | model = dict( 19 | type="STDiT3-XL/2", 20 | from_pretrained=None, 21 | qk_norm=True, 22 | enable_flash_attn=True, 23 | enable_layernorm_kernel=True, 24 | freeze_y_embedder=True, 25 | ) 26 | vae = dict( 27 | type="OpenSoraVAE_V1_2", 28 | from_pretrained="hpcai-tech/OpenSora-VAE-v1.2", 29 | micro_frame_size=17, 30 | micro_batch_size=4, 31 | ) 32 | text_encoder = dict( 33 | type="t5", 34 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 35 | model_max_length=300, 36 | shardformer=True, 37 | ) 38 | scheduler = dict( 39 | type="rflow", 40 | use_timestep_transform=True, 41 | sample_method="logit-normal", 42 | ) 43 | 44 | # Log settings 45 | seed = 42 46 | outputs = "outputs" 47 | wandb = False 48 | epochs = 1000 49 | log_every = 10 50 | ckpt_every = 200 51 | 52 | # optimization settings 53 | load = None 54 | grad_clip = 1.0 55 | lr = 1e-4 56 | ema_decay = 0.99 57 | adam_eps = 1e-15 58 | warmup_steps = 1000 59 | -------------------------------------------------------------------------------- /Modded_Open_Sora/configs/opensora-v1-2/train/demo_480p.py: -------------------------------------------------------------------------------- 1 | # Dataset settings 2 | dataset = dict( 3 | type="VariableVideoTextDataset", 4 | transform_name="resize_crop", 5 | ) 6 | 7 | # webvid 8 | bucket_config = {"480p": {51: (0.5, 5)}} 9 | grad_checkpoint = True 10 | 11 | # Acceleration settings 12 | num_workers = 8 13 | num_bucket_build_workers = 16 14 | dtype = "bf16" 15 | plugin = "zero2" 16 | 17 | # Model settings 18 | model = dict( 19 | type="STDiT3-XL/2", 20 | from_pretrained=None, 21 | qk_norm=True, 22 | enable_flash_attn=True, 23 | enable_layernorm_kernel=True, 24 | freeze_y_embedder=True, 25 | ) 26 | vae = dict( 27 | type="OpenSoraVAE_V1_2", 28 | from_pretrained="hpcai-tech/OpenSora-VAE-v1.2", 29 | micro_frame_size=17, 30 | micro_batch_size=4, 31 | ) 32 | text_encoder = dict( 33 | type="t5", 34 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 35 | model_max_length=300, 36 | shardformer=True, 37 | ) 38 | scheduler = dict( 39 | type="rflow", 40 | use_timestep_transform=True, 41 | sample_method="logit-normal", 42 | ) 43 | 44 | # Log settings 45 | seed = 42 46 | outputs = "outputs" 47 | wandb = False 48 | epochs = 1000 49 | log_every = 10 50 | ckpt_every = 200 51 | 52 | # optimization settings 53 | load = None 54 | grad_clip = 1.0 55 | lr = 1e-4 56 | ema_decay = 0.99 57 | adam_eps = 1e-15 58 | warmup_steps = 1000 59 | -------------------------------------------------------------------------------- /Modded_Open_Sora/configs/opensora-v1-2/train/stage1_feat.py: -------------------------------------------------------------------------------- 1 | # Dataset settings 2 | dataset = dict(type="BatchFeatureDataset") 3 | grad_checkpoint = True 4 | num_workers = 4 5 | 6 | # Acceleration settings 7 | dtype = "bf16" 8 | plugin = "zero2" 9 | 10 | # Model settings 11 | model = dict( 12 | type="STDiT3-XL/2", 13 | from_pretrained=None, 14 | qk_norm=True, 15 | enable_flash_attn=True, 16 | enable_layernorm_kernel=True, 17 | freeze_y_embedder=True, 18 | skip_y_embedder=True, 19 | ) 20 | scheduler = dict( 21 | type="rflow", 22 | use_timestep_transform=True, 23 | sample_method="logit-normal", 24 | ) 25 | 26 | vae_out_channels = 4 27 | model_max_length = 300 28 | text_encoder_output_dim = 4096 29 | 
load_video_features = True 30 | load_text_features = True 31 | 32 | # Mask settings 33 | mask_ratios = { 34 | "random": 0.2, 35 | "intepolate": 0.01, 36 | "quarter_random": 0.01, 37 | "quarter_head": 0.01, 38 | "quarter_tail": 0.01, 39 | "quarter_head_tail": 0.01, 40 | "image_random": 0.05, 41 | "image_head": 0.1, 42 | "image_tail": 0.05, 43 | "image_head_tail": 0.05, 44 | } 45 | 46 | # Log settings 47 | seed = 42 48 | outputs = "outputs" 49 | wandb = False 50 | epochs = 1000 51 | log_every = 10 52 | ckpt_every = 500 53 | 54 | # optimization settings 55 | load = None 56 | grad_clip = 1.0 57 | lr = 2e-4 58 | ema_decay = 0.99 59 | adam_eps = 1e-15 60 | -------------------------------------------------------------------------------- /Modded_Open_Sora/configs/opensora/inference/16x256x256.py: -------------------------------------------------------------------------------- 1 | num_frames = 16 2 | fps = 24 // 3 3 | image_size = (256, 256) 4 | 5 | # Define model 6 | model = dict( 7 | type="STDiT-XL/2", 8 | space_scale=0.5, 9 | time_scale=1.0, 10 | enable_flash_attn=True, 11 | enable_layernorm_kernel=True, 12 | from_pretrained="PRETRAINED_MODEL", 13 | ) 14 | vae = dict( 15 | type="VideoAutoencoderKL", 16 | from_pretrained="stabilityai/sd-vae-ft-ema", 17 | micro_batch_size=4, 18 | ) 19 | text_encoder = dict( 20 | type="t5", 21 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 22 | model_max_length=120, 23 | ) 24 | scheduler = dict( 25 | type="iddpm", 26 | num_sampling_steps=100, 27 | cfg_scale=7.0, 28 | cfg_channel=3, # or None 29 | ) 30 | dtype = "bf16" 31 | 32 | # Condition 33 | prompt_path = "./assets/texts/t2v_samples.txt" 34 | prompt = None # prompt has higher priority than prompt_path 35 | 36 | # Others 37 | batch_size = 1 38 | seed = 42 39 | save_dir = "./samples/samples/" 40 | -------------------------------------------------------------------------------- /Modded_Open_Sora/configs/opensora/inference/16x512x512-rflow.py: -------------------------------------------------------------------------------- 1 | num_frames = 16 2 | fps = 24 // 3 3 | image_size = (512, 512) 4 | 5 | # Define model 6 | model = dict( 7 | type="STDiT-XL/2", 8 | space_scale=1.0, 9 | time_scale=1.0, 10 | enable_flash_attn=True, 11 | enable_layernorm_kernel=True, 12 | from_pretrained="PRETRAINED_MODEL", 13 | ) 14 | vae = dict( 15 | type="VideoAutoencoderKL", 16 | from_pretrained="stabilityai/sd-vae-ft-ema", 17 | micro_batch_size=2, 18 | ) 19 | text_encoder = dict( 20 | type="t5", 21 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 22 | model_max_length=120, 23 | ) 24 | scheduler = dict( 25 | type="rflow", 26 | num_sampling_steps=10, 27 | cfg_scale=7.0, 28 | ) 29 | dtype = "bf16" 30 | 31 | # Others 32 | batch_size = 2 33 | seed = 42 34 | prompt_path = "./assets/texts/t2v_samples.txt" 35 | save_dir = "./outputs/samples/" 36 | -------------------------------------------------------------------------------- /Modded_Open_Sora/configs/opensora/inference/16x512x512.py: -------------------------------------------------------------------------------- 1 | num_frames = 16 2 | fps = 24 // 3 3 | image_size = (512, 512) 4 | 5 | # Define model 6 | model = dict( 7 | type="STDiT-XL/2", 8 | space_scale=1.0, 9 | time_scale=1.0, 10 | enable_flash_attn=True, 11 | enable_layernorm_kernel=True, 12 | from_pretrained="PRETRAINED_MODEL", 13 | ) 14 | vae = dict( 15 | type="VideoAutoencoderKL", 16 | from_pretrained="stabilityai/sd-vae-ft-ema", 17 | micro_batch_size=2, 18 | ) 19 | text_encoder = dict( 20 | type="t5", 21 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 
22 | model_max_length=120, 23 | ) 24 | scheduler = dict( 25 | type="iddpm", 26 | num_sampling_steps=100, 27 | cfg_scale=7.0, 28 | ) 29 | dtype = "bf16" 30 | 31 | # Others 32 | batch_size = 2 33 | seed = 42 34 | prompt_path = "./assets/texts/t2v_samples.txt" 35 | save_dir = "./samples/samples/" 36 | -------------------------------------------------------------------------------- /Modded_Open_Sora/configs/opensora/inference/64x512x512.py: -------------------------------------------------------------------------------- 1 | num_frames = 64 2 | fps = 24 // 2 3 | image_size = (512, 512) 4 | 5 | # Define model 6 | model = dict( 7 | type="STDiT-XL/2", 8 | space_scale=1.0, 9 | time_scale=2 / 3, 10 | enable_flash_attn=True, 11 | enable_layernorm_kernel=True, 12 | from_pretrained="PRETRAINED_MODEL", 13 | ) 14 | vae = dict( 15 | type="VideoAutoencoderKL", 16 | from_pretrained="stabilityai/sd-vae-ft-ema", 17 | micro_batch_size=128, 18 | ) 19 | text_encoder = dict( 20 | type="t5", 21 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 22 | model_max_length=120, 23 | ) 24 | scheduler = dict( 25 | type="iddpm", 26 | num_sampling_steps=100, 27 | cfg_scale=7.0, 28 | ) 29 | dtype = "bf16" 30 | 31 | # Others 32 | batch_size = 1 33 | seed = 42 34 | prompt_path = "./assets/texts/t2v_samples.txt" 35 | save_dir = "./samples/samples/" 36 | -------------------------------------------------------------------------------- /Modded_Open_Sora/configs/opensora/train/16x256x256-mask.py: -------------------------------------------------------------------------------- 1 | # Define dataset 2 | dataset = dict( 3 | type="VideoTextDataset", 4 | data_path=None, 5 | num_frames=16, 6 | frame_interval=3, 7 | image_size=(256, 256), 8 | ) 9 | 10 | # Define acceleration 11 | num_workers = 4 12 | dtype = "bf16" 13 | grad_checkpoint = True 14 | plugin = "zero2" 15 | sp_size = 1 16 | 17 | # Define model 18 | model = dict( 19 | type="STDiT-XL/2", 20 | space_scale=0.5, 21 | time_scale=1.0, 22 | from_pretrained="PixArt-XL-2-512x512.pth", 23 | enable_flash_attn=True, 24 | enable_layernorm_kernel=True, 25 | ) 26 | mask_ratios = { 27 | "identity": 0.7, 28 | "random": 0.15, 29 | "mask_head": 0.05, 30 | "mask_tail": 0.05, 31 | "mask_head_tail": 0.05, 32 | } 33 | vae = dict( 34 | type="VideoAutoencoderKL", 35 | from_pretrained="stabilityai/sd-vae-ft-ema", 36 | ) 37 | text_encoder = dict( 38 | type="t5", 39 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 40 | model_max_length=120, 41 | shardformer=True, 42 | ) 43 | scheduler = dict( 44 | type="iddpm", 45 | timestep_respacing="", 46 | ) 47 | 48 | # Others 49 | seed = 42 50 | outputs = "outputs" 51 | wandb = False 52 | 53 | epochs = 1000 54 | log_every = 10 55 | ckpt_every = 1000 56 | load = None 57 | 58 | batch_size = 8 59 | lr = 2e-5 60 | grad_clip = 1.0 61 | -------------------------------------------------------------------------------- /Modded_Open_Sora/configs/opensora/train/16x256x256-spee-rflow.py: -------------------------------------------------------------------------------- 1 | # Define dataset 2 | dataset = dict( 3 | type="VideoTextDataset", 4 | data_path=None, 5 | num_frames=16, 6 | frame_interval=3, 7 | image_size=(256, 256), 8 | ) 9 | 10 | # Define acceleration 11 | num_workers = 4 12 | dtype = "bf16" 13 | grad_checkpoint = True 14 | plugin = "zero2" 15 | sp_size = 1 16 | 17 | # Define model 18 | model = dict( 19 | type="STDiT-XL/2", 20 | space_scale=0.5, 21 | time_scale=1.0, 22 | # from_pretrained="PixArt-XL-2-512x512.pth", 23 | # from_pretrained = 
"/home/zhaowangbo/wangbo/PixArt-alpha/pretrained_models/OpenSora-v1-HQ-16x512x512.pth", 24 | # from_pretrained = "OpenSora-v1-HQ-16x512x512.pth", 25 | from_pretrained="PRETRAINED_MODEL", 26 | enable_flash_attn=True, 27 | enable_layernorm_kernel=True, 28 | ) 29 | # mask_ratios = [0.5, 0.29, 0.07, 0.07, 0.07] 30 | # mask_ratios = { 31 | # "identity": 0.9, 32 | # "random": 0.06, 33 | # "mask_head": 0.01, 34 | # "mask_tail": 0.01, 35 | # "mask_head_tail": 0.02, 36 | # } 37 | vae = dict( 38 | type="VideoAutoencoderKL", 39 | from_pretrained="stabilityai/sd-vae-ft-ema", 40 | ) 41 | text_encoder = dict( 42 | type="t5", 43 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 44 | model_max_length=120, 45 | shardformer=True, 46 | ) 47 | scheduler = dict( 48 | type="rflow", 49 | # timestep_respacing="", 50 | ) 51 | 52 | # Others 53 | seed = 42 54 | outputs = "outputs" 55 | wandb = True 56 | 57 | epochs = 1 58 | log_every = 10 59 | ckpt_every = 1000 60 | load = None 61 | 62 | batch_size = 16 63 | lr = 2e-5 64 | grad_clip = 1.0 65 | -------------------------------------------------------------------------------- /Modded_Open_Sora/configs/opensora/train/16x256x256-spee.py: -------------------------------------------------------------------------------- 1 | # Define dataset 2 | dataset = dict( 3 | type="VideoTextDataset", 4 | data_path=None, 5 | num_frames=16, 6 | frame_interval=3, 7 | image_size=(256, 256), 8 | ) 9 | 10 | # Define acceleration 11 | num_workers = 4 12 | dtype = "bf16" 13 | grad_checkpoint = True 14 | plugin = "zero2" 15 | sp_size = 1 16 | 17 | # Define model 18 | model = dict( 19 | type="STDiT-XL/2", 20 | space_scale=0.5, 21 | time_scale=1.0, 22 | from_pretrained="PixArt-XL-2-512x512.pth", 23 | enable_flash_attn=True, 24 | enable_layernorm_kernel=True, 25 | ) 26 | mask_ratios = { 27 | "identity": 0.5, 28 | "random": 0.29, 29 | "mask_head": 0.07, 30 | "mask_tail": 0.07, 31 | "mask_head_tail": 0.07, 32 | } 33 | vae = dict( 34 | type="VideoAutoencoderKL", 35 | from_pretrained="stabilityai/sd-vae-ft-ema", 36 | ) 37 | text_encoder = dict( 38 | type="t5", 39 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 40 | model_max_length=120, 41 | shardformer=True, 42 | ) 43 | scheduler = dict( 44 | type="iddpm-speed", 45 | timestep_respacing="", 46 | ) 47 | 48 | # Others 49 | seed = 42 50 | outputs = "outputs" 51 | wandb = False 52 | 53 | epochs = 1000 54 | log_every = 10 55 | ckpt_every = 1000 56 | load = None 57 | 58 | batch_size = 8 59 | lr = 2e-5 60 | grad_clip = 1.0 61 | -------------------------------------------------------------------------------- /Modded_Open_Sora/configs/opensora/train/16x256x256.py: -------------------------------------------------------------------------------- 1 | # Define dataset 2 | dataset = dict( 3 | type="VideoTextDataset", 4 | data_path=None, 5 | num_frames=16, 6 | frame_interval=3, 7 | image_size=(256, 256), 8 | ) 9 | 10 | # Define acceleration 11 | num_workers = 0 12 | dtype = "bf16" 13 | grad_checkpoint = True 14 | plugin = "zero2" 15 | sp_size = 1 16 | 17 | # Define model 18 | model = dict( 19 | type="STDiT-XL/2", 20 | space_scale=0.5, 21 | time_scale=1.0, 22 | from_pretrained="PixArt-XL-2-512x512.pth", 23 | enable_flash_attn=True, 24 | enable_layernorm_kernel=True, 25 | ) 26 | vae = dict( 27 | type="VideoAutoencoderKL", 28 | from_pretrained="stabilityai/sd-vae-ft-ema", 29 | ) 30 | text_encoder = dict( 31 | type="t5", 32 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 33 | model_max_length=120, 34 | shardformer=True, 35 | ) 36 | scheduler = dict( 37 | type="iddpm", 38 | 
timestep_respacing="", 39 | ) 40 | 41 | # Others 42 | seed = 42 43 | outputs = "outputs" 44 | wandb = False 45 | 46 | epochs = 1000 47 | log_every = 10 48 | ckpt_every = 1000 49 | load = None 50 | 51 | batch_size = 8 52 | lr = 2e-5 53 | grad_clip = 1.0 54 | -------------------------------------------------------------------------------- /Modded_Open_Sora/configs/opensora/train/16x512x512.py: -------------------------------------------------------------------------------- 1 | # Define dataset 2 | dataset = dict( 3 | type="VideoTextDataset", 4 | data_path=None, 5 | num_frames=16, 6 | frame_interval=3, 7 | image_size=(512, 512), 8 | ) 9 | 10 | # Define acceleration 11 | num_workers = 4 12 | dtype = "bf16" 13 | grad_checkpoint = True 14 | plugin = "zero2" 15 | sp_size = 1 16 | 17 | # Define model 18 | model = dict( 19 | type="STDiT-XL/2", 20 | space_scale=1.0, 21 | time_scale=1.0, 22 | from_pretrained=None, 23 | enable_flash_attn=True, 24 | enable_layernorm_kernel=True, 25 | ) 26 | vae = dict( 27 | type="VideoAutoencoderKL", 28 | from_pretrained="stabilityai/sd-vae-ft-ema", 29 | micro_batch_size=128, 30 | ) 31 | text_encoder = dict( 32 | type="t5", 33 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 34 | model_max_length=120, 35 | shardformer=True, 36 | ) 37 | scheduler = dict( 38 | type="iddpm", 39 | timestep_respacing="", 40 | ) 41 | 42 | # Others 43 | seed = 42 44 | outputs = "outputs" 45 | wandb = False 46 | 47 | epochs = 1000 48 | log_every = 10 49 | ckpt_every = 500 50 | load = None 51 | 52 | batch_size = 8 53 | lr = 2e-5 54 | grad_clip = 1.0 55 | -------------------------------------------------------------------------------- /Modded_Open_Sora/configs/opensora/train/360x512x512.py: -------------------------------------------------------------------------------- 1 | # Define dataset 2 | dataset = dict( 3 | type="VideoTextDataset", 4 | data_path=None, 5 | num_frames=360, 6 | frame_interval=3, 7 | image_size=(512, 512), 8 | ) 9 | 10 | # Define acceleration 11 | num_workers = 4 12 | dtype = "bf16" 13 | grad_checkpoint = True 14 | plugin = "zero2" 15 | sp_size = 1 16 | 17 | # Define acceleration 18 | dtype = "bf16" 19 | grad_checkpoint = True 20 | plugin = "zero2-seq" 21 | sp_size = 2 22 | 23 | # Define model 24 | model = dict( 25 | type="STDiT-XL/2", 26 | space_scale=1.0, 27 | time_scale=2 / 3, 28 | from_pretrained=None, 29 | enable_flash_attn=True, 30 | enable_layernorm_kernel=True, 31 | enable_sequence_parallelism=True, # enable sq here 32 | ) 33 | vae = dict( 34 | type="VideoAutoencoderKL", 35 | from_pretrained="stabilityai/sd-vae-ft-ema", 36 | micro_batch_size=128, 37 | ) 38 | text_encoder = dict( 39 | type="t5", 40 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 41 | model_max_length=120, 42 | shardformer=True, 43 | ) 44 | scheduler = dict( 45 | type="iddpm", 46 | timestep_respacing="", 47 | ) 48 | 49 | # Others 50 | seed = 42 51 | outputs = "outputs" 52 | wandb = False 53 | 54 | epochs = 1000 55 | log_every = 10 56 | ckpt_every = 250 57 | load = None 58 | 59 | batch_size = 1 60 | lr = 2e-5 61 | grad_clip = 1.0 62 | -------------------------------------------------------------------------------- /Modded_Open_Sora/configs/opensora/train/64x512x512-sp.py: -------------------------------------------------------------------------------- 1 | # Define dataset 2 | dataset = dict( 3 | type="VideoTextDataset", 4 | data_path=None, 5 | num_frames=16, 6 | frame_interval=3, 7 | image_size=(512, 512), 8 | ) 9 | 10 | # Define acceleration 11 | num_workers = 4 12 | dtype = "bf16" 13 | grad_checkpoint = True 14 | 
plugin = "zero2" 15 | sp_size = 2 16 | 17 | # Define model 18 | model = dict( 19 | type="STDiT-XL/2", 20 | space_scale=1.0, 21 | time_scale=2 / 3, 22 | from_pretrained=None, 23 | enable_flash_attn=True, 24 | enable_layernorm_kernel=True, 25 | enable_sequence_parallelism=True, # enable sq here 26 | ) 27 | vae = dict( 28 | type="VideoAutoencoderKL", 29 | from_pretrained="stabilityai/sd-vae-ft-ema", 30 | ) 31 | text_encoder = dict( 32 | type="t5", 33 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 34 | model_max_length=120, 35 | shardformer=True, 36 | ) 37 | scheduler = dict( 38 | type="iddpm", 39 | timestep_respacing="", 40 | ) 41 | 42 | # Others 43 | seed = 42 44 | outputs = "outputs" 45 | wandb = False 46 | 47 | epochs = 1000 48 | log_every = 10 49 | ckpt_every = 1000 50 | load = None 51 | 52 | batch_size = 1 53 | lr = 2e-5 54 | grad_clip = 1.0 55 | -------------------------------------------------------------------------------- /Modded_Open_Sora/configs/opensora/train/64x512x512.py: -------------------------------------------------------------------------------- 1 | # Define dataset 2 | dataset = dict( 3 | type="VideoTextDataset", 4 | data_path=None, 5 | num_frames=64, 6 | frame_interval=3, 7 | image_size=(512, 512), 8 | ) 9 | 10 | # Define acceleration 11 | num_workers = 4 12 | dtype = "bf16" 13 | grad_checkpoint = True 14 | plugin = "zero2" 15 | sp_size = 1 16 | 17 | # Define model 18 | model = dict( 19 | type="STDiT-XL/2", 20 | space_scale=1.0, 21 | time_scale=2 / 3, 22 | from_pretrained=None, 23 | enable_flash_attn=True, 24 | enable_layernorm_kernel=True, 25 | ) 26 | vae = dict( 27 | type="VideoAutoencoderKL", 28 | from_pretrained="stabilityai/sd-vae-ft-ema", 29 | micro_batch_size=64, 30 | ) 31 | text_encoder = dict( 32 | type="t5", 33 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 34 | model_max_length=120, 35 | shardformer=True, 36 | ) 37 | scheduler = dict( 38 | type="iddpm", 39 | timestep_respacing="", 40 | ) 41 | 42 | # Others 43 | seed = 42 44 | outputs = "outputs" 45 | wandb = False 46 | 47 | epochs = 1000 48 | log_every = 10 49 | ckpt_every = 250 50 | load = None 51 | 52 | batch_size = 4 53 | lr = 2e-5 54 | grad_clip = 1.0 55 | -------------------------------------------------------------------------------- /Modded_Open_Sora/configs/pixart/inference/16x256x256.py: -------------------------------------------------------------------------------- 1 | num_frames = 16 2 | fps = 8 3 | image_size = (256, 256) 4 | 5 | # Define model 6 | model = dict( 7 | type="PixArt-XL/2", 8 | space_scale=0.5, 9 | time_scale=1.0, 10 | from_pretrained="outputs/098-F16S3-PixArt-XL-2/epoch7-global_step30000/model_ckpt.pt", 11 | ) 12 | vae = dict( 13 | type="VideoAutoencoderKL", 14 | from_pretrained="stabilityai/sd-vae-ft-ema", 15 | ) 16 | text_encoder = dict( 17 | type="t5", 18 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 19 | model_max_length=120, 20 | ) 21 | scheduler = dict( 22 | type="dpm-solver", 23 | num_sampling_steps=20, 24 | cfg_scale=7.0, 25 | ) 26 | dtype = "bf16" 27 | 28 | # Others 29 | batch_size = 2 30 | seed = 42 31 | prompt_path = "./assets/texts/t2v_samples.txt" 32 | save_dir = "./samples/samples/" 33 | -------------------------------------------------------------------------------- /Modded_Open_Sora/configs/pixart/inference/1x1024MS.py: -------------------------------------------------------------------------------- 1 | num_frames = 1 2 | fps = 1 3 | image_size = (1920, 512) 4 | multi_resolution = "PixArtMS" 5 | 6 | # Define model 7 | model = dict( 8 | type="PixArtMS-XL/2", 9 | space_scale=2.0, 
10 | time_scale=1.0, 11 | no_temporal_pos_emb=True, 12 | from_pretrained="PixArt-XL-2-1024-MS.pth", 13 | ) 14 | vae = dict( 15 | type="VideoAutoencoderKL", 16 | from_pretrained="stabilityai/sd-vae-ft-ema", 17 | ) 18 | text_encoder = dict( 19 | type="t5", 20 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 21 | model_max_length=120, 22 | ) 23 | scheduler = dict( 24 | type="dpm-solver", 25 | num_sampling_steps=20, 26 | cfg_scale=7.0, 27 | ) 28 | dtype = "bf16" 29 | 30 | # Others 31 | batch_size = 2 32 | seed = 42 33 | prompt_path = "./assets/texts/t2i_samples.txt" 34 | save_dir = "./samples/samples/" 35 | -------------------------------------------------------------------------------- /Modded_Open_Sora/configs/pixart/inference/1x20481B.py: -------------------------------------------------------------------------------- 1 | num_frames = 1 2 | fps = 1 3 | image_size = (2560, 1536) 4 | # image_size = (2048, 2048) 5 | 6 | model = dict( 7 | type="PixArt-1B/2", 8 | from_pretrained="PixArt-1B-2.pth", 9 | space_scale=4, 10 | no_temporal_pos_emb=True, 11 | enable_flash_attn=True, 12 | enable_layernorm_kernel=True, 13 | base_size=2048 // 8, 14 | ) 15 | vae = dict( 16 | type="VideoAutoencoderKL", 17 | from_pretrained="PixArt-alpha/pixart_sigma_sdxlvae_T5_diffusers", 18 | subfolder="vae", 19 | ) 20 | text_encoder = dict( 21 | type="t5", 22 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 23 | model_max_length=300, 24 | ) 25 | scheduler = dict( 26 | type="dpm-solver", 27 | num_sampling_steps=14, 28 | cfg_scale=4.5, 29 | ) 30 | dtype = "bf16" 31 | 32 | # Others 33 | batch_size = 1 34 | seed = 42 35 | prompt_path = "./assets/texts/t2i_sigma.txt" 36 | save_dir = "./samples/samples/" 37 | -------------------------------------------------------------------------------- /Modded_Open_Sora/configs/pixart/inference/1x2048MS.py: -------------------------------------------------------------------------------- 1 | num_frames = 1 2 | fps = 1 3 | image_size = (2560, 1536) 4 | # image_size = (2048, 2048) 5 | 6 | model = dict( 7 | type="PixArt-XL/2", 8 | from_pretrained="PixArt-Sigma-XL-2-2K-MS.pth", 9 | space_scale=4, 10 | no_temporal_pos_emb=True, 11 | enable_flash_attn=True, 12 | enable_layernorm_kernel=True, 13 | base_size=2048 // 8, 14 | ) 15 | vae = dict( 16 | type="VideoAutoencoderKL", 17 | from_pretrained="PixArt-alpha/pixart_sigma_sdxlvae_T5_diffusers", 18 | subfolder="vae", 19 | ) 20 | text_encoder = dict( 21 | type="t5", 22 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 23 | model_max_length=300, 24 | ) 25 | scheduler = dict( 26 | type="dpm-solver", 27 | num_sampling_steps=14, 28 | cfg_scale=4.5, 29 | ) 30 | dtype = "bf16" 31 | 32 | # Others 33 | batch_size = 1 34 | seed = 42 35 | prompt_path = "./assets/texts/t2i_sigma.txt" 36 | save_dir = "./samples/samples/" 37 | -------------------------------------------------------------------------------- /Modded_Open_Sora/configs/pixart/inference/1x256x256.py: -------------------------------------------------------------------------------- 1 | num_frames = 1 2 | fps = 1 3 | image_size = (256, 256) 4 | 5 | # Define model 6 | model = dict( 7 | type="PixArt-XL/2", 8 | space_scale=1.0, 9 | time_scale=1.0, 10 | no_temporal_pos_emb=True, 11 | from_pretrained="PixArt-XL-2-256x256.pth", 12 | ) 13 | vae = dict( 14 | type="VideoAutoencoderKL", 15 | from_pretrained="stabilityai/sd-vae-ft-ema", 16 | ) 17 | text_encoder = dict( 18 | type="t5", 19 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 20 | model_max_length=120, 21 | ) 22 | scheduler = dict( 23 | type="dpm-solver", 24 | num_sampling_steps=20, 
25 | cfg_scale=7.0, 26 | ) 27 | dtype = "bf16" 28 | 29 | # Others 30 | batch_size = 2 31 | seed = 42 32 | prompt_path = "./assets/texts/t2i_samples.txt" 33 | save_dir = "./samples/samples/" 34 | -------------------------------------------------------------------------------- /Modded_Open_Sora/configs/pixart/inference/1x512x512-rflow.py: -------------------------------------------------------------------------------- 1 | num_frames = 1 2 | fps = 1 3 | image_size = (512, 512) 4 | 5 | # Define model 6 | model = dict( 7 | type="PixArt-XL/2", 8 | space_scale=1.0, 9 | time_scale=1.0, 10 | no_temporal_pos_emb=True, 11 | from_pretrained="PRETRAINED_MODEL", 12 | ) 13 | vae = dict( 14 | type="VideoAutoencoderKL", 15 | from_pretrained="stabilityai/sd-vae-ft-ema", 16 | ) 17 | text_encoder = dict( 18 | type="t5", 19 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 20 | model_max_length=120, 21 | ) 22 | scheduler = dict( 23 | type="rflow", 24 | num_sampling_steps=20, 25 | cfg_scale=7.0, 26 | ) 27 | dtype = "bf16" 28 | 29 | # prompt_path = "./assets/texts/t2i_samples.txt" 30 | prompt = [ 31 | "Pirate ship trapped in a cosmic maelstrom nebula.", 32 | "A small cactus with a happy face in the Sahara desert.", 33 | "A small cactus with a sad face in the Sahara desert.", 34 | ] 35 | 36 | # Others 37 | batch_size = 2 38 | seed = 42 39 | save_dir = "./outputs/samples2/" 40 | -------------------------------------------------------------------------------- /Modded_Open_Sora/configs/pixart/inference/1x512x512.py: -------------------------------------------------------------------------------- 1 | num_frames = 1 2 | fps = 1 3 | image_size = (512, 512) 4 | 5 | # Define model 6 | model = dict( 7 | type="PixArt-XL/2", 8 | space_scale=1.0, 9 | time_scale=1.0, 10 | no_temporal_pos_emb=True, 11 | from_pretrained="PixArt-XL-2-512x512.pth", 12 | ) 13 | vae = dict( 14 | type="VideoAutoencoderKL", 15 | from_pretrained="stabilityai/sd-vae-ft-ema", 16 | ) 17 | text_encoder = dict( 18 | type="t5", 19 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 20 | model_max_length=120, 21 | ) 22 | scheduler = dict( 23 | type="dpm-solver", 24 | num_sampling_steps=20, 25 | cfg_scale=7.0, 26 | ) 27 | dtype = "bf16" 28 | 29 | # prompt_path = "./assets/texts/t2i_samples.txt" 30 | prompt = [ 31 | "Pirate ship trapped in a cosmic maelstrom nebula.", 32 | "A small cactus with a happy face in the Sahara desert.", 33 | "A small cactus with a sad face in the Sahara desert.", 34 | ] 35 | 36 | # Others 37 | batch_size = 2 38 | seed = 42 39 | save_dir = "./samples/samples/" 40 | -------------------------------------------------------------------------------- /Modded_Open_Sora/configs/pixart/train/16x256x256.py: -------------------------------------------------------------------------------- 1 | # Define dataset 2 | dataset = dict( 3 | type="VideoTextDataset", 4 | data_path=None, 5 | num_frames=16, 6 | frame_interval=3, 7 | image_size=(256, 256), 8 | ) 9 | 10 | # Define acceleration 11 | num_workers = 4 12 | dtype = "bf16" 13 | grad_checkpoint = True 14 | plugin = "zero2" 15 | sp_size = 1 16 | 17 | # Define model 18 | model = dict( 19 | type="PixArt-XL/2", 20 | space_scale=0.5, 21 | time_scale=1.0, 22 | from_pretrained="PixArt-XL-2-512x512.pth", 23 | enable_flash_attn=True, 24 | enable_layernorm_kernel=True, 25 | ) 26 | vae = dict( 27 | type="VideoAutoencoderKL", 28 | from_pretrained="stabilityai/sd-vae-ft-ema", 29 | ) 30 | text_encoder = dict( 31 | type="t5", 32 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 33 | model_max_length=120, 34 | shardformer=True, 35 | ) 36 
| scheduler = dict( 37 | type="iddpm", 38 | timestep_respacing="", 39 | ) 40 | 41 | # Others 42 | seed = 42 43 | outputs = "outputs" 44 | wandb = False 45 | 46 | epochs = 1000 47 | log_every = 10 48 | ckpt_every = 1000 49 | load = None 50 | 51 | batch_size = 8 52 | lr = 2e-5 53 | grad_clip = 1.0 54 | -------------------------------------------------------------------------------- /Modded_Open_Sora/configs/pixart/train/1x2048x2048.py: -------------------------------------------------------------------------------- 1 | # Define dataset 2 | dataset = dict( 3 | type="VideoTextDataset", 4 | data_path="/home/zhaowangbo/data/csv/image-v1_1_ext_noempty_rcp_clean_info.csv", 5 | num_frames=1, 6 | frame_interval=3, 7 | image_size=(2048, 2048), 8 | ) 9 | 10 | # Define acceleration 11 | num_workers = 4 12 | dtype = "bf16" 13 | grad_checkpoint = True 14 | plugin = "zero2" 15 | sp_size = 1 16 | 17 | # Define model 18 | model = dict( 19 | type="PixArt-1B/2", 20 | space_scale=4.0, 21 | no_temporal_pos_emb=True, 22 | from_pretrained="PixArt-1B-2.pth", 23 | enable_flash_attn=True, 24 | enable_layernorm_kernel=True, 25 | ) 26 | 27 | vae = dict( 28 | type="VideoAutoencoderKL", 29 | from_pretrained="PixArt-alpha/pixart_sigma_sdxlvae_T5_diffusers", 30 | subfolder="vae", 31 | ) 32 | text_encoder = dict( 33 | type="t5", 34 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 35 | model_max_length=300, 36 | ) 37 | scheduler = dict( 38 | type="iddpm", 39 | timestep_respacing="", 40 | ) 41 | 42 | # Others 43 | seed = 42 44 | outputs = "outputs" 45 | wandb = False 46 | 47 | epochs = 1000 48 | log_every = 10 49 | ckpt_every = 1000 50 | load = None 51 | 52 | batch_size = 4 53 | lr = 2e-5 54 | grad_clip = 1.0 55 | -------------------------------------------------------------------------------- /Modded_Open_Sora/configs/pixart/train/1x512x512-rflow.py: -------------------------------------------------------------------------------- 1 | # Define dataset 2 | dataset = dict( 3 | type="VideoTextDataset", 4 | data_path=None, 5 | num_frames=1, 6 | frame_interval=3, 7 | image_size=(512, 512), 8 | ) 9 | 10 | # Define acceleration 11 | num_workers = 4 12 | dtype = "bf16" 13 | grad_checkpoint = True 14 | plugin = "zero2" 15 | sp_size = 1 16 | 17 | # Define model 18 | model = dict( 19 | type="PixArt-XL/2", 20 | space_scale=1.0, 21 | time_scale=1.0, 22 | no_temporal_pos_emb=True, 23 | # from_pretrained="PixArt-XL-2-512x512.pth", 24 | from_pretrained="PRETRAINED_MODEL", 25 | enable_flash_attn=True, 26 | enable_layernorm_kernel=True, 27 | ) 28 | vae = dict( 29 | type="VideoAutoencoderKL", 30 | from_pretrained="stabilityai/sd-vae-ft-ema", 31 | ) 32 | text_encoder = dict( 33 | type="t5", 34 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 35 | model_max_length=120, 36 | shardformer=True, 37 | ) 38 | scheduler = dict( 39 | type="rflow", 40 | # timestep_respacing="", 41 | ) 42 | 43 | # Others 44 | seed = 42 45 | outputs = "outputs" 46 | wandb = True 47 | 48 | epochs = 2 49 | log_every = 10 50 | ckpt_every = 1000 51 | load = None 52 | 53 | batch_size = 64 54 | lr = 2e-5 55 | grad_clip = 1.0 56 | -------------------------------------------------------------------------------- /Modded_Open_Sora/configs/pixart/train/1x512x512.py: -------------------------------------------------------------------------------- 1 | # Define dataset 2 | dataset = dict( 3 | type="VideoTextDataset", 4 | data_path=None, 5 | num_frames=1, 6 | frame_interval=3, 7 | image_size=(512, 512), 8 | ) 9 | 10 | # Define acceleration 11 | num_workers = 4 12 | dtype = "bf16" 13 | 
grad_checkpoint = True 14 | plugin = "zero2" 15 | sp_size = 1 16 | 17 | # Define model 18 | model = dict( 19 | type="PixArt-XL/2", 20 | space_scale=1.0, 21 | time_scale=1.0, 22 | no_temporal_pos_emb=True, 23 | from_pretrained="PixArt-XL-2-512x512.pth", 24 | enable_flash_attn=True, 25 | enable_layernorm_kernel=True, 26 | ) 27 | vae = dict( 28 | type="VideoAutoencoderKL", 29 | from_pretrained="stabilityai/sd-vae-ft-ema", 30 | ) 31 | text_encoder = dict( 32 | type="t5", 33 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 34 | model_max_length=120, 35 | shardformer=True, 36 | ) 37 | scheduler = dict( 38 | type="iddpm", 39 | timestep_respacing="", 40 | ) 41 | 42 | # Others 43 | seed = 42 44 | outputs = "outputs" 45 | wandb = False 46 | 47 | epochs = 1000 48 | log_every = 10 49 | ckpt_every = 1000 50 | load = None 51 | 52 | batch_size = 32 53 | lr = 2e-5 54 | grad_clip = 1.0 55 | -------------------------------------------------------------------------------- /Modded_Open_Sora/configs/pixart/train/64x512x512.py: -------------------------------------------------------------------------------- 1 | # Define dataset 2 | dataset = dict( 3 | type="VideoTextDataset", 4 | data_path=None, 5 | num_frames=64, 6 | frame_interval=3, 7 | image_size=(256, 256), 8 | ) 9 | 10 | # Define acceleration 11 | num_workers = 4 12 | dtype = "bf16" 13 | grad_checkpoint = True 14 | plugin = "zero2" 15 | sp_size = 1 16 | 17 | 18 | # Define model 19 | model = dict( 20 | type="PixArt-XL/2", 21 | space_scale=1.0, 22 | time_scale=2 / 3, 23 | from_pretrained=None, 24 | enable_flash_attn=True, 25 | enable_layernorm_kernel=True, 26 | ) 27 | vae = dict( 28 | type="VideoAutoencoderKL", 29 | from_pretrained="stabilityai/sd-vae-ft-ema", 30 | micro_batch_size=128, 31 | ) 32 | text_encoder = dict( 33 | type="t5", 34 | from_pretrained="DeepFloyd/t5-v1_1-xxl", 35 | model_max_length=120, 36 | shardformer=True, 37 | ) 38 | scheduler = dict( 39 | type="iddpm", 40 | timestep_respacing="", 41 | ) 42 | 43 | # Others 44 | seed = 42 45 | outputs = "outputs" 46 | wandb = False 47 | 48 | epochs = 1000 49 | log_every = 10 50 | ckpt_every = 250 51 | load = None 52 | 53 | batch_size = 4 54 | lr = 2e-5 55 | grad_clip = 1.0 56 | -------------------------------------------------------------------------------- /Modded_Open_Sora/configs/vae/inference/image.py: -------------------------------------------------------------------------------- 1 | image_size = (256, 256) 2 | num_frames = 1 3 | 4 | dtype = "bf16" 5 | batch_size = 1 6 | seed = 42 7 | save_dir = "samples/vae_video" 8 | cal_stats = True 9 | log_stats_every = 100 10 | 11 | # Define dataset 12 | dataset = dict( 13 | type="VideoTextDataset", 14 | data_path=None, 15 | num_frames=num_frames, 16 | image_size=image_size, 17 | ) 18 | num_samples = 100 19 | num_workers = 4 20 | 21 | # Define model 22 | model = dict( 23 | type="OpenSoraVAE_V1_2", 24 | from_pretrained="hpcai-tech/OpenSora-VAE-v1.2", 25 | micro_frame_size=None, 26 | micro_batch_size=4, 27 | cal_loss=True, 28 | ) 29 | 30 | # loss weights 31 | perceptual_loss_weight = 0.1 # use vgg is not None and more than 0 32 | kl_loss_weight = 1e-6 33 | -------------------------------------------------------------------------------- /Modded_Open_Sora/configs/vae/inference/video.py: -------------------------------------------------------------------------------- 1 | image_size = (256, 256) 2 | num_frames = 51 3 | 4 | dtype = "bf16" 5 | batch_size = 1 6 | seed = 42 7 | save_dir = "samples/vae_video" 8 | cal_stats = True 9 | log_stats_every = 100 10 | 11 | # Define 
dataset 12 | dataset = dict( 13 | type="VideoTextDataset", 14 | data_path=None, 15 | num_frames=num_frames, 16 | image_size=image_size, 17 | ) 18 | num_samples = 100 19 | num_workers = 4 20 | 21 | # Define model 22 | model = dict( 23 | type="OpenSoraVAE_V1_2", 24 | from_pretrained="hpcai-tech/OpenSora-VAE-v1.2", 25 | micro_frame_size=None, 26 | micro_batch_size=4, 27 | cal_loss=True, 28 | ) 29 | 30 | # loss weights 31 | perceptual_loss_weight = 0.1 # use vgg is not None and more than 0 32 | kl_loss_weight = 1e-6 33 | -------------------------------------------------------------------------------- /Modded_Open_Sora/configs/vae/train/stage1.py: -------------------------------------------------------------------------------- 1 | num_frames = 17 2 | image_size = (256, 256) 3 | 4 | # Define dataset 5 | dataset = dict( 6 | type="VideoTextDataset", 7 | data_path=None, 8 | num_frames=num_frames, 9 | frame_interval=1, 10 | image_size=image_size, 11 | ) 12 | 13 | # Define acceleration 14 | num_workers = 16 15 | dtype = "bf16" 16 | grad_checkpoint = True 17 | plugin = "zero2" 18 | 19 | # Define model 20 | model = dict( 21 | type="OpenSoraVAE_V1_2", 22 | freeze_vae_2d=True, 23 | from_pretrained=None, 24 | cal_loss=True, 25 | ) 26 | 27 | # loss weights 28 | perceptual_loss_weight = 0.1 # use vgg is not None and more than 0 29 | kl_loss_weight = 1e-6 30 | 31 | mixed_strategy = "mixed_video_image" 32 | mixed_image_ratio = 0.2 33 | use_real_rec_loss = False 34 | use_z_rec_loss = True 35 | use_image_identity_loss = True 36 | 37 | # Others 38 | seed = 42 39 | outputs = "outputs/vae_stage1" 40 | wandb = False 41 | 42 | epochs = 100 # NOTE: adjust accordingly w.r.t dataset size 43 | log_every = 1 44 | ckpt_every = 1000 45 | load = None 46 | 47 | batch_size = 1 48 | lr = 1e-5 49 | grad_clip = 1.0 50 | -------------------------------------------------------------------------------- /Modded_Open_Sora/configs/vae/train/stage2.py: -------------------------------------------------------------------------------- 1 | num_frames = 17 2 | image_size = (256, 256) 3 | 4 | # Define dataset 5 | dataset = dict( 6 | type="VideoTextDataset", 7 | data_path=None, 8 | num_frames=num_frames, 9 | frame_interval=1, 10 | image_size=image_size, 11 | ) 12 | 13 | # Define acceleration 14 | num_workers = 16 15 | dtype = "bf16" 16 | grad_checkpoint = True 17 | plugin = "zero2" 18 | 19 | # Define model 20 | model = dict( 21 | type="OpenSoraVAE_V1_2", 22 | freeze_vae_2d=False, 23 | from_pretrained="outputs/vae_stage1", 24 | cal_loss=True, 25 | ) 26 | 27 | # loss weights 28 | perceptual_loss_weight = 0.1 # use vgg is not None and more than 0 29 | kl_loss_weight = 1e-6 30 | 31 | mixed_strategy = "mixed_video_image" 32 | mixed_image_ratio = 0.2 33 | use_real_rec_loss = False 34 | use_z_rec_loss = True 35 | use_image_identity_loss = False 36 | 37 | # Others 38 | seed = 42 39 | outputs = "outputs/vae_stage2" 40 | wandb = False 41 | 42 | epochs = 100 # NOTE: adjust accordingly w.r.t dataset size 43 | log_every = 1 44 | ckpt_every = 1000 45 | load = None 46 | 47 | batch_size = 1 48 | lr = 1e-5 49 | grad_clip = 1.0 50 | -------------------------------------------------------------------------------- /Modded_Open_Sora/configs/vae/train/stage3.py: -------------------------------------------------------------------------------- 1 | num_frames = 33 2 | image_size = (256, 256) 3 | 4 | # Define dataset 5 | dataset = dict( 6 | type="VideoTextDataset", 7 | data_path=None, 8 | num_frames=num_frames, 9 | frame_interval=1, 10 | image_size=image_size, 11 | ) 
12 | 13 | # Define acceleration 14 | num_workers = 16 15 | dtype = "bf16" 16 | grad_checkpoint = True 17 | plugin = "zero2" 18 | 19 | # Define model 20 | model = dict( 21 | type="OpenSoraVAE_V1_2", 22 | freeze_vae_2d=False, 23 | from_pretrained="outputs/vae_stage2", 24 | cal_loss=True, 25 | ) 26 | 27 | # loss weights 28 | perceptual_loss_weight = 0.1 # use vgg is not None and more than 0 29 | kl_loss_weight = 1e-6 30 | 31 | mixed_strategy = "mixed_video_random" 32 | use_real_rec_loss = True 33 | use_z_rec_loss = False 34 | use_image_identity_loss = False 35 | 36 | # Others 37 | seed = 42 38 | outputs = "outputs/vae_stage3" 39 | wandb = False 40 | 41 | epochs = 100 # NOTE: adjust accordingly w.r.t dataset size 42 | log_every = 1 43 | ckpt_every = 1000 44 | load = None 45 | 46 | batch_size = 1 47 | lr = 1e-5 48 | grad_clip = 1.0 49 | -------------------------------------------------------------------------------- /Modded_Open_Sora/coreml-export/t5/model/tokenizer.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Modded_Open_Sora/coreml-export/t5/model/tokenizer.pth -------------------------------------------------------------------------------- /Modded_Open_Sora/coreml-export/t5/model/tokenizer_config.json: -------------------------------------------------------------------------------- 1 | {"eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>", "extra_ids": 100, "additional_special_tokens": ["<extra_id_0>", "<extra_id_1>", "<extra_id_2>", "<extra_id_3>", "<extra_id_4>", "<extra_id_5>", "<extra_id_6>", "<extra_id_7>", "<extra_id_8>", "<extra_id_9>", "<extra_id_10>", "<extra_id_11>", "<extra_id_12>", "<extra_id_13>", "<extra_id_14>", "<extra_id_15>", "<extra_id_16>", "<extra_id_17>", "<extra_id_18>", "<extra_id_19>", "<extra_id_20>", "<extra_id_21>", "<extra_id_22>", "<extra_id_23>", "<extra_id_24>", "<extra_id_25>", "<extra_id_26>", "<extra_id_27>", "<extra_id_28>", "<extra_id_29>", "<extra_id_30>", "<extra_id_31>", "<extra_id_32>", "<extra_id_33>", "<extra_id_34>", "<extra_id_35>", "<extra_id_36>", "<extra_id_37>", "<extra_id_38>", "<extra_id_39>", "<extra_id_40>", "<extra_id_41>", "<extra_id_42>", "<extra_id_43>", "<extra_id_44>", "<extra_id_45>", "<extra_id_46>", "<extra_id_47>", "<extra_id_48>", "<extra_id_49>", "<extra_id_50>", "<extra_id_51>", "<extra_id_52>", "<extra_id_53>", "<extra_id_54>", "<extra_id_55>", "<extra_id_56>", "<extra_id_57>", "<extra_id_58>", "<extra_id_59>", "<extra_id_60>", "<extra_id_61>", "<extra_id_62>", "<extra_id_63>", "<extra_id_64>", "<extra_id_65>", "<extra_id_66>", "<extra_id_67>", "<extra_id_68>", "<extra_id_69>", "<extra_id_70>", "<extra_id_71>", "<extra_id_72>", "<extra_id_73>", "<extra_id_74>", "<extra_id_75>", "<extra_id_76>", "<extra_id_77>", "<extra_id_78>", "<extra_id_79>", "<extra_id_80>", "<extra_id_81>", "<extra_id_82>", "<extra_id_83>", "<extra_id_84>", "<extra_id_85>", "<extra_id_86>", "<extra_id_87>", "<extra_id_88>", "<extra_id_89>", "<extra_id_90>", "<extra_id_91>", "<extra_id_92>", "<extra_id_93>", "<extra_id_94>", "<extra_id_95>", "<extra_id_96>", "<extra_id_97>", "<extra_id_98>", "<extra_id_99>"], "model_max_length": 512, "name_or_path": "t5-small"} -------------------------------------------------------------------------------- /Modded_Open_Sora/coreml-export/vae/vae.py: -------------------------------------------------------------------------------- 1 | import coremltools as ct 2 | import pickle 3 | import torch 4 | from opensora.registry import MODELS, build_module 5 | 6 | vae_config = dict( 7 | type="OpenSoraVAE_V1_2", 8 | from_pretrained="hpcai-tech/OpenSora-VAE-v1.2", 9 | micro_frame_size=17, 10 | micro_batch_size=4, 11 | force_huggingface=True, 12 | ) 13 | device = 'cpu' 14 | dtype = torch.float32 15 | vae = build_module(vae_config, MODELS).to(device, dtype).eval() 16 | 17 | vae_input_shape = torch.Size([1, 4, 4, 20, 27]) 18 | tmp_input = torch.randn(vae_input_shape, dtype=dtype, device=device) 19 | num_frames = 16 20 | num_frames = torch.tensor(num_frames, dtype=torch.int32) 21 | 22 | for param in vae.parameters(): 23 | param.requires_grad = False 24 | 25 | # Define a wrapper function that includes both inputs 26 | def decode_wrapper(latents, num_frames): 27 | with torch.no_grad(): 28 | return vae.decode(z=latents, num_frames=num_frames) 29 | 30 | # Script the wrapper function 31 | scripted_vae = torch.jit.trace(decode_wrapper, example_inputs=[tmp_input, num_frames]) 32 | 33 | # converted_vae = ct.converters.convert(scripted_vae, 34 | # convert_to='mlprogram', 35 | # inputs=[ct.TensorType(name='latents', shape=vae_input_shape), 36 | # ct.TensorType(name='num_frames', shape=[1])], 37 | # minimum_deployment_target=ct.target.iOS17 38 | # ) 39 | 40 | # 
converted_vae.save('vae.mlpackage') -------------------------------------------------------------------------------- /Modded_Open_Sora/docs/zh_CN/datasets.md: -------------------------------------------------------------------------------- 1 | # 数据集 2 | 3 | ## 正在使用的数据集 4 | 5 | ### HD-VG-130M 6 | 7 | [HD-VG-130M](https://github.com/daooshee/HD-VG-130M?tab=readme-ov-file) 包括 130M 个文本视频对。标题是 8 | 由 BLIP-2 生成。我们发现剪切和文本质量相对较差。它包含 20 个拆分。对于 OpenSora 1.0,我们使用第一个拆分。我们计划使用整个数据集并对其进行重新处理。 9 | 10 | ### Inter4k 11 | 12 | [Inter4k](https://github.com/alexandrosstergiou/Inter4K) 是一个包含分辨率为 4K 的 1k 视频剪辑的数据集。这个 13 | 数据集被提议用于超分辨率任务。我们使用数据集进行 HQ 训练。处理过的视频可以从这里找到 [这里](README.md#数据处理) 。 14 | 15 | ### Pexels.com 16 | 17 | [Pexels.com](https://www.pexels.com/) 是一个提供免费库存照片和视频的网站。我们收集的 19K 视频 18 | 来自本网站的剪辑,用于高质量训练。处理过的视频可以从这里找到 [这里](README.md#数据处理) 。 19 | 20 | ## 数据集监视列表 21 | 22 | 我们也在关注以下数据集,并考虑在未来使用它们,这取决于我们的存储空间以及数据集的质量。 23 | 24 | | 名称 | 大小 | 描述 | 25 | |-------------------|--------------|-------------------------------| 26 | | Panda-70M | 70M videos | High quality video-text pairs | 27 | | WebVid-10M | 10M videos | Low quality | 28 | | InternVid-10M-FLT | 10M videos | | 29 | | EGO4D | 3670 hours | | 30 | | OpenDV-YouTube | 1700 hours | | 31 | | VidProM | 6.69M videos | | 32 | -------------------------------------------------------------------------------- /Modded_Open_Sora/eval/human_eval/launch.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CKPT=$1 4 | NUM_FRAMES=$2 5 | MODEL_NAME=$3 6 | 7 | if [[ $CKPT == *"ema"* ]]; then 8 | parentdir=$(dirname $CKPT) 9 | CKPT_BASE=$(basename $parentdir)_ema 10 | else 11 | CKPT_BASE=$(basename $CKPT) 12 | fi 13 | LOG_BASE=$(dirname $CKPT)/eval 14 | mkdir -p ${LOG_BASE} 15 | echo "Logging to $LOG_BASE" 16 | 17 | GPUS=(0 1 2 3 4 5 6 7) 18 | # TASK_ID_LIST=(1 2a 2b 2c 2d 2e 2f 2g) # move image to video task 19 | TASK_ID_LIST=(2a 2b 2c 2d 2e 2f 2g 2h) 20 | # FRAME_LIST=(1 $NUM_FRAMES $NUM_FRAMES $NUM_FRAMES $NUM_FRAMES $NUM_FRAMES $NUM_FRAMES $NUM_FRAMES) 21 | 22 | for i in "${!GPUS[@]}"; do 23 | CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT $NUM_FRAMES $MODEL_NAME -${TASK_ID_LIST[i]} >${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 & 24 | done 25 | 26 | # kill all by: pkill -f "inference" 27 | -------------------------------------------------------------------------------- /Modded_Open_Sora/eval/loss/launch.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CMD="torchrun --standalone --nproc_per_node 1 eval/loss/eval_loss.py configs/opensora-v1-2/misc/eval_loss.py" 4 | CKPT_PATH=$1 5 | MODEL_NAME=$2 6 | IMG_PATH="/mnt/jfs-hdd/sora/meta/validation/img_1k.csv" 7 | VID_PATH="/mnt/jfs-hdd/sora/meta/validation/vid_100.csv" 8 | 9 | if [[ $CKPT_PATH == *"ema"* ]]; then 10 | parentdir=$(dirname $CKPT_PATH) 11 | CKPT_BASE=$(basename $parentdir)_ema 12 | else 13 | CKPT_BASE=$(basename $CKPT_PATH) 14 | fi 15 | LOG_BASE=$(dirname $CKPT_PATH)/eval 16 | mkdir -p $LOG_BASE 17 | echo "Logging to $LOG_BASE" 18 | 19 | 20 | GPUS=(3 4 5 6 7) 21 | RESOLUTION=(144p 240p 360p 480p 720p) 22 | 23 | CUDA_VISIBLE_DEVICES=0 $CMD --data-path $IMG_PATH --ckpt-path $CKPT_PATH --start-index 0 --end-index 5 >${LOG_BASE}/img_0.log 2>&1 & 24 | CUDA_VISIBLE_DEVICES=1 $CMD --data-path $IMG_PATH --ckpt-path $CKPT_PATH --start-index 5 --end-index 6 >${LOG_BASE}/img_1.log 2>&1 & 25 | CUDA_VISIBLE_DEVICES=2 $CMD --data-path $IMG_PATH --ckpt-path $CKPT_PATH --start-index 6 >${LOG_BASE}/img_2.log 
2>&1 & 26 | 27 | 28 | for i in "${!GPUS[@]}"; do 29 | CUDA_VISIBLE_DEVICES=${GPUS[i]} $CMD --data-path $VID_PATH --ckpt-path $CKPT_PATH --resolution ${RESOLUTION[i]} >${LOG_BASE}/${RESOLUTION[i]}_vid.log 2>&1 & 30 | done 31 | -------------------------------------------------------------------------------- /Modded_Open_Sora/eval/loss/tabulate_rl_loss.py: -------------------------------------------------------------------------------- 1 | """ 2 | usage: 3 | python tabulate_rl_loss.py --log_dir /home/zhengzangwei/projs/Open-Sora-dev/logs/loss --ckpt_name epoch0-global_step9000 4 | 5 | save the processed json to: 6 | Open-Sora-dev/evaluation_results/rectified_flow/_loss.json 7 | """ 8 | 9 | import argparse 10 | import json 11 | import os 12 | from ast import literal_eval 13 | 14 | 15 | def parse_args(): 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument("--log_dir", type=str) 18 | args = parser.parse_args() 19 | return args 20 | 21 | 22 | if __name__ == "__main__": 23 | args = parse_args() 24 | 25 | files = os.listdir(args.log_dir) 26 | files = [ 27 | "img_0.log", 28 | "img_1.log", 29 | "img_2.log", 30 | "144p_vid.log", 31 | "240p_vid.log", 32 | "360p_vid.log", 33 | "480p_vid.log", 34 | "720p_vid.log", 35 | ] 36 | 37 | loss_info = {} 38 | 39 | for fname in files: 40 | path = os.path.join(args.log_dir, fname) 41 | with open(path, "r", encoding="utf-8") as f: 42 | content = f.readlines() 43 | eval_line = content[-1].split("losses:")[-1].strip() 44 | loss_dict = literal_eval(eval_line) 45 | for key, loss in loss_dict.items(): 46 | resolution, frame = key 47 | if resolution not in loss_info: 48 | loss_info[resolution] = {} 49 | loss_info[resolution][frame] = format(loss, ".4f") 50 | 51 | # Convert and write JSON object to file 52 | output_file_path = os.path.join(args.log_dir, "loss.json") 53 | with open(output_file_path, "w") as outfile: 54 | json.dump(loss_info, outfile, indent=4, sort_keys=True) 55 | print(f"results saved to: {output_file_path}") 56 | -------------------------------------------------------------------------------- /Modded_Open_Sora/eval/vae/script/eval.sh: -------------------------------------------------------------------------------- 1 | python eval/eval_common_metric.py \ 2 | --batch_size 2 \ 3 | --real_video_dir ../test_eval/release/origin \ 4 | --generated_video_dir ../test_eval/release \ 5 | --device cuda \ 6 | --sample_fps 10 \ 7 | --crop_size 256 \ 8 | --resolution 256 \ 9 | --num_frames 17 \ 10 | --sample_rate 1 \ 11 | --subset_size 100 \ 12 | --metric ssim psnr lpips flolpips 13 | -------------------------------------------------------------------------------- /Modded_Open_Sora/eval/vbench/launch.sh: -------------------------------------------------------------------------------- 1 | # !/bin/bash 2 | 3 | CKPT=$1 4 | NUM_FRAMES=$2 5 | MODEL_NAME=$3 6 | RES=$4 7 | ASP_RATIO=$5 8 | 9 | if [[ $CKPT == *"ema"* ]]; then 10 | parentdir=$(dirname $CKPT) 11 | CKPT_BASE=$(basename $parentdir)_ema 12 | else 13 | CKPT_BASE=$(basename $CKPT) 14 | fi 15 | LOG_BASE=$(dirname $CKPT)/eval 16 | echo "Logging to $LOG_BASE" 17 | 18 | GPUS=(0 1 2 3 4 5 6 7) 19 | TASK_ID_LIST=(4a 4b 4c 4d 4e 4f 4g 4h) # for log records only 20 | START_INDEX_LIST=(0 120 240 360 480 600 720 840) 21 | END_INDEX_LIST=(120 240 360 480 600 720 840 2000) 22 | 23 | for i in "${!GPUS[@]}"; do 24 | if [ -z ${RES} ] || [ -z ${ASP_RATIO} ] ; 25 | then 26 | CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -4 ${START_INDEX_LIST[i]} 
${END_INDEX_LIST[i]}>${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 & 27 | else 28 | CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -4 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO}>${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 & 29 | fi 30 | done 31 | -------------------------------------------------------------------------------- /Modded_Open_Sora/eval/vbench/launch_calc.sh: -------------------------------------------------------------------------------- 1 | # !/bin/bash 2 | 3 | VIDEO_DIR=$1 4 | CKPT_DIR=$2 5 | LOG_BASE=$CKPT_DIR 6 | mkdir -p $LOG_BASE 7 | echo "Logging to $LOG_BASE" 8 | 9 | GPUS=(0 1 2 3 4 5 6 7) 10 | START_INDEX_LIST=(0 2 4 6 8 10 12 14) 11 | END_INDEX_LIST=(2 4 6 8 10 12 14 16) 12 | TASK_ID_LIST=(calc_vbench_a calc_vbench_b calc_vbench_c calc_vbench_d calc_vbench_e calc_vbench_f calc_vbench_g calc_vbench_h) # for log records only 13 | 14 | 15 | for i in "${!GPUS[@]}"; do 16 | CUDA_VISIBLE_DEVICES=${GPUS[i]} python eval/vbench/calc_vbench.py $VIDEO_DIR $CKPT_DIR --start ${START_INDEX_LIST[i]} --end ${END_INDEX_LIST[i]} > ${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 & 17 | done 18 | -------------------------------------------------------------------------------- /Modded_Open_Sora/eval/vbench_i2v/json_to_txt.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | RESOLUTIONS = ["1-1", "16-9", "7-4", "8-5"] 5 | 6 | cache_root = "/mnt/jfs-hdd/sora/data/vbench-i2v/crop" 7 | resolution = RESOLUTIONS[0] 8 | json_file = "vbench2_i2v_full_info.json" 9 | save_path = "all_i2v.txt" 10 | 11 | data = json.load(open(json_file)) 12 | txt = [ 13 | f'{x["prompt_en"]}{{"reference_path": "{os.path.join(cache_root, resolution, x["image_name"])}", "mask_strategy": "0"}}' 14 | for x in data 15 | ] 16 | with open(save_path, "w") as f: 17 | f.write("\n".join(txt)) 18 | -------------------------------------------------------------------------------- /Modded_Open_Sora/eval/vbench_i2v/launch.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CKPT=$1 4 | NUM_FRAMES=$2 5 | MODEL_NAME=$3 6 | RES=$4 7 | ASP_RATIO=$5 8 | 9 | if [[ $CKPT == *"ema"* ]]; then 10 | parentdir=$(dirname $CKPT) 11 | CKPT_BASE=$(basename $parentdir)_ema 12 | else 13 | CKPT_BASE=$(basename $CKPT) 14 | fi 15 | LOG_BASE=$(dirname $CKPT)/eval 16 | echo "Logging to $LOG_BASE" 17 | 18 | GPUS=(0 1 2 3 4 5 6 7) 19 | TASK_ID_LIST=(5a 5b 5c 5d 5e 5f 5g 5h) # for log records only 20 | START_INDEX_LIST=(0 140 280 420 560 700 840 980) 21 | END_INDEX_LIST=(140 280 420 560 700 840 980 2000) 22 | 23 | for i in "${!GPUS[@]}"; do 24 | if [ -z ${RES} ] || [ -z ${ASP_RATIO} ] ; 25 | then 26 | CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT $NUM_FRAMES $MODEL_NAME -5 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]}>${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 & 27 | else 28 | CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT $NUM_FRAMES $MODEL_NAME -5 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO}>${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 & 29 | fi 30 | done 31 | -------------------------------------------------------------------------------- /Modded_Open_Sora/eval/vbench_i2v/launch_calc.sh: -------------------------------------------------------------------------------- 1 | # !/bin/bash 2 | 3 | VIDEO_DIR=$1 4 | CKPT_DIR=$2 5 | LOG_BASE=$CKPT_DIR 6 | mkdir -p $LOG_BASE 7 | echo "Logging to $LOG_BASE" 8 | 9 | GPUS=(0 1 2 3 4 5 6 7) 10 | CALC_I2V_LIST=(True True 
False False False False False False) 11 | CALC_QUALITY_LIST=(False False True True True True True True) 12 | START_INDEX_LIST=(0 2 0 2 3 4 5 6) 13 | END_INDEX_LIST=(2 -1 2 3 4 5 6 -1) 14 | TASK_ID_LIST=(calc_vbench_i2v_a calc_vbench_i2v_b calc_vbench_i2v_c calc_vbench_i2v_d calc_vbench_i2v_e calc_vbench_i2v_f calc_vbench_i2v_g calc_vbench_i2v_h) # for log records only 15 | 16 | 17 | for i in "${!GPUS[@]}"; do 18 | CUDA_VISIBLE_DEVICES=${GPUS[i]} python eval/vbench_i2v/calc_vbench_i2v.py $VIDEO_DIR $CKPT_DIR --calc_i2v ${CALC_I2V_LIST[i]} --calc_quality ${CALC_QUALITY_LIST[i]} --start ${START_INDEX_LIST[i]} --end ${END_INDEX_LIST[i]} > ${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 & 19 | done 20 | -------------------------------------------------------------------------------- /Modded_Open_Sora/gradio/requirements.txt: -------------------------------------------------------------------------------- 1 | xformers 2 | transformers 3 | git+https://github.com/hpcaitech/Open-Sora.git 4 | -------------------------------------------------------------------------------- /Modded_Open_Sora/opensora/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Modded_Open_Sora/opensora/__init__.py -------------------------------------------------------------------------------- /Modded_Open_Sora/opensora/acceleration/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Modded_Open_Sora/opensora/acceleration/__init__.py -------------------------------------------------------------------------------- /Modded_Open_Sora/opensora/acceleration/checkpoint.py: -------------------------------------------------------------------------------- 1 | from collections.abc import Iterable 2 | 3 | import torch.nn as nn 4 | from torch.utils.checkpoint import checkpoint, checkpoint_sequential 5 | 6 | 7 | def set_grad_checkpoint(model, use_fp32_attention=False, gc_step=1): 8 | assert isinstance(model, nn.Module) 9 | 10 | def set_attr(module): 11 | module.grad_checkpointing = True 12 | module.fp32_attention = use_fp32_attention 13 | module.grad_checkpointing_step = gc_step 14 | 15 | model.apply(set_attr) 16 | 17 | 18 | def auto_grad_checkpoint(module, *args, **kwargs): 19 | if getattr(module, "grad_checkpointing", False): 20 | if not isinstance(module, Iterable): 21 | return checkpoint(module, *args, use_reentrant=False, **kwargs) 22 | gc_step = module[0].grad_checkpointing_step 23 | return checkpoint_sequential(module, gc_step, *args, use_reentrant=False, **kwargs) 24 | return module(*args, **kwargs) 25 | -------------------------------------------------------------------------------- /Modded_Open_Sora/opensora/acceleration/parallel_states.py: -------------------------------------------------------------------------------- 1 | import torch.distributed as dist 2 | 3 | _GLOBAL_PARALLEL_GROUPS = dict() 4 | 5 | 6 | def set_data_parallel_group(group: dist.ProcessGroup): 7 | _GLOBAL_PARALLEL_GROUPS["data"] = group 8 | 9 | 10 | def get_data_parallel_group(): 11 | return _GLOBAL_PARALLEL_GROUPS.get("data", dist.group.WORLD) 12 | 13 | 14 | def set_sequence_parallel_group(group: dist.ProcessGroup): 15 | _GLOBAL_PARALLEL_GROUPS["sequence"] = group 16 | 17 | 18 | def get_sequence_parallel_group(): 19 | return _GLOBAL_PARALLEL_GROUPS.get("sequence", None) 20 | 
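The parallel-state helpers in `opensora/acceleration/parallel_states.py` above are thin setters/getters over a module-level dict; this listing does not show where the groups are created, so the sketch below is only an assumption about typical wiring. The function name `init_parallel_groups`, the contiguous-rank grouping, and the `sp_size` default are illustrative and not taken from the repository's training scripts.

import torch.distributed as dist

from opensora.acceleration.parallel_states import (
    get_sequence_parallel_group,
    set_data_parallel_group,
    set_sequence_parallel_group,
)


def init_parallel_groups(sp_size: int = 2):
    # Assumes dist.init_process_group(...) has already been called (e.g. under torchrun).
    world_size = dist.get_world_size()
    rank = dist.get_rank()

    # Simplest case: every rank shares one data-parallel group.
    set_data_parallel_group(dist.group.WORLD)

    # Carve the world into contiguous sequence-parallel groups of size sp_size.
    # dist.new_group must be called on every rank for every group, even groups
    # the current rank does not belong to.
    for start in range(0, world_size, sp_size):
        ranks = list(range(start, min(start + sp_size, world_size)))
        group = dist.new_group(ranks)
        if rank in ranks:
            set_sequence_parallel_group(group)

    assert get_sequence_parallel_group() is not None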
-------------------------------------------------------------------------------- /Modded_Open_Sora/opensora/acceleration/shardformer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Modded_Open_Sora/opensora/acceleration/shardformer/__init__.py -------------------------------------------------------------------------------- /Modded_Open_Sora/opensora/acceleration/shardformer/modeling/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Modded_Open_Sora/opensora/acceleration/shardformer/modeling/__init__.py -------------------------------------------------------------------------------- /Modded_Open_Sora/opensora/acceleration/shardformer/policy/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Modded_Open_Sora/opensora/acceleration/shardformer/policy/__init__.py -------------------------------------------------------------------------------- /Modded_Open_Sora/opensora/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from .datasets import IMG_FPS, BatchFeatureDataset, VariableVideoTextDataset, VideoTextDataset 2 | from .utils import get_transforms_image, get_transforms_video, is_img, is_vid, save_sample 3 | -------------------------------------------------------------------------------- /Modded_Open_Sora/opensora/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .dit import * 2 | from .latte import * 3 | from .pixart import * 4 | from .stdit import * 5 | from .text_encoder import * 6 | from .vae import * 7 | -------------------------------------------------------------------------------- /Modded_Open_Sora/opensora/models/dit/__init__.py: -------------------------------------------------------------------------------- 1 | from .dit import DiT, DiT_XL_2, DiT_XL_2x2 2 | -------------------------------------------------------------------------------- /Modded_Open_Sora/opensora/models/latte/__init__.py: -------------------------------------------------------------------------------- 1 | from .latte import Latte, Latte_XL_2, Latte_XL_2x2 2 | -------------------------------------------------------------------------------- /Modded_Open_Sora/opensora/models/layers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Modded_Open_Sora/opensora/models/layers/__init__.py -------------------------------------------------------------------------------- /Modded_Open_Sora/opensora/models/pixart/__init__.py: -------------------------------------------------------------------------------- 1 | from .pixart import PixArt, PixArt_1B_2, PixArt_XL_2 2 | from .pixart_sigma import PixArt_Sigma_XL_2 3 | -------------------------------------------------------------------------------- /Modded_Open_Sora/opensora/models/stdit/__init__.py: -------------------------------------------------------------------------------- 1 | from .stdit import STDiT 2 | from .stdit2 import STDiT2 3 | from .stdit3 import STDiT3 4 | 
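These package `__init__` modules only re-export the model classes, so that importing `opensora.models` runs their registry decorators (the same `@MODELS.register_module(...)` pattern visible in `text_encoder/classes.py` below). The config dicts earlier in this listing are then materialized with `build_module` from `opensora/registry.py`, exactly as `coreml-export/vae/vae.py` does for the VAE. A minimal sketch reusing values from the inference configs shown above; the CPU/float32 placement is just for illustration.

import torch

from opensora.registry import MODELS, SCHEDULERS, build_module

# Dicts copied from configs/pixart/inference/1x256x256.py shown earlier in this listing.
vae_cfg = dict(type="VideoAutoencoderKL", from_pretrained="stabilityai/sd-vae-ft-ema")
scheduler_cfg = dict(type="dpm-solver", num_sampling_steps=20, cfg_scale=7.0)

device, dtype = "cpu", torch.float32

# build_module looks up the "type" key in the registry and passes the remaining keys as kwargs.
vae = build_module(vae_cfg, MODELS).to(device, dtype).eval()
scheduler = build_module(scheduler_cfg, SCHEDULERS)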
-------------------------------------------------------------------------------- /Modded_Open_Sora/opensora/models/stdit/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from einops import rearrange 3 | 4 | def isinstance_str(x: object, cls_name: str): 5 | """ 6 | Checks whether x has any class *named* cls_name in its ancestry. 7 | Doesn't require access to the class's implementation. 8 | 9 | Useful for patching! 10 | """ 11 | 12 | for _cls in x.__class__.__mro__: 13 | if _cls.__name__ == cls_name: 14 | return True 15 | 16 | return False 17 | 18 | def init_generator(device: torch.device, fallback: torch.Generator=None): 19 | """ 20 | Forks the current default random generator given device. 21 | """ 22 | if device.type == "cpu": 23 | return torch.Generator(device="cpu").set_state(torch.get_rng_state()) 24 | elif device.type == "cuda": 25 | return torch.Generator(device=device).set_state(torch.cuda.get_rng_state()) 26 | else: 27 | if fallback is None: 28 | return init_generator(torch.device("cpu")) 29 | else: 30 | return fallback 31 | 32 | def join_frame(x, fsize): 33 | """ Join multi-frame tokens """ 34 | x = rearrange(x, "(B F) N C -> B (F N) C", F=fsize) 35 | return x 36 | 37 | def split_frame(x, fsize): 38 | """ Split multi-frame tokens """ 39 | x = rearrange(x, "B (F N) C -> (B F) N C", F=fsize) 40 | return x 41 | 42 | def func_warper(funcs): 43 | """ Warp a function sequence """ 44 | def fn(x, **kwarg): 45 | for func in funcs: 46 | x = func(x, **kwarg) 47 | return x 48 | return fn 49 | 50 | def join_warper(fsize): 51 | def fn(x): 52 | x = join_frame(x, fsize) 53 | return x 54 | return fn 55 | 56 | def split_warper(fsize): 57 | def fn(x): 58 | x = split_frame(x, fsize) 59 | return x 60 | return fn -------------------------------------------------------------------------------- /Modded_Open_Sora/opensora/models/stdit_origin/__init__.py: -------------------------------------------------------------------------------- 1 | from .stdit import STDiT 2 | from .stdit2 import STDiT2 3 | from .stdit3 import STDiT3 4 | -------------------------------------------------------------------------------- /Modded_Open_Sora/opensora/models/text_encoder/__init__.py: -------------------------------------------------------------------------------- 1 | from .classes import ClassEncoder 2 | from .clip import ClipEncoder 3 | from .t5 import T5Encoder 4 | -------------------------------------------------------------------------------- /Modded_Open_Sora/opensora/models/text_encoder/classes.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from opensora.registry import MODELS 4 | 5 | 6 | @MODELS.register_module("classes") 7 | class ClassEncoder: 8 | def __init__(self, num_classes, model_max_length=None, device="cuda", dtype=torch.float): 9 | self.num_classes = num_classes 10 | self.y_embedder = None 11 | 12 | self.model_max_length = model_max_length 13 | self.output_dim = None 14 | self.device = device 15 | 16 | def encode(self, text): 17 | return dict(y=torch.tensor([int(t) for t in text]).to(self.device)) 18 | 19 | def null(self, n): 20 | return torch.tensor([self.num_classes] * n).to(self.device) 21 | -------------------------------------------------------------------------------- /Modded_Open_Sora/opensora/models/vae/__init__.py: -------------------------------------------------------------------------------- 1 | from .discriminator import DISCRIMINATOR_3D 2 | from .vae import 
VideoAutoencoderKL, VideoAutoencoderKLTemporalDecoder 3 | from .vae_temporal import VAE_Temporal 4 | -------------------------------------------------------------------------------- /Modded_Open_Sora/opensora/registry.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | 3 | import torch.nn as nn 4 | from mmengine.registry import Registry 5 | 6 | 7 | def build_module(module, builder, **kwargs): 8 | """Build module from config or return the module itself. 9 | 10 | Args: 11 | module (Union[dict, nn.Module]): The module to build. 12 | builder (Registry): The registry to build module. 13 | *args, **kwargs: Arguments passed to build function. 14 | 15 | Returns: 16 | Any: The built module. 17 | """ 18 | if module is None: 19 | return None 20 | if isinstance(module, dict): 21 | cfg = deepcopy(module) 22 | for k, v in kwargs.items(): 23 | cfg[k] = v 24 | return builder.build(cfg) 25 | elif isinstance(module, nn.Module): 26 | return module 27 | elif module is None: 28 | return None 29 | else: 30 | raise TypeError(f"Only support dict and nn.Module, but got {type(module)}.") 31 | 32 | 33 | MODELS = Registry( 34 | "model", 35 | locations=["opensora.models"], 36 | ) 37 | 38 | SCHEDULERS = Registry( 39 | "scheduler", 40 | locations=["opensora.schedulers"], 41 | ) 42 | 43 | DATASETS = Registry( 44 | "dataset", 45 | locations=["opensora.datasets"], 46 | ) 47 | -------------------------------------------------------------------------------- /Modded_Open_Sora/opensora/schedulers/__init__.py: -------------------------------------------------------------------------------- 1 | from .dpms import DPMS 2 | from .iddpm import IDDPM 3 | from .rf import RFLOW 4 | -------------------------------------------------------------------------------- /Modded_Open_Sora/opensora/schedulers/dpms/__init__.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | 3 | import torch 4 | 5 | from opensora.registry import SCHEDULERS 6 | 7 | from .dpm_solver import DPMS 8 | 9 | 10 | @SCHEDULERS.register_module("dpm-solver") 11 | class DPM_SOLVER: 12 | def __init__(self, num_sampling_steps=None, cfg_scale=4.0): 13 | self.num_sampling_steps = num_sampling_steps 14 | self.cfg_scale = cfg_scale 15 | 16 | def sample( 17 | self, 18 | model, 19 | text_encoder, 20 | z, 21 | prompts, 22 | device, 23 | additional_args=None, 24 | mask=None, 25 | progress=True, 26 | ): 27 | assert mask is None, "mask is not supported in dpm-solver" 28 | n = len(prompts) 29 | model_args = text_encoder.encode(prompts) 30 | y = model_args.pop("y") 31 | null_y = text_encoder.null(n) 32 | if additional_args is not None: 33 | model_args.update(additional_args) 34 | 35 | dpms = DPMS( 36 | partial(forward_with_dpmsolver, model), 37 | condition=y, 38 | uncondition=null_y, 39 | cfg_scale=self.cfg_scale, 40 | model_kwargs=model_args, 41 | ) 42 | samples = dpms.sample( 43 | z, 44 | steps=self.num_sampling_steps, 45 | order=2, 46 | skip_type="time_uniform", 47 | method="multistep", 48 | progress=progress, 49 | ) 50 | return samples 51 | 52 | 53 | def forward_with_dpmsolver(self, x, timestep, y, **kwargs): 54 | """ 55 | dpm solver donnot need variance prediction 56 | """ 57 | # https://github.com/openai/glide-text2im/blob/main/notebooks/text2im.ipynb 58 | model_out = self.forward(x, timestep, y, **kwargs) 59 | return model_out.chunk(2, dim=1)[0] 60 | -------------------------------------------------------------------------------- 
/Modded_Open_Sora/opensora/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Modded_Open_Sora/opensora/utils/__init__.py -------------------------------------------------------------------------------- /Modded_Open_Sora/opensora/utils/lr_scheduler.py: -------------------------------------------------------------------------------- 1 | from torch.optim.lr_scheduler import _LRScheduler 2 | 3 | 4 | class LinearWarmupLR(_LRScheduler): 5 | """Linearly warmup learning rate and then linearly decay. 6 | 7 | Args: 8 | optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer. 9 | warmup_steps (int, optional): Number of warmup steps, defaults to 0 10 | last_step (int, optional): The index of last step, defaults to -1. When last_step=-1, 11 | the schedule is started from the beginning or When last_step=-1, sets initial lr as lr. 12 | """ 13 | 14 | def __init__(self, optimizer, warmup_steps: int = 0, last_epoch: int = -1): 15 | self.warmup_steps = warmup_steps 16 | super().__init__(optimizer, last_epoch=last_epoch) 17 | 18 | def get_lr(self): 19 | if self.last_epoch < self.warmup_steps: 20 | return [(self.last_epoch + 1) / (self.warmup_steps + 1) * lr for lr in self.base_lrs] 21 | else: 22 | return self.base_lrs 23 | -------------------------------------------------------------------------------- /Modded_Open_Sora/requirements/requirements-cu121.txt: -------------------------------------------------------------------------------- 1 | torch==2.3.0 --index-url https://download.pytorch.org/whl/cu121 2 | torchvision==0.18.0 --index-url https://download.pytorch.org/whl/cu121 3 | xformers==0.0.26.post1 --index-url https://download.pytorch.org/whl/cu121 4 | -------------------------------------------------------------------------------- /Modded_Open_Sora/requirements/requirements-data.txt: -------------------------------------------------------------------------------- 1 | gdown>=5.2.0 2 | 3 | # [caption llava] 4 | ninja>=1.11.1.1 5 | shortuuid>=1.0.13 6 | markdown2[all] 7 | scikit-learn>=1.4.2 8 | einops-exts>=0.0.4 9 | 10 | # [camera_motion] 11 | decord==0.6.0 12 | ptvsd==4.3.2 13 | imageio-ffmpeg>=0.4.9 14 | 15 | # [datasets] 16 | ffmpeg-python==0.2.0 17 | lingua-language-detector==2.0.2 18 | 19 | # [frame interpolation] 20 | imageio>=2.34.1 21 | 22 | # [aesthetic] 23 | setuptools==68.2.2 24 | clip @ git+https://github.com/openai/CLIP.git 25 | 26 | # [ocr] 27 | mmcv==2.1.0 28 | mmdet==3.1.0 29 | mmocr==1.0.1 30 | detectron2 @ git+https://github.com/facebookresearch/detectron2.git@ff53992 31 | -------------------------------------------------------------------------------- /Modded_Open_Sora/requirements/requirements-eval.txt: -------------------------------------------------------------------------------- 1 | # [vbench] 2 | detectron2 @ git+https://github.com/facebookresearch/detectron2.git@ff53992 3 | imageio>=2.34.1 4 | pyiqa==0.1.10 5 | scikit-learn>=1.4.2 6 | scikit-image>=0.20.0 7 | lvis==0.5.3 8 | boto3>=1.34.113 9 | easydict>=1.9 10 | fairscale>=0.4.13 11 | 12 | # [vae] 13 | decord==0.6.0 14 | pytorchvideo==0.1.5 15 | lpips==0.1.4 16 | -------------------------------------------------------------------------------- /Modded_Open_Sora/requirements/requirements-vae.txt: -------------------------------------------------------------------------------- 1 | beartype==0.18.5 2 | einops==0.8.0 3 | einops-exts==0.0.4 4 | opencv-python==4.9.0.80 5 | 
pillow==10.3.0 6 | -------------------------------------------------------------------------------- /Modded_Open_Sora/requirements/requirements.txt: -------------------------------------------------------------------------------- 1 | colossalai==0.3.7 2 | mmengine>=0.10.3 3 | pandas>=2.0.3 4 | timm==0.9.16 5 | rotary_embedding_torch==0.5.3 6 | ftfy>=6.2.0 # for t5 7 | diffusers==0.27.2 # for vae 8 | accelerate==0.29.2 # for t5 9 | av>=12.0.0 # for video loading 10 | 11 | # [gradio] 12 | gradio>=4.26.0 13 | spaces>=0.28.3 14 | 15 | # [notebook] 16 | ipykernel>=6.29.4 17 | ipywidgets>=8.1.2 18 | 19 | # [training] 20 | wandb>=0.17.0 21 | tensorboard>=2.14.0 22 | pandarallel>=1.6.5 23 | pyarrow>=16.1.0 # for parquet 24 | 25 | # [dev] 26 | pre-commit>=3.5.0 27 | openai 28 | -------------------------------------------------------------------------------- /Modded_Open_Sora/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | # CUDA_VISIBLE_DEVICES=3 python scripts/inference.py configs/opensora-v1-2/inference/test_config.py --prompt "zoom in video of a tram passing by city" 5 | python scripts/inference.py configs/opensora-v1-2/inference/test_config.py "$@" 6 | # python scripts/inference.py configs/opensora-v1-2/inference/test_config.py --prompt "A scene of a border collie running through a field in broad daylight" 7 | 8 | # python freqmetric/gif_maker.py 9 | -------------------------------------------------------------------------------- /Modded_Open_Sora/scripts/misc/launch_extract_feat.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -x 4 | set -e 5 | 6 | START_SPLIT=0 7 | NUM_SPLIT=10 8 | 9 | DATA_PATH=$1 10 | SAVE_PATH=$2 11 | DATA_ARG="--data-path $DATA_PATH" 12 | SAVE_ARG="--save-dir $SAVE_PATH" 13 | 14 | CMD="torchrun --standalone --nproc_per_node 1 scripts/misc/extract_feat.py configs/opensora-v1-2/misc/extract.py $DATA_ARG $SAVE_ARG" 15 | declare -a GPUS=(0 1 2 3 4 5 6 7) 16 | 17 | mkdir -p logs/extract_feat 18 | 19 | for i in "${GPUS[@]}"; do 20 | CUDA_VISIBLE_DEVICES=$i $CMD --start-index $(($START_SPLIT + i * $NUM_SPLIT)) --end-index $(($START_SPLIT + (i + 1) * $NUM_SPLIT)) >logs/extract_feat/$i.log 2>&1 & 21 | done 22 | -------------------------------------------------------------------------------- /Modded_Open_Sora/scripts/misc/launch_search_bs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -x 4 | set -e 5 | 6 | CMD="torchrun --standalone --nproc_per_node 1 scripts/misc/search_bs.py configs/opensora-v1-2/misc/bs.py" 7 | DATA_PATH="/mnt/nfs-207/sora_data/meta/searchbs.csv" 8 | 9 | LOG_BASE=logs/search_bs 10 | mkdir -p logs/search_bs 11 | echo "Logging to $LOG_BASE" 12 | 13 | CUDA_VISIBLE_DEVICES=0 $CMD --data-path $DATA_PATH --resolution 144p >${LOG_BASE}/144p.log 2>&1 & 14 | CUDA_VISIBLE_DEVICES=1 $CMD --data-path $DATA_PATH --resolution 240p >${LOG_BASE}/240p.log 2>&1 & 15 | CUDA_VISIBLE_DEVICES=2 $CMD --data-path $DATA_PATH --resolution 512 >${LOG_BASE}/512.log 2>&1 & 16 | CUDA_VISIBLE_DEVICES=3 $CMD --data-path $DATA_PATH --resolution 480p >${LOG_BASE}/480p.log 2>&1 & 17 | CUDA_VISIBLE_DEVICES=4 $CMD --data-path $DATA_PATH --resolution 1024 >${LOG_BASE}/1024.log 2>&1 & 18 | CUDA_VISIBLE_DEVICES=5 $CMD --data-path $DATA_PATH --resolution 1080p >${LOG_BASE}/1080p.log 2>&1 & 19 | CUDA_VISIBLE_DEVICES=6 $CMD --data-path $DATA_PATH --resolution 2048 >${LOG_BASE}/2048.log 2>&1 & 20 | 
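For context, `run.sh` above hands one of the plain-Python config files from this listing to `scripts/inference.py`, and `mmengine` (pinned in `requirements.txt`) is the usual machinery for reading such files. The helper below is a hypothetical sketch of that loading step, not the repository's actual `opensora/utils/config_utils.py`; `Config.fromfile` and `Config.merge_from_dict` are real mmengine APIs, while the `prompt` override is only an example of a CLI-style override.

from mmengine.config import Config


def load_config(path: str, **overrides) -> Config:
    """Load a plain-Python config file and apply keyword overrides on top."""
    cfg = Config.fromfile(path)
    if overrides:
        cfg.merge_from_dict(overrides)
    return cfg


if __name__ == "__main__":
    cfg = load_config(
        "configs/pixart/inference/1x256x256.py",
        prompt=["A small cactus with a happy face in the Sahara desert."],
    )
    print(cfg.model["type"], cfg.scheduler["num_sampling_steps"], cfg.dtype)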
-------------------------------------------------------------------------------- /Modded_Open_Sora/tests/test_attn.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from colossalai.accelerator import get_accelerator 3 | from colossalai.utils import get_current_device 4 | from rotary_embedding_torch import RotaryEmbedding 5 | 6 | from opensora.models.layers.blocks import Attention 7 | 8 | # B, S, H = 7488, 1, 1152 9 | # B, S, H = 32, 234, 1152 10 | B, S, H = 128, 32, 1152 11 | N, D = 16, 72 12 | 13 | 14 | def run_attn(enable_flash_attn: bool): 15 | get_accelerator().reset_peak_memory_stats() 16 | rope = RotaryEmbedding(D).to(device=get_current_device(), dtype=torch.bfloat16) 17 | attn = Attention( 18 | H, 19 | N, 20 | qkv_bias=True, 21 | rope=rope.rotate_queries_or_keys, 22 | enable_flash_attn=enable_flash_attn, 23 | ).to(device=get_current_device(), dtype=torch.bfloat16) 24 | x = torch.randn(B, S, H, device=get_current_device(), dtype=torch.bfloat16).requires_grad_() 25 | y = attn(x) 26 | y.mean().backward() 27 | print(f"Peak memory: {get_accelerator().max_memory_allocated() / 1024**2:.2f} MB") 28 | 29 | 30 | if __name__ == "__main__": 31 | print("Use flashattn") 32 | run_attn(True) 33 | print("No flashattn") 34 | run_attn(False) 35 | -------------------------------------------------------------------------------- /Modded_Open_Sora/tests/test_lr_scheduler.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.optim import Adam 3 | from torchvision.models import resnet50 4 | from tqdm import tqdm 5 | 6 | from opensora.utils.lr_scheduler import LinearWarmupLR 7 | 8 | 9 | def test_lr_scheduler(): 10 | warmup_steps = 200 11 | model = resnet50().cuda() 12 | optimizer = Adam(model.parameters(), lr=0.01) 13 | scheduler = LinearWarmupLR(optimizer, warmup_steps=warmup_steps) 14 | current_lr = scheduler.get_lr()[0] 15 | data = torch.rand(1, 3, 224, 224).cuda() 16 | 17 | for i in tqdm(range(warmup_steps * 2)): 18 | out = model(data) 19 | out.mean().backward() 20 | optimizer.step() 21 | scheduler.step() 22 | 23 | if i >= warmup_steps: 24 | assert scheduler.get_lr()[0] == 0.01 25 | else: 26 | assert scheduler.get_lr()[0] > current_lr, f"{scheduler.get_lr()[0]} <= {current_lr}" 27 | current_lr = scheduler.get_lr()[0] 28 | 29 | 30 | if __name__ == "__main__": 31 | test_lr_scheduler() 32 | -------------------------------------------------------------------------------- /Modded_Open_Sora/tests/test_pos_emb.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | 4 | from opensora.models.layers.blocks import PositionEmbedding2D, get_2d_sincos_pos_embed 5 | 6 | D = 8 7 | SCALE = 2.0 8 | from torch.testing import assert_close 9 | 10 | 11 | def get_spatial_pos_embed(x, hidden_size, h, w, scale, base_size=None): 12 | pos_embed = get_2d_sincos_pos_embed( 13 | hidden_size, 14 | (h, w), 15 | scale=scale, 16 | base_size=base_size, 17 | ) 18 | pos_embed = torch.from_numpy(pos_embed).float().unsqueeze(0).requires_grad_(False) 19 | return pos_embed.to(device=x.device, dtype=x.dtype) 20 | 21 | 22 | @pytest.mark.parametrize("dtype", [torch.float, torch.float16]) 23 | @pytest.mark.parametrize("device", ["cpu", "cuda"]) 24 | def test_pos_emb(dtype, device): 25 | # just a placeholder to get the device and dtype 26 | x = torch.empty(1, dtype=dtype, device=device) 27 | pos_embedder = PositionEmbedding2D( 28 | D, 29 | max_position_embeddings=8, 
30 | scale=SCALE, 31 | ).to(device=device, dtype=dtype) 32 | output = pos_embedder(x, 8, 7) 33 | target = get_spatial_pos_embed(x, D, 8, 7, SCALE) 34 | assert_close(output, target) 35 | output = pos_embedder(x, 15, 16) 36 | target = get_spatial_pos_embed(x, D, 15, 16, SCALE) 37 | assert_close(output, target) 38 | output = pos_embedder(x, 30, 20, base_size=2) 39 | target = get_spatial_pos_embed(x, D, 30, 20, SCALE, base_size=2) 40 | assert_close(output, target) 41 | # test cache 42 | output = pos_embedder(x, 30, 20, base_size=2) 43 | target = get_spatial_pos_embed(x, D, 30, 20, SCALE, base_size=2) 44 | assert_close(output, target) 45 | assert pos_embedder._get_cached_emb.cache_info().hits >= 1 46 | -------------------------------------------------------------------------------- /Modded_Open_Sora/tools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Modded_Open_Sora/tools/__init__.py -------------------------------------------------------------------------------- /Modded_Open_Sora/tools/caption/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Modded_Open_Sora/tools/caption/__init__.py -------------------------------------------------------------------------------- /Modded_Open_Sora/tools/caption/acceleration/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Modded_Open_Sora/tools/caption/acceleration/__init__.py -------------------------------------------------------------------------------- /Modded_Open_Sora/tools/caption/acceleration/llava/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Modded_Open_Sora/tools/caption/acceleration/llava/__init__.py -------------------------------------------------------------------------------- /Modded_Open_Sora/tools/caption/acceleration/llava/policies/__init__.py: -------------------------------------------------------------------------------- 1 | from .llama import LlavaLlamaForCausalLMPolicy 2 | from .mistral import LlavaMistralForCausalLMPolicy 3 | -------------------------------------------------------------------------------- /Modded_Open_Sora/tools/caption/camera_motion/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Modded_Open_Sora/tools/caption/camera_motion/__init__.py -------------------------------------------------------------------------------- /Modded_Open_Sora/tools/caption/camera_motion/detect.py: -------------------------------------------------------------------------------- 1 | # Originally developed by https://github.com/Vchitect/VBench based on https://github.com/facebookresearch/co-tracker. 
2 | 3 | import argparse 4 | from typing import List 5 | 6 | import pandas as pd 7 | 8 | from .camera_motion import compute_camera_motion 9 | 10 | 11 | def process(paths: List[str], threshold: float) -> List[str]: 12 | device = "cuda" 13 | submodules = {"repo": "facebookresearch/co-tracker", "model": "cotracker2"} 14 | camera_motion_types = compute_camera_motion(device, submodules, paths, factor=threshold) 15 | return camera_motion_types 16 | 17 | 18 | def main(args): 19 | output_file = args.input.replace(".csv", "_cmotion.csv") 20 | data = pd.read_csv(args.input) 21 | data["cmotion"] = process(data["path"], args.threshold) 22 | data.to_csv(output_file, index=False) 23 | print(f"Output saved to {output_file}") 24 | 25 | 26 | if __name__ == "__main__": 27 | parser = argparse.ArgumentParser() 28 | parser.add_argument("input", type=str) 29 | parser.add_argument("--threshold", type=float, default=0.25) 30 | args = parser.parse_args() 31 | main(args) 32 | -------------------------------------------------------------------------------- /Modded_Open_Sora/tools/caption/camera_motion/requirements.txt: -------------------------------------------------------------------------------- 1 | decord 2 | ptvsd 3 | imageio-ffmpeg 4 | -------------------------------------------------------------------------------- /Modded_Open_Sora/tools/datasets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Modded_Open_Sora/tools/datasets/__init__.py -------------------------------------------------------------------------------- /Modded_Open_Sora/tools/frame_interpolation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Modded_Open_Sora/tools/frame_interpolation/__init__.py -------------------------------------------------------------------------------- /Modded_Open_Sora/tools/frame_interpolation/networks/__init__.py: -------------------------------------------------------------------------------- 1 | from .amt_g import Model 2 | -------------------------------------------------------------------------------- /Modded_Open_Sora/tools/frame_interpolation/networks/blocks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Modded_Open_Sora/tools/frame_interpolation/networks/blocks/__init__.py -------------------------------------------------------------------------------- /Modded_Open_Sora/tools/frame_interpolation/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Modded_Open_Sora/tools/frame_interpolation/utils/__init__.py -------------------------------------------------------------------------------- /Modded_Open_Sora/tools/frame_interpolation/utils/dist_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import torch 4 | 5 | 6 | def get_world_size(): 7 | """Find OMPI world size without calling mpi functions 8 | :rtype: int 9 | """ 10 | if os.environ.get("PMI_SIZE") is not None: 11 | return int(os.environ.get("PMI_SIZE") or 1) 12 | elif os.environ.get("OMPI_COMM_WORLD_SIZE") is not None: 13 | return 
int(os.environ.get("OMPI_COMM_WORLD_SIZE") or 1) 14 | else: 15 | return torch.cuda.device_count() 16 | 17 | 18 | def get_global_rank(): 19 | """Find OMPI world rank without calling mpi functions 20 | :rtype: int 21 | """ 22 | if os.environ.get("PMI_RANK") is not None: 23 | return int(os.environ.get("PMI_RANK") or 0) 24 | elif os.environ.get("OMPI_COMM_WORLD_RANK") is not None: 25 | return int(os.environ.get("OMPI_COMM_WORLD_RANK") or 0) 26 | else: 27 | return 0 28 | 29 | 30 | def get_local_rank(): 31 | """Find OMPI local rank without calling mpi functions 32 | :rtype: int 33 | """ 34 | if os.environ.get("MPI_LOCALRANKID") is not None: 35 | return int(os.environ.get("MPI_LOCALRANKID") or 0) 36 | elif os.environ.get("OMPI_COMM_WORLD_LOCAL_RANK") is not None: 37 | return int(os.environ.get("OMPI_COMM_WORLD_LOCAL_RANK") or 0) 38 | else: 39 | return 0 40 | 41 | 42 | def get_master_ip(): 43 | if os.environ.get("AZ_BATCH_MASTER_NODE") is not None: 44 | return os.environ.get("AZ_BATCH_MASTER_NODE").split(":")[0] 45 | elif os.environ.get("AZ_BATCHAI_MPI_MASTER_NODE") is not None: 46 | return os.environ.get("AZ_BATCHAI_MPI_MASTER_NODE") 47 | else: 48 | return "127.0.0.1" 49 | -------------------------------------------------------------------------------- /Modded_Open_Sora/tools/scene_cut/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Modded_Open_Sora/tools/scene_cut/__init__.py -------------------------------------------------------------------------------- /Modded_Open_Sora/tools/scoring/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Modded_Open_Sora/tools/scoring/__init__.py -------------------------------------------------------------------------------- /Modded_Open_Sora/tools/scoring/aesthetic/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Modded_Open_Sora/tools/scoring/aesthetic/__init__.py -------------------------------------------------------------------------------- /Modded_Open_Sora/tools/scoring/matching/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Modded_Open_Sora/tools/scoring/matching/__init__.py -------------------------------------------------------------------------------- /Modded_Open_Sora/tools/scoring/ocr/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Modded_Open_Sora/tools/scoring/ocr/__init__.py -------------------------------------------------------------------------------- /Modded_Open_Sora/tools/scoring/optical_flow/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eai-lab/On-device-Sora/e195319158a5fa74e6b85bfc4f810dd5294bf49e/Modded_Open_Sora/tools/scoring/optical_flow/__init__.py -------------------------------------------------------------------------------- /Modded_Open_Sora/tools/scoring/optical_flow/unimatch/__init__.py: 
-------------------------------------------------------------------------------- 1 | from .unimatch import UniMatch 2 | -------------------------------------------------------------------------------- /On-device/On-device-Sora.xcodeproj/project.xcworkspace/contents.xcworkspacedata: -------------------------------------------------------------------------------- 1 | 2 | 4 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /On-device/On-device-Sora.xcodeproj/project.xcworkspace/xcshareddata/swiftpm/Package.resolved: -------------------------------------------------------------------------------- 1 | { 2 | "originHash" : "2f6dd5a54053710d3b7873549f15132c3e0967a00edc7d88c353abb4a237dfec", 3 | "pins" : [ 4 | { 5 | "identity" : "swift-argument-parser", 6 | "kind" : "remoteSourceControl", 7 | "location" : "https://github.com/apple/swift-argument-parser.git", 8 | "state" : { 9 | "revision" : "41982a3656a71c768319979febd796c6fd111d5c", 10 | "version" : "1.5.0" 11 | } 12 | }, 13 | { 14 | "identity" : "swift-transformers", 15 | "kind" : "remoteSourceControl", 16 | "location" : "https://github.com/huggingface/swift-transformers.git", 17 | "state" : { 18 | "revision" : "e72d032ed742dcc8b364780ce4e02b25ab7a09b0", 19 | "version" : "0.1.9" 20 | } 21 | } 22 | ], 23 | "version" : 3 24 | } 25 | -------------------------------------------------------------------------------- /On-device/On-device-Sora/Assets.xcassets/AccentColor.colorset/Contents.json: -------------------------------------------------------------------------------- 1 | { 2 | "colors" : [ 3 | { 4 | "idiom" : "universal" 5 | } 6 | ], 7 | "info" : { 8 | "author" : "xcode", 9 | "version" : 1 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /On-device/On-device-Sora/Assets.xcassets/AppIcon.appiconset/Contents.json: -------------------------------------------------------------------------------- 1 | { 2 | "images" : [ 3 | { 4 | "idiom" : "universal", 5 | "platform" : "ios", 6 | "size" : "1024x1024" 7 | }, 8 | { 9 | "appearances" : [ 10 | { 11 | "appearance" : "luminosity", 12 | "value" : "dark" 13 | } 14 | ], 15 | "idiom" : "universal", 16 | "platform" : "ios", 17 | "size" : "1024x1024" 18 | }, 19 | { 20 | "appearances" : [ 21 | { 22 | "appearance" : "luminosity", 23 | "value" : "tinted" 24 | } 25 | ], 26 | "idiom" : "universal", 27 | "platform" : "ios", 28 | "size" : "1024x1024" 29 | } 30 | ], 31 | "info" : { 32 | "author" : "xcode", 33 | "version" : 1 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /On-device/On-device-Sora/Assets.xcassets/Contents.json: -------------------------------------------------------------------------------- 1 | { 2 | "info" : { 3 | "author" : "xcode", 4 | "version" : 1 5 | } 6 | } 7 | -------------------------------------------------------------------------------- /On-device/On-device-Sora/ManagedMLModel.swift: -------------------------------------------------------------------------------- 1 | import CoreML 2 | 3 | public final class ManagedMLModel { 4 | var modelURL: URL 5 | 6 | var config: MLModelConfiguration 7 | 8 | var loadedModel: MLModel? 
9 | 10 | var queue: DispatchQueue 11 | 12 | public init(modelURL: URL, config: MLModelConfiguration) { 13 | self.modelURL = modelURL 14 | self.config = config 15 | self.loadedModel = nil 16 | self.queue = DispatchQueue(label: "managed.\(modelURL.lastPathComponent)") 17 | } 18 | 19 | public func loadResources() throws { 20 | try queue.sync { 21 | try loadModel() 22 | } 23 | } 24 | 25 | public func unloadResources() { 26 | queue.sync { 27 | loadedModel = nil 28 | } 29 | } 30 | 31 | public func perform<R>(_ body: (MLModel) throws -> R) throws -> R { 32 | return try queue.sync { 33 | try autoreleasepool { 34 | try loadModel() 35 | return try body(loadedModel!) 36 | } 37 | } 38 | } 39 | 40 | private func loadModel() throws { 41 | if loadedModel == nil { 42 | loadedModel = try MLModel(contentsOf: modelURL, configuration: config) 43 | } 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /On-device/On-device-Sora/MemoryInfo.swift: -------------------------------------------------------------------------------- 1 | public class MemoryInfo { 2 | var beforeMemory: Int 3 | var afterMemory: Int 4 | var needMemory: Int 5 | var loadMemory: Int 6 | var remainMemory: Int 7 | var countOfUnload: Int 8 | 9 | init(beforeMemory: Int, afterMemory: Int, needMemory: Int, loadMemory: Int, remainMemory: Int, countOfUnload: Int) { 10 | self.beforeMemory = beforeMemory 11 | self.afterMemory = afterMemory 12 | self.needMemory = needMemory 13 | self.loadMemory = loadMemory 14 | self.remainMemory = remainMemory 15 | self.countOfUnload = countOfUnload 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /On-device/On-device-Sora/On_device_SoraApp.swift: -------------------------------------------------------------------------------- 1 | import SwiftUI 2 | 3 | @main 4 | struct On_device_SoraApp: App { 5 | var body: some Scene { 6 | WindowGroup { 7 | ContentView() 8 | } 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /On-device/On-device-Sora/Preview Content/Preview Assets.xcassets/Contents.json: -------------------------------------------------------------------------------- 1 | { 2 | "info" : { 3 | "author" : "xcode", 4 | "version" : 1 5 | } 6 | } 7 | -------------------------------------------------------------------------------- /On-device/On-device-Sora/RFlowScheduler.swift: -------------------------------------------------------------------------------- 1 | import Accelerate 2 | import CoreML 3 | 4 | public final class RFlowScheduler { 5 | public let numTimesteps: Int 6 | 7 | public init(numTimesteps: Int) { 8 | self.numTimesteps = numTimesteps 9 | } 10 | 11 | public func addNoise(original_samples: MLTensor, noise: MLTensor, timesteps: Float32) -> MLTensor { 12 | var timepoints = MLTensor([1.0 - (Float32(timesteps) / Float32(self.numTimesteps))]) 13 | 14 | timepoints = timepoints.expandingShape(at: 1).expandingShape(at: 1).expandingShape(at: 1).expandingShape(at: 1) 15 | timepoints = timepoints.tiled(multiples: [noise.shape[1], noise.shape[2], noise.shape[3], noise.shape[4]]) 16 | return timepoints * original_samples + (1.0 - timepoints) * noise 17 | 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /On-device/On-device-Sora/T5Tokenizer.swift: -------------------------------------------------------------------------------- 1 | import Foundation 2 | import Hub 3 | import Tokenizers 4 | 5 | /// Extension to swift-transformers
Hub.swift to load local Config files 6 | public extension Config { 7 | /// Assumes the file is already present at local url. 8 | /// `fileURL` is a complete local file path for the given model 9 | public init(fileURL: URL) throws { 10 | let data = try Data(contentsOf: fileURL) 11 | let parsed = try JSONSerialization.jsonObject(with: data, options: []) 12 | guard var dictionary = parsed as? [String: Any] else { throw Hub.HubClientError.parse } 13 | 14 | // Necessary override for loading local tokenizer configs 15 | dictionary["tokenizer_class"] = "T5Tokenizer" 16 | self.init(dictionary) 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /On-device/On-device-Sora/VideoPlayerView.swift: -------------------------------------------------------------------------------- 1 | import SwiftUI 2 | import AVKit 3 | 4 | struct VideoPlayerView: View { 5 | let url: URL 6 | // Keep a single AVPlayer so onAppear starts the same instance that VideoPlayer renders 7 | @State private var player: AVPlayer? 8 | 9 | var body: some View { 10 | VideoPlayer(player: player) 11 | .aspectRatio(contentMode: .fit) 12 | .onAppear { 13 | // create the player once, then run the video player 14 | if player == nil { player = AVPlayer(url: url) } 15 | player?.play() 16 | } 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /On-device/README.md: -------------------------------------------------------------------------------- 1 | # On-Device-Sora 2 | 3 | ### Requirements 4 | * Mac device with Xcode 5 | * Apple Account to build and launch the app 6 | * iPhone: iPhone 15 Pro or later 7 | * iOS version: 18 or later 8 | * All MLPackages (T5, STDiT, VAE) 9 | 10 | ### Download converted MLPackages (if you don't want to convert each model to an MLPackage yourself) 11 | 12 | You can download and use the converted models from the following link. [[Download](https://drive.google.com/drive/folders/1L6pVi3KmyLygR_pvKofRL-21adKsEb4p?usp=sharing)] 13 | 14 | ### Run the app 15 | * Open the Xcode project by clicking On-device/On-device-Sora.xcodeproj 16 | * Change the Team (None -> your Apple account) in TARGETS/Signing & Capabilities 17 | * Launch the app 18 | --------------------------------------------------------------------------------