├── .github └── ISSUE_TEMPLATE │ ├── bug-report.md │ └── feature_request.md ├── .gitignore ├── LICENSE ├── README.md └── vision ├── Taichu-GLIDE ├── README.md ├── data │ └── prompts.txt ├── model │ ├── glide_text2im │ │ ├── __init__.py │ │ ├── custom_types.py │ │ ├── default_options.py │ │ ├── diffusion_creator.py │ │ ├── gaussian_computation.py │ │ ├── losses.py │ │ ├── main_funcs.py │ │ ├── model │ │ │ ├── __init__.py │ │ │ ├── gaussian_diffusion.py │ │ │ ├── guider.py │ │ │ ├── simple_nn.py │ │ │ ├── srgan.py │ │ │ ├── srgan_util.py │ │ │ ├── text2im_model.py │ │ │ ├── train_model.py │ │ │ ├── unet.py │ │ │ └── xf.py │ │ ├── model_creation.py │ │ ├── model_creator.py │ │ ├── tokenizer │ │ │ ├── __init__.py │ │ │ ├── bpe.py │ │ │ ├── caption_to_tokens.py │ │ │ └── chinese_tokenizer.py │ │ └── train │ │ │ ├── Loader.py │ │ │ ├── __init__.py │ │ │ ├── build_optimizer.py │ │ │ ├── cell_wrapper.py │ │ │ ├── config.py │ │ │ ├── config.yml │ │ │ ├── data_loader.py │ │ │ ├── generator.py │ │ │ ├── image_datasets.py │ │ │ ├── logger.py │ │ │ ├── parallel_transformer.py │ │ │ ├── resample.py │ │ │ ├── sampler.py │ │ │ ├── t2ids.py │ │ │ └── train_util.py │ └── glide_utils │ │ ├── __init__.py │ │ ├── callbackConfig.py │ │ ├── img_utils.py │ │ ├── learn_utils.py │ │ ├── moxing_adapter.py │ │ ├── parallelConfig.py │ │ └── parallel_utils.py ├── model_configs │ ├── model_config.json │ └── supres_model_config.json ├── requirements.txt ├── scripts │ ├── run_gen_finetune_dist.sh │ ├── run_infer.sh │ └── run_super_res_finetune_dist.sh └── src │ ├── train_txt2img.py │ └── txt2img.py ├── stablediffusionv2 ├── README.md ├── __init__.py ├── configs │ ├── train_config.json │ ├── v2-inference.yaml │ └── v2-train.yaml ├── demo │ ├── city1.png │ ├── city2.png │ ├── horse1.png │ ├── horse2.png │ ├── sunflower1.png │ ├── sunflower2.png │ ├── tree1.png │ └── tree2.png ├── ldm │ ├── data │ │ ├── __init__.py │ │ ├── dataset.py │ │ └── t2i_collate.py │ ├── models │ │ ├── autoencoder.py │ │ ├── clip_zh │ │ │ ├── __init__.py │ │ │ ├── bpe_simple_vocab_16e6.txt.gz │ │ │ └── simple_tokenizer.py │ │ └── diffusion │ │ │ ├── __init__.py │ │ │ ├── ddpm.py │ │ │ ├── dpm_solver │ │ │ ├── __init__.py │ │ │ ├── dpm_solver.py │ │ │ └── sampler.py │ │ │ └── plms.py │ ├── modules │ │ ├── attention.py │ │ ├── diffusionmodules │ │ │ ├── __init__.py │ │ │ ├── model.py │ │ │ ├── openaimodel.py │ │ │ ├── upscaling.py │ │ │ └── util.py │ │ ├── distributions │ │ │ ├── __init__.py │ │ │ └── distributions.py │ │ ├── encoders │ │ │ ├── __init__.py │ │ │ ├── modules.py │ │ │ └── text_encoder.py │ │ └── train │ │ │ ├── callback.py │ │ │ ├── cell_wrapper.py │ │ │ ├── learningrate.py │ │ │ ├── optim.py │ │ │ ├── parallel_config.py │ │ │ ├── tools.py │ │ │ └── utils.py │ └── util.py ├── requirements.txt ├── run_train.py ├── scripts │ └── infer.sh └── txt2img.py └── wukong-huahua ├── README.md ├── README_EN.md ├── configs ├── clip-vit-l-14-zh │ └── config.json ├── train_config.json ├── train_db_config.json ├── v1-inference-chinese-lora.yaml ├── v1-inference-chinese.yaml ├── v1-train-chinese-lora.yaml ├── v1-train-chinese.yaml ├── v1-train-db-chinese.yaml └── wukong-huahua_inpaint_inference.yaml ├── demo ├── inpaint │ ├── overture-creations-5sI6fQgYIuo.png │ ├── overture-creations-5sI6fQgYIuo_mask.png │ └── 一只红色的狐狸坐在长椅上.png ├── 个性化生成效果-猫.jpg ├── 个性化训练数据-猫.jpg ├── 乡村 田野 屏保.png ├── 城市夜景 赛博朋克 格雷格·鲁特科夫斯基.png ├── 效果图合集.jpg ├── 时空 黑洞 辐射.png ├── 来自深渊 风景 绘画 写实风格.png ├── 海上日出时候的奔跑者.png ├── 莫奈 撑阳伞的女人 月亮 梦幻.png └── 诺亚方舟在世界末日起航 科幻插画.png ├── inpaint.py ├── ldm 
├── data │ ├── dataset.py │ ├── dataset_db.py │ └── t2i_collate.py ├── models │ ├── autoencoder.py │ ├── clip_zh │ │ ├── bpe_simple_vocab_16e6.txt.gz │ │ ├── simple_tokenizer.py │ │ ├── utils.py │ │ └── vocab_zh.txt │ └── diffusion │ │ ├── ddpm.py │ │ ├── dpm_solver │ │ ├── __init__.py │ │ ├── dpm_solver.py │ │ └── sampler.py │ │ └── plms.py ├── modules │ ├── attention.py │ ├── diffusionmodules │ │ ├── model.py │ │ ├── openaimodel.py │ │ └── util.py │ ├── distributions │ │ └── distributions.py │ ├── encoders │ │ ├── modules.py │ │ └── text_encoder.py │ └── train │ │ ├── callback.py │ │ ├── cell_wrapper.py │ │ ├── learningrate.py │ │ ├── optim.py │ │ ├── parallel_config.py │ │ ├── tools.py │ │ └── utils.py └── util.py ├── requirements.txt ├── run_db_train.py ├── run_train.py ├── scripts ├── run_db_train.sh ├── run_inpaint.sh ├── run_train.sh ├── run_train_lora.sh ├── run_train_parallel.sh ├── run_train_parallel_lora.sh ├── run_txt2img.sh └── run_txt2img_lora.sh └── txt2img.py /.github/ISSUE_TEMPLATE/bug-report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: BUG report 3 | about: BUG反馈 4 | title: '' 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | If this is your first time, please read our contributor guidelines: 11 | https://github.com/mindspore-lab/mindcv/blob/main/CONTRIBUTING.md 12 | 13 | **Describe the bug/ 问题描述 (Mandatory / 必填)** 14 | A clear and concise description of what the bug is. 15 | 16 | - **Hardware Environment(`Ascend`/`GPU`/`CPU`) / 硬件环境**: 17 | > Please delete the backend not involved / 请删除不涉及的后端: 18 | > /device ascend/GPU/CPU/kirin/等其他芯片 19 | 20 | - **Software Environment / 软件环境 (Mandatory / 必填)**: 21 | -- MindSpore version (e.g., 1.7.0.Bxxx) : 22 | -- Python version (e.g., Python 3.7.5) : 23 | -- OS platform and distribution (e.g., Linux Ubuntu 16.04): 24 | -- GCC/Compiler version (if compiled from source): 25 | 26 | - **Excute Mode / 执行模式 (Mandatory / 必填)(`PyNative`/`Graph`)**: 27 | > Please delete the mode not involved / 请删除不涉及的模式: 28 | > /mode pynative 29 | > /mode graph 30 | 31 | **To Reproduce / 重现步骤 (Mandatory / 必填)** 32 | Steps to reproduce the behavior: 33 | 1. Go to '...' 34 | 2. Click on '....' 35 | 3. Scroll down to '....' 36 | 4. See error 37 | 38 | **Expected behavior / 预期结果 (Mandatory / 必填)** 39 | A clear and concise description of what you expected to happen. 40 | 41 | **Screenshots/ 日志 / 截图 (Mandatory / 必填)** 42 | If applicable, add screenshots to help explain your problem. 43 | 44 | **Additional context / 备注 (Optional / 选填)** 45 | Add any other context about the problem here. 46 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: 需求特性反馈 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | If this is your first time, please read our contributor guidelines: https://gitee.com/mindspore/mindspore/blob/master/CONTRIBUTING.md 11 | 12 | **Is your feature request related to a problem? Please describe.** 13 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 14 | 15 | **Describe the solution you'd like** 16 | A clear and concise description of what you want to happen. 17 | 18 | **Describe alternatives you've considered** 19 | A clear and concise description of any alternative solutions or features you've considered. 
20 | 21 | **Additional context** 22 | Add any other context or screenshots about the feature request here. 23 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # obsidian 132 | .obsidian/ 133 | 134 | *.ckpt 135 | rank_? 136 | output 137 | .DS_Store 138 | 139 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | ## NEWS 3 | 4 | minddiffusion is no longer updated. 
5 | All diffusion models and generative models will be provided in the new repo mindone
6 | https://github.com/mindspore-lab/mindone
7 |
--------------------------------------------------------------------------------
/vision/Taichu-GLIDE/README.md:
--------------------------------------------------------------------------------
1 | # Taichu-GLIDE
2 | ## Model Introduction
3 | Taichu-GLIDE is a Chinese text-to-image model (part of the Zidong.Taichu (紫东.太初) series) developed by **Huawei Ascend Computing** together with the **Wuhan Artificial Intelligence Research Institute** and the **Institute of Automation, Chinese Academy of Sciences**, built on the full Ascend + MindSpore stack. The model uses the diffusion-model technique currently popular in AIGC; both the code and the pretrained weights are open source, and developers can use this repository to try out text-to-image generation.
4 |
5 |
6 | ![一幅画着柯基的油画](https://user-images.githubusercontent.com/17930313/206085057-e079d90a-3313-4b9a-9e1c-f67a0594245d.png)
7 | **An oil painting of a corgi (prompt: 一幅画着柯基的油画)**
8 |
9 | ## Requirements
10 |
11 | 1. **Install CANN (version 5.1.RC2) together with the matching driver and firmware** \
12 | Download the packages from the Ascend community: \
13 | \
14 | The example assumes an ARM + EulerOS configuration (choose the x86 packages for x86 systems)
15 |
16 | 2. **Install MindSpore 1.8.1** \
17 | Go to the MindSpore website and install the matching version following the tutorial, linked below: \
18 |
19 |
20 | 3. **Install the requirements** \
21 | pip install -r requirements.txt
22 |
23 | ## Quick Start
24 |
25 | ### Inference
26 | - First [click here](https://download.mindspore.cn/toolkits/minddiffusion/Taichu-GLIDE/) to download the ckpt files
27 | - Add the prompts you want to generate to data/prompts.txt
28 | - Adjust the relevant paths and settings in scripts/run_infer.sh
29 | ```bash
30 | bash scripts/run_infer.sh
31 | ```
32 | ### Training
33 |
34 | ```bash
35 | # distributed training, generation stage
36 | bash scripts/run_gen_finetune_dist.sh /path/hccl_xp_xxxx.json [DEVICE_NUM] [DEVICE_START]
37 | ```
38 |
39 | ```bash
40 | # distributed training, super-resolution stage
41 | bash scripts/run_super_res_finetune_dist.sh /path/hccl_xp_xxxx.json [DEVICE_NUM] [DEVICE_START]
42 | ```
43 |
--------------------------------------------------------------------------------
/vision/Taichu-GLIDE/data/prompts.txt:
--------------------------------------------------------------------------------
1 | 一张画着柯基的油画
2 | 一只可爱的猫坐在草地上
--------------------------------------------------------------------------------
/vision/Taichu-GLIDE/model/glide_text2im/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | A codebase for performing model inference with a text-conditional diffusion model.
3 | """
4 |
--------------------------------------------------------------------------------
/vision/Taichu-GLIDE/model/glide_text2im/custom_types.py:
--------------------------------------------------------------------------------
1 | # Copyright 2022 Huawei Technologies Co., Ltd
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ============================================================================
15 |
16 | import enum
17 |
18 |
19 | class ModelMeanType(enum.Enum):
20 | """
21 | Which type of output the model predicts.
22 | """
23 |
24 | PREVIOUS_X = enum.auto()  # the model predicts x_{t-1}
25 | START_X = enum.auto()  # the model predicts x_0
26 | EPSILON = enum.auto()  # the model predicts epsilon
27 |
28 |
29 | class ModelVarType(enum.Enum):
30 | """
31 | What is used as the model's output variance.
32 | 33 | The LEARNED_RANGE option has been added to allow the model to predict 34 | values between FIXED_SMALL and FIXED_LARGE, making its job easier. 35 | """ 36 | 37 | LEARNED = enum.auto() 38 | FIXED_SMALL = enum.auto() 39 | FIXED_LARGE = enum.auto() 40 | LEARNED_RANGE = enum.auto() 41 | 42 | 43 | class LossType(enum.Enum): 44 | MSE = enum.auto() # use raw MSE loss (and KL when learning variances) 45 | BALANCED_MSE = enum.auto() 46 | RESCALED_MSE = ( 47 | enum.auto() 48 | ) # use raw MSE loss (with RESCALED_KL when learning variances) 49 | KL = enum.auto() # use the variational lower-bound 50 | RESCALED_KL = enum.auto() # like KL, but rescale to estimate the full VLB 51 | 52 | def is_vb(self): 53 | return self == LossType.KL or self == LossType.RESCALED_KL 54 | -------------------------------------------------------------------------------- /vision/Taichu-GLIDE/model/glide_text2im/default_options.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 Huawei Technologies Co., Ltd 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================ 15 | 16 | import mindspore 17 | 18 | 19 | def model_and_diffusion_defaults( 20 | image_size=64, 21 | num_channels=192, 22 | num_res_blocks=3, 23 | channel_mult=(1, 2, 3, 4), 24 | num_heads=1, 25 | num_head_channels=64, 26 | num_heads_upsample=-1, 27 | attention_resolutions=tuple([2, 4, 8]), 28 | dropout=0.9, 29 | text_ctx=128, 30 | xf_width=512, 31 | xf_layers=16, 32 | xf_heads=8, 33 | xf_final_ln=True, 34 | n_vocab=50001, 35 | xf_padding=True, 36 | diffusion_steps=1000, 37 | noise_schedule="squaredcos_cap_v2", 38 | timestep_respacing="60", 39 | use_scale_shift_norm=True, 40 | resblock_updown=True, 41 | use_fp16=True, 42 | cache_text_emb=False, 43 | inpaint=False, 44 | super_res=False, 45 | chinese=True, 46 | sketch=False, 47 | class_balanced=False, 48 | sketch_classes=0, 49 | dtype=mindspore.float32 50 | ): 51 | return dict( 52 | image_size=image_size, 53 | num_channels=num_channels, 54 | num_res_blocks=num_res_blocks, 55 | channel_mult=channel_mult, 56 | num_heads=num_heads, 57 | num_head_channels=num_head_channels, 58 | num_heads_upsample=num_heads_upsample, 59 | attention_resolutions=attention_resolutions, 60 | dropout=dropout, 61 | text_ctx=text_ctx, 62 | xf_width=xf_width, 63 | xf_layers=xf_layers, 64 | xf_heads=xf_heads, 65 | xf_final_ln=xf_final_ln, 66 | n_vocab=n_vocab, 67 | xf_padding=xf_padding, 68 | diffusion_steps=diffusion_steps, 69 | noise_schedule=noise_schedule, 70 | timestep_respacing=timestep_respacing, 71 | use_scale_shift_norm=use_scale_shift_norm, 72 | resblock_updown=resblock_updown, 73 | use_fp16=use_fp16, 74 | cache_text_emb=cache_text_emb, 75 | inpaint=inpaint, 76 | super_res=super_res, 77 | chinese=chinese, 78 | sketch=sketch, 79 | class_balanced=class_balanced, 80 | sketch_classes=sketch_classes, 81 | dtype=dtype 82 | ) 83 | 84 | 85 | def model_and_diffusion_upsample( 
86 | image_size=256, 87 | num_channels=192, 88 | num_res_blocks=2, 89 | channel_mult=(1,1,2,2,4,4), 90 | num_heads=1, 91 | num_head_channels=64, 92 | num_heads_upsample=-1, 93 | attention_resolutions=tuple([32, 16, 8]), 94 | dropout=0.0, 95 | text_ctx=128, 96 | xf_width=512, 97 | xf_layers=16, 98 | xf_heads=8, 99 | xf_final_ln=True, 100 | n_vocab=50257, 101 | xf_padding=True, 102 | diffusion_steps=1000, 103 | noise_schedule="linear", 104 | timestep_respacing="fast27", 105 | use_scale_shift_norm=True, 106 | resblock_updown=True, 107 | use_fp16=True, 108 | cache_text_emb=False, 109 | inpaint=False, 110 | super_res=False, 111 | chinese=False, 112 | sketch=False, 113 | class_balanced=False, 114 | sketch_classes=0, 115 | dtype=mindspore.float32 116 | ): 117 | return dict( 118 | image_size=image_size, 119 | num_channels=num_channels, 120 | num_res_blocks=num_res_blocks, 121 | channel_mult=channel_mult, 122 | num_heads=num_heads, 123 | num_head_channels=num_head_channels, 124 | num_heads_upsample=num_heads_upsample, 125 | attention_resolutions=attention_resolutions, 126 | dropout=dropout, 127 | text_ctx=text_ctx, 128 | xf_width=xf_width, 129 | xf_layers=xf_layers, 130 | xf_heads=xf_heads, 131 | xf_final_ln=xf_final_ln, 132 | n_vocab=n_vocab, 133 | xf_padding=xf_padding, 134 | diffusion_steps=diffusion_steps, 135 | noise_schedule=noise_schedule, 136 | timestep_respacing=timestep_respacing, 137 | use_scale_shift_norm=use_scale_shift_norm, 138 | resblock_updown=resblock_updown, 139 | use_fp16=use_fp16, 140 | cache_text_emb=cache_text_emb, 141 | inpaint=inpaint, 142 | super_res=super_res, 143 | chinese=chinese, 144 | sketch=sketch, 145 | class_balanced=class_balanced, 146 | sketch_classes=sketch_classes, 147 | dtype=dtype 148 | ) 149 | -------------------------------------------------------------------------------- /vision/Taichu-GLIDE/model/glide_text2im/diffusion_creator.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 Huawei Technologies Co., Ltd 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
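# Illustrative sketch (not from the original sources): the two factories above
# return plain dicts so callers can override individual hyper-parameters before
# expanding them into create_model(**options). The override values below are
# hypothetical examples, not settings shipped with this repo.
import mindspore
from model.glide_text2im.default_options import model_and_diffusion_defaults

inference_options = model_and_diffusion_defaults(dtype=mindspore.float16)
inference_options["timestep_respacing"] = "100"  # sample with 100 instead of 1000 denoising steps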
14 | # ============================================================================ 15 | 16 | import mindspore 17 | from mindspore import load_checkpoint 18 | 19 | from model.glide_text2im.gaussian_computation import * 20 | from model.glide_text2im.model.gaussian_diffusion import GenerativePSampleDiffusionModel, DDimSampleDiffusionModel, PMeanVariance 21 | from model.glide_text2im.custom_types import LossType, ModelMeanType 22 | from model.glide_text2im.model.guider import SamplingWithGuidance 23 | from model.glide_text2im.model_creator import create_model, create_upsample_model 24 | 25 | 26 | 27 | def init_diffusion_model(options, guidance_scale, shape, ckpt_path=None): 28 | # init model 29 | model = create_model(**options) 30 | 31 | # init guidance 32 | pics_generated = int(shape[0] / 2) 33 | sampling_with_guidance = SamplingWithGuidance(model, guidance_scale, pics_generated) 34 | 35 | # init diffusion 36 | base_diffusion, _ = create_gaussian_diffusion( 37 | diffusion_steps=options["diffusion_steps"], noise_schedule=options["noise_schedule"], 38 | timestep_respacing=options["timestep_respacing"], class_balanced=options["class_balanced"], 39 | sketch_classes=options["sketch_classes"], guider_net=sampling_with_guidance, 40 | clip_denoised=True, denoised_net=None, dtype=options["dtype"], shape=shape 41 | ) 42 | diffusion_with_p_sample = GenerativePSampleDiffusionModel(base_diffusion, shape=shape, dtype=options["dtype"]) 43 | return diffusion_with_p_sample 44 | 45 | 46 | def init_super_res_model(options, shape, ckpt_path=None): 47 | # init model 48 | up_sample_model = create_upsample_model(**options) 49 | 50 | if ckpt_path: 51 | load_checkpoint(ckpt_path, up_sample_model) 52 | 53 | # init diffusion 54 | base_diffusion, _ = create_gaussian_diffusion( 55 | diffusion_steps=options["diffusion_steps"], noise_schedule=options["noise_schedule"], 56 | timestep_respacing=options["timestep_respacing"], class_balanced=options["class_balanced"], 57 | sketch_classes=options["sketch_classes"], guider_net=up_sample_model, 58 | clip_denoised=True, denoised_net=None, dtype=options["dtype"], shape=shape 59 | ) 60 | diffusion_with_ddim_sample = DDimSampleDiffusionModel(base_diffusion, shape=shape, dtype=options["dtype"]) 61 | return diffusion_with_ddim_sample 62 | 63 | 64 | def create_gaussian_diffusion( 65 | diffusion_steps, # 1000 66 | noise_schedule, 67 | timestep_respacing, # 200 68 | class_balanced, 69 | sketch_classes, 70 | guider_net=None, 71 | clip_denoised=True, 72 | denoised_net=None, 73 | shape=None, 74 | dtype=mindspore.float32 75 | ): 76 | betas = get_named_beta_schedule(noise_schedule, diffusion_steps) # 0-1之间,1000个数 77 | if not timestep_respacing: 78 | timestep_respacing = [diffusion_steps] 79 | if class_balanced: 80 | loss_type = LossType.BALANCED_MSE 81 | else: 82 | loss_type = LossType.MSE 83 | 84 | use_timesteps = space_timesteps(diffusion_steps, timestep_respacing) 85 | alphas_cumprod = alpha_calculator(betas) 86 | timestep_map, new_betas = space_diffusion_from_base(use_timesteps, alphas_cumprod) 87 | 88 | diffusion = PMeanVariance( 89 | guider_net=guider_net, clip_denoised=clip_denoised, denoised_net=denoised_net, timestep_map=timestep_map, 90 | betas=new_betas, model_mean_type=ModelMeanType.EPSILON, loss_type=loss_type, sketch_classes=sketch_classes, 91 | shape=shape, dtype=dtype 92 | ) 93 | return diffusion, betas 94 | 95 | 96 | def space_diffusion_from_base(use_timesteps, alphas_cumprod): 97 | timestep_map = [] 98 | 99 | last_alpha_cumprod = 1.0 100 | new_betas = [] 101 | for i, 
alpha_cumprod in enumerate(alphas_cumprod): 102 | if i in use_timesteps: 103 | new_betas.append(1 - alpha_cumprod / last_alpha_cumprod) 104 | last_alpha_cumprod = alpha_cumprod 105 | timestep_map.append(i) 106 | return timestep_map, np.array(new_betas) 107 | -------------------------------------------------------------------------------- /vision/Taichu-GLIDE/model/glide_text2im/losses.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 Huawei Technologies Co., Ltd 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================ 15 | 16 | import mindspore as ms 17 | from mindspore import Tensor 18 | from mindspore import ops 19 | from mindspore import context 20 | import mindspore.numpy as np 21 | import numpy 22 | 23 | 24 | def normal_kl(mean1, logvar1, mean2, logvar2): 25 | """ 26 | Compute the KL divergence between two gaussians. 27 | 28 | Shapes are automatically broadcasted, so batches can be compared to 29 | scalars, among other use cases. 30 | """ 31 | 32 | exp = ops.Exp() 33 | prints = ops.Print() 34 | pow = ops.Pow() 35 | return 0.5 * ( 36 | -1.0 37 | + logvar2 38 | - logvar1 39 | + exp(logvar1 - logvar2) 40 | + (pow((mean1 - mean2), 2) * exp(-logvar2)) 41 | ) 42 | 43 | 44 | def approx_standard_normal_cdf(x): 45 | """ 46 | A fast approximation of the cumulative distribution function of the 47 | standard normal. 48 | """ 49 | tanh = ops.Tanh() 50 | pow = ops.Pow() 51 | return 0.5 * (1.0 + tanh(np.sqrt(2.0 / np.pi) * (x + 0.044715 * pow(x, 3)))) 52 | 53 | 54 | def discretized_gaussian_log_likelihood(x, *, means, log_scales): 55 | """ 56 | Compute the log-likelihood of a Gaussian distribution discretizing to a 57 | given image. 58 | 59 | :param x: the target images. It is assumed that this was uint8 values, 60 | rescaled to the range [-1, 1]. 61 | :param means: the Gaussian mean Tensor. 62 | :param log_scales: the Gaussian log stddev Tensor. 63 | :return: a tensor like x of log probabilities (in nats). 
64 | """ 65 | exp = ops.Exp() 66 | log = ops.Log() 67 | assert x.shape == means.shape == log_scales.shape 68 | centered_x = x - means 69 | inv_stdv = exp(-log_scales) 70 | plus_in = inv_stdv * (centered_x + 1.0 / 255.0) 71 | cdf_plus = approx_standard_normal_cdf(plus_in) 72 | min_in = inv_stdv * (centered_x - 1.0 / 255.0) 73 | cdf_min = approx_standard_normal_cdf(min_in) 74 | log_cdf_plus = log(ops.clip_by_value(cdf_plus, clip_value_min=1e-12, clip_value_max=1e10)) 75 | log_one_minus_cdf_min = log(ops.clip_by_value((1.0 - cdf_min), clip_value_min=1e-12, 76 | clip_value_max=1e10)) 77 | cdf_delta = cdf_plus - cdf_min 78 | log_probs = np.where( 79 | x < -0.999, 80 | log_cdf_plus, 81 | np.where(x > 0.999, log_one_minus_cdf_min, log(ops.clip_by_value( 82 | cdf_delta, clip_value_min=1e-12, clip_value_max=1e10))), 83 | ) 84 | assert log_probs.shape == x.shape 85 | return log_probs 86 | 87 | if __name__ == "__main__": 88 | x = Tensor(numpy.random.standard_normal(10,).astype(numpy.float32)) 89 | y = Tensor(numpy.random.standard_normal(10,).astype(numpy.float32)) 90 | m1 = Tensor(numpy.random.standard_normal(10,).astype(numpy.float32)) 91 | m2 = Tensor(numpy.random.standard_normal(10,).astype(numpy.float32)) -------------------------------------------------------------------------------- /vision/Taichu-GLIDE/model/glide_text2im/main_funcs.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 Huawei Technologies Co., Ltd 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
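# Illustrative smoke test (not from the original sources; assumes a working
# MindSpore install with this repo on the import path). normal_kl() above
# computes the elementwise KL(N(mean1, e^logvar1) || N(mean2, e^logvar2)), so
# for two identical Gaussians every element must be zero:
import numpy
from mindspore import Tensor
from model.glide_text2im.losses import normal_kl

mean = Tensor(numpy.zeros(4, dtype=numpy.float32))
logvar = Tensor(numpy.zeros(4, dtype=numpy.float32))
kl = normal_kl(mean, logvar, mean, logvar)
assert float(kl.sum().asnumpy()) < 1e-6  # KL of a distribution with itself is 0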
14 | # ============================================================================ 15 | 16 | import mindspore 17 | from tqdm.auto import tqdm 18 | from random import choice 19 | 20 | 21 | def gaussian_p_sample_loop(diffusion_model, token, mask, shape, num_timesteps, tokenizer, text_ctx, 22 | noise=None, progress=False, dtype=mindspore.float32, vocab_len=50001): 23 | # init original image(pure noise) 24 | if noise is not None: 25 | img = noise 26 | else: 27 | img = mindspore.ops.StandardNormal()(shape) 28 | img = mindspore.ops.Cast()(img, dtype) 29 | indices = list(range(num_timesteps))[::-1] 30 | 31 | # visualized progress bar 32 | if progress: 33 | indices = tqdm(indices) 34 | 35 | # recursively de-noising on img 36 | for i in indices: 37 | random_token_tensor = mindspore.numpy.randint(1, vocab_len-1, (text_ctx,), dtype=mindspore.int32) 38 | random_mask_tensor = mindspore.numpy.ones((text_ctx,), mindspore.int32) 39 | i_tensor = mindspore.Tensor([i], dtype=mindspore.int32) 40 | sample, _ = diffusion_model(x=img, timesteps=i_tensor, token=token, mask=mask, 41 | random_token=random_token_tensor, random_mask=random_mask_tensor) 42 | img = sample 43 | 44 | return img 45 | 46 | 47 | def ddim_sample_loop(super_res_model, up_shape, samples, token, mask, num_timesteps, noise=None, progress=False, 48 | dtype=mindspore.float32): 49 | # init original image(pure noise) 50 | if noise is not None: 51 | img = noise 52 | else: 53 | upsample_temp = 0.997 54 | img = mindspore.ops.StandardNormal()(up_shape) 55 | img = mindspore.ops.Mul()(img, upsample_temp) 56 | img = mindspore.ops.Cast()(img, dtype) 57 | 58 | indices = list(range(num_timesteps))[::-1] 59 | 60 | # visualized progress bar 61 | if progress: 62 | indices = tqdm(indices) 63 | 64 | for i in indices: 65 | i_tensor = mindspore.Tensor(input_data=[i], dtype=mindspore.int32) 66 | sample, _ = super_res_model(x=img, timesteps=i_tensor, token=token, mask=mask, samples=samples) 67 | img = sample 68 | 69 | return img 70 | 71 | -------------------------------------------------------------------------------- /vision/Taichu-GLIDE/model/glide_text2im/model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mindspore-lab/minddiffusion/5779fd04c17f60f277fa88e635287fcc1dd4ecc5/vision/Taichu-GLIDE/model/glide_text2im/model/__init__.py -------------------------------------------------------------------------------- /vision/Taichu-GLIDE/model/glide_text2im/model/guider.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 Huawei Technologies Co., Ltd 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ============================================================================ 15 | 16 | import mindspore.nn as nn 17 | import mindspore 18 | 19 | 20 | class CombinePrompt(nn.Cell): 21 | def __init__(self, pics_generated): 22 | super(CombinePrompt, self).__init__() 23 | # model attributes 24 | self.pics_generated = pics_generated 25 | 26 | # operations 27 | self.slice = mindspore.ops.Slice() 28 | self.concat = mindspore.ops.Concat(axis=0) 29 | self.broadcast_to = mindspore.ops.BroadcastTo((pics_generated, 128)) 30 | self.cast = mindspore.ops.Cast() 31 | 32 | ''' 33 | x_t: tensor 34 | kwargs: dict, {tokens : num_of_pics*2 x 128 tensor, mask : num_of_pics*2 x 128 tensor} 35 | ''' 36 | def construct(self, x_t, in_token, in_mask, random_token, random_mask): 37 | # computes 38 | first_half_tokens = self.slice(in_token, (0, 0), (self.pics_generated, in_token.shape[1])) 39 | first_half_mask = self.slice(in_mask, (0, 0), (self.pics_generated, in_mask.shape[1])) 40 | 41 | _, channels, img_h, img_w = x_t.shape 42 | half = self.slice(x_t, (0, 0, 0, 0), (self.pics_generated, channels, img_h, img_w)) 43 | combined = self.concat((half, half)) 44 | 45 | last_half_tokens = self.broadcast_to(random_token) 46 | last_half_mask = self.broadcast_to(random_mask) 47 | tokens = self.concat((first_half_tokens, last_half_tokens)) 48 | mask = self.concat((first_half_mask, last_half_mask)) 49 | 50 | return combined, tokens, mask 51 | 52 | 53 | class Guider(nn.Cell): 54 | def __init__(self, guidance_scale): 55 | super(Guider, self).__init__() 56 | # model attributes 57 | self.guidance_scale = guidance_scale 58 | 59 | # operations 60 | self.slice = mindspore.ops.Slice() 61 | self.concat = mindspore.ops.Concat(axis=0) 62 | self.concat_at_1 = mindspore.ops.Concat(axis=1) 63 | self.split = mindspore.ops.Split(axis=0, output_num=2) 64 | self.add = mindspore.ops.Add() 65 | self.mul = mindspore.ops.Mul() 66 | self.neg = mindspore.ops.Neg() 67 | 68 | ''' 69 | x_t: tensor 70 | ts: tensor 71 | kwargs: dict, {tokens : num_of_pics*2 x 128 tensor, mask : num_of_pics*2 x 128 tensor} 72 | ''' 73 | def construct(self, model_out): 74 | modelout_shape = model_out.shape 75 | eps = self.slice(model_out, (0, 0, 0, 0), (modelout_shape[0], 3, modelout_shape[2], modelout_shape[3])) 76 | rest = self.slice(model_out, (0, 3, 0, 0), (modelout_shape[0], 3, modelout_shape[2], modelout_shape[3])) 77 | 78 | cond_eps, uncond_eps = self.split(eps) 79 | 80 | diff_eps = self.add(cond_eps, self.neg(uncond_eps)) 81 | scaled_diff_epq = self.mul(self.guidance_scale, diff_eps) 82 | half_eps = self.add(uncond_eps, scaled_diff_epq) 83 | eps = self.concat((half_eps, half_eps)) 84 | out = self.concat_at_1((eps, rest)) 85 | 86 | return out 87 | 88 | 89 | class SamplingWithGuidance(nn.Cell): 90 | def __init__(self, model, guidance_scale, num_of_pics_generated): 91 | super(SamplingWithGuidance, self).__init__() 92 | self.combine_prompt = CombinePrompt(num_of_pics_generated) 93 | self.model = model 94 | self.guider = Guider(guidance_scale) 95 | self.broadcast_to = mindspore.ops.BroadcastTo((num_of_pics_generated * 2,)) 96 | self.concat = mindspore.ops.Concat(axis=1) 97 | 98 | def construct(self, x_t, timesteps, in_token, in_mask, random_token, random_mask): 99 | combined, tokens, mask = self.combine_prompt(x_t, in_token, in_mask, random_token, random_mask) 100 | timesteps = self.broadcast_to(timesteps) 101 | model_out = self.model(combined, timesteps, tokens, mask) 102 | out = self.guider(model_out) 103 | return out 104 | 
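# Illustrative check of the guidance arithmetic (not from the original sources).
# Guider above implements classifier-free guidance on the epsilon half of the
# model output: eps = uncond_eps + scale * (cond_eps - uncond_eps), after which
# the guided eps is duplicated across both halves of the batch. A NumPy mirror
# of that update with toy values:
import numpy as np

def guided_eps(cond_eps, uncond_eps, scale):
    return uncond_eps + scale * (cond_eps - uncond_eps)

cond = np.array([1.0, 2.0])    # epsilon predicted with the text condition
uncond = np.array([0.0, 0.0])  # epsilon predicted with random/empty tokens
print(guided_eps(cond, uncond, 3.0))  # [3. 6.] -- scale > 1 pushes samples toward the condition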
-------------------------------------------------------------------------------- /vision/Taichu-GLIDE/model/glide_text2im/model/srgan.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 Huawei Technologies Co., Ltd 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================ 15 | 16 | import math 17 | import mindspore.nn as nn 18 | import mindspore.ops as ops 19 | from mindspore.common import initializer as init 20 | 21 | 22 | def init_weights(net, init_type='normal', init_gain=0.1): 23 | """ 24 | Initialize network weights 25 | """ 26 | for _, cell in net.cells_and_names(): 27 | if isinstance(cell, (nn.Conv2d, nn.Conv2dTranspose)): 28 | if init_type == 'normal': 29 | cell.weight.set_data(init.initializer(init.Normal(init_gain), cell.weight.shape)) 30 | elif init_type == 'xavier': 31 | cell.weight.set_data(init.initializer(init.XavierUniform(init_gain), cell.weight.shape)) 32 | elif init_type == 'constant': 33 | cell.weight.set_data(init.initializer(0.001, cell.weight.shape)) 34 | else: 35 | raise NotImplementedError('initialization method [%s] is not implemented' % init_type) 36 | elif isinstance(cell, nn.BatchNorm2d): 37 | cell.gamma.set_data(init.initializer('ones', cell.gamma.shape)) 38 | cell.beta.set_data(init.initializer('zeros', cell.beta.shape)) 39 | 40 | 41 | class ResidualBlock(nn.Cell): 42 | """Structure of ResidualBlock""" 43 | def __init__(self, channels): 44 | super(ResidualBlock, self).__init__() 45 | self.conv1 = nn.Conv2d(channels, channels, kernel_size=3, stride=1, padding=1, has_bias=True, pad_mode='pad') 46 | self.bn1 = nn.BatchNorm2d(channels) 47 | self.prelu = nn.PReLU(channels) 48 | self.conv2 = nn.Conv2d(channels, channels, kernel_size=3, stride=1, padding=1, has_bias=True, pad_mode='pad') 49 | self.bn2 = nn.BatchNorm2d(channels) 50 | 51 | def construct(self, x): 52 | out = self.conv1(x) 53 | out = self.bn1(out) 54 | out = self.prelu(out) 55 | out = self.conv2(out) 56 | out = self.bn2(out) 57 | return out + x 58 | 59 | 60 | class SubpixelConvolutionLayer(nn.Cell): 61 | """Structure of SubpixelConvolutionLayer""" 62 | def __init__(self, channels): 63 | super(SubpixelConvolutionLayer, self).__init__() 64 | self.conv = nn.Conv2d(channels, channels*4, kernel_size=3, stride=1, padding=1, has_bias=True, pad_mode='pad') 65 | self.pixel_shuffle = ops.DepthToSpace(2) 66 | self.prelu = nn.PReLU(channels) 67 | 68 | def construct(self, x): 69 | out = self.conv(x) 70 | out = self.pixel_shuffle(out) 71 | out = self.prelu(out) 72 | return out 73 | 74 | 75 | class Generator(nn.Cell): 76 | """Structure of Generator""" 77 | def __init__(self, upscale_factor): 78 | 79 | super(Generator, self).__init__() 80 | # Calculating the number of subpixel convolution layers. 81 | num_subpixel_convolution_layers = int(math.log(upscale_factor, 2)) 82 | # First layer. 
83 | self.conv1 = nn.SequentialCell( 84 | nn.Conv2d(3, 64, kernel_size=9, stride=1, padding=4, has_bias=True, pad_mode='pad'), 85 | nn.PReLU(channel=64)) 86 | 87 | # 16 Residual blocks 88 | trunk = [] 89 | for _ in range(16): 90 | trunk.append(ResidualBlock(64)) 91 | self.trunk = nn.SequentialCell(*trunk) 92 | 93 | # Second conv layer post residual blocks. 94 | self.conv2 = nn.SequentialCell( 95 | nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1, has_bias=True, pad_mode='pad'), 96 | nn.PReLU(channel=64) 97 | ) 98 | 99 | # 2 Sub-pixel convolution layers. 100 | subpixel_conv_layers = [] 101 | for _ in range(num_subpixel_convolution_layers): 102 | subpixel_conv_layers.append(SubpixelConvolutionLayer(64)) 103 | self.subpixel_conv = nn.SequentialCell(*subpixel_conv_layers) 104 | 105 | # Final output layer. 106 | self.conv3 = nn.Conv2d(64, 3, kernel_size=9, stride=1, padding=4, has_bias=True, pad_mode='pad') 107 | self.tanh = nn.Tanh() 108 | 109 | def construct(self, x): 110 | conv1 = self.conv1(x) 111 | trunk = self.trunk(conv1) 112 | conv2 = self.conv2(trunk) 113 | out = conv1+conv2 114 | out = self.subpixel_conv(out) 115 | out = self.conv3(out) 116 | out = self.tanh(out) 117 | return out 118 | 119 | 120 | def get_generator(upscale_factor, init_gain): 121 | net = Generator(upscale_factor) 122 | init_weights(net, 'normal', init_gain) 123 | return net 124 | -------------------------------------------------------------------------------- /vision/Taichu-GLIDE/model/glide_text2im/model/srgan_util.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 Huawei Technologies Co., Ltd 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
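# Illustrative shape check (not from the original sources; assumes a working
# MindSpore install). Generator above stacks int(log2(upscale_factor))
# SubpixelConvolutionLayer blocks, and each DepthToSpace(2) doubles height and
# width, so upscale_factor=4 turns a 64x64 input into 256x256:
import numpy as np
import mindspore
from model.glide_text2im.model.srgan import get_generator

srgan = get_generator(upscale_factor=4, init_gain=0.02)
low_res = mindspore.Tensor(np.random.rand(1, 3, 64, 64).astype(np.float32))
print(srgan(low_res).shape)  # (1, 3, 256, 256)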
14 | # ============================================================================ 15 | 16 | from mindspore import ops 17 | from PIL import Image 18 | import numpy as np 19 | import mindspore 20 | from mindspore import load_checkpoint, load_param_into_net, Tensor 21 | 22 | from .srgan import Generator 23 | 24 | 25 | def get_img(batch: mindspore.Tensor): 26 | batch_plus = mindspore.ops.Add()(batch, 1) 27 | scaled = mindspore.ops.Mul()(batch_plus, 127.5) 28 | rounded_scaled = mindspore.ops.Rint()(scaled) 29 | clipped_scaled = mindspore.ops.clip_by_value(rounded_scaled, mindspore.Tensor(0), mindspore.Tensor(255)) 30 | clipped_scaled = clipped_scaled.transpose((2, 0, 3, 1)) 31 | clipped_scaled = mindspore.ops.Cast()(clipped_scaled, mindspore.uint8) 32 | reshaped = clipped_scaled.reshape(([batch.shape[2], -1, 3])) 33 | return reshaped 34 | 35 | 36 | class SRGAN(): 37 | def __init__(self, upscale_factor, ckpt_path): 38 | self.net = Generator(upscale_factor) 39 | params = load_checkpoint(ckpt_path) 40 | load_param_into_net(self.net, params) 41 | self.reduce_dims = ops.ReduceSum(keep_dims=False) 42 | self.expand_dims = ops.ExpandDims() 43 | 44 | # SR from Tensor 45 | def sr_handle(self, lr): 46 | output = self.net(lr) 47 | return output 48 | 49 | # SR from image 50 | def sr_image(self, lr_image, hr_image): 51 | lr = np.array(Image.open(lr_image).convert("RGB")) 52 | lr = (lr / 127.5) - 1.0 53 | lr = lr.transpose(2, 0, 1).astype(np.float32) 54 | lr = np.expand_dims(lr, axis=0) 55 | output = self.sr_handle(Tensor(lr)) 56 | output = output.asnumpy() 57 | output = np.squeeze(output, axis=0) 58 | output = np.clip(output, -1.0, 1.0) 59 | output = (output + 1.0) / 2.0 60 | output = output.transpose(1, 2, 0) 61 | Image.fromarray((output * 255.0).astype(np.uint8)).save(hr_image, quality=100) 62 | -------------------------------------------------------------------------------- /vision/Taichu-GLIDE/model/glide_text2im/model/xf.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 Huawei Technologies Co., Ltd 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================ 15 | 16 | import math 17 | 18 | import mindspore as ms 19 | import mindspore.nn as nn 20 | import mindspore.ops 21 | import mindspore.ops as ops 22 | 23 | from model.glide_text2im.model.simple_nn import Linear 24 | 25 | 26 | class LayerNorm(nn.LayerNorm): 27 | """ 28 | Implementation that supports fp16 inputs but fp32 gains/biases. 
29 | """ 30 | def construct(self, x: ms.Tensor): 31 | y = super().construct(ops.Cast()(x, ms.float32)) 32 | y = ops.Cast()(y, x.dtype) 33 | return y 34 | 35 | 36 | class MultiheadAttention(nn.Cell): 37 | def __init__(self, n_ctx, width, heads, dtype): 38 | super().__init__() 39 | self.n_ctx = n_ctx 40 | self.width = width 41 | self.heads = heads 42 | self.c_qkv = Linear(width, width * 3, dtype=dtype) 43 | self.c_proj = Linear(width, width, dtype=dtype) 44 | self.attention = QKVMultiheadAttention(width, heads, n_ctx, dtype) 45 | 46 | def construct(self, x): 47 | x = self.c_qkv(x) 48 | x = self.attention(x) 49 | x = self.c_proj(x) 50 | return x 51 | 52 | 53 | class MLP(nn.Cell): 54 | def __init__(self, width, dtype): 55 | super().__init__() 56 | self.width = width 57 | self.c_fc = Linear(width, width * 4, dtype=dtype) 58 | self.c_proj = Linear(width * 4, width, dtype=dtype) 59 | self.gelu = nn.GELU() 60 | 61 | def construct(self, x): 62 | return self.c_proj(self.gelu(self.c_fc(x))) 63 | 64 | 65 | class QKVMultiheadAttention(nn.Cell): 66 | def __init__(self, width: int, n_heads: int, n_ctx: int, dtype: mindspore.dtype): 67 | super().__init__() 68 | self.n_heads = n_heads 69 | self.n_ctx = n_ctx 70 | self.dtype = dtype 71 | 72 | self.concat = ops.Concat() 73 | self.sqrt = ops.Sqrt() 74 | self.softmax = nn.Softmax() 75 | self.print = ops.Print() 76 | self.split = ops.Split(axis=-1, output_num=3) 77 | self.cast = ops.Cast() 78 | self.transpose = ops.Transpose() 79 | 80 | self.scale = 1 / math.sqrt(math.sqrt(width * 3 // self.n_heads // 3)) 81 | 82 | def construct(self, qkv): 83 | bs, _, _ = qkv.shape 84 | qkv = qkv.view(bs, self.n_ctx, self.n_heads, -1) 85 | q, k, v = self.split(qkv) 86 | q = q * self.scale 87 | k = k * self.scale 88 | q = self.transpose(q, (0, 2, 1, 3)) 89 | k = self.transpose(k, (0, 2, 3, 1)) 90 | weight = ops.matmul(q, k) 91 | wdtype = weight.dtype 92 | weight = self.cast(self.softmax(self.cast(weight, ms.float32)), wdtype) 93 | weight = self.transpose(weight, (0, 1, 2, 3)) 94 | v = self.transpose(v, (0, 2, 1, 3)) 95 | a = ops.matmul(weight, v) 96 | a = self.transpose(a, (0, 2, 1, 3)) 97 | return a.reshape(bs, self.n_ctx, -1) 98 | 99 | 100 | class ResidualAttentionBlock(nn.Cell): 101 | def __init__( 102 | self, 103 | n_ctx: int, 104 | width: int, 105 | heads: int, 106 | dtype: mindspore.dtype 107 | ): 108 | super().__init__() 109 | 110 | self.attn = MultiheadAttention( 111 | n_ctx, 112 | width, 113 | heads, 114 | dtype 115 | ) 116 | self.ln_1 = LayerNorm([width]) 117 | self.mlp = MLP(width, dtype) 118 | self.ln_2 = LayerNorm([width]) 119 | 120 | def construct(self, x: ms.Tensor): 121 | x = x + self.attn(self.ln_1(x)) 122 | x = x + self.mlp(self.ln_2(x)) 123 | return x 124 | 125 | 126 | class Transformer(nn.Cell): 127 | def __init__( 128 | self, 129 | n_ctx: int, 130 | width: int, 131 | layers: int, 132 | heads: int, 133 | dtype: mindspore.dtype, 134 | ): 135 | super().__init__() 136 | self.n_ctx = n_ctx 137 | self.width = width 138 | self.layers = layers 139 | self.resblocks = nn.CellList( 140 | [ 141 | ResidualAttentionBlock( 142 | n_ctx, 143 | width, 144 | heads, 145 | dtype 146 | ) 147 | for _ in range(layers) 148 | ] 149 | ) 150 | 151 | def construct(self, x: ms.Tensor): 152 | for block in self.resblocks: 153 | x = block(x) 154 | return x 155 | 156 | -------------------------------------------------------------------------------- /vision/Taichu-GLIDE/model/glide_text2im/model_creation.py: -------------------------------------------------------------------------------- 1 | 
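# Illustrative shape walk-through for xf.py above (not from the original
# sources). MultiheadAttention maps (batch, n_ctx, width) -> (batch, n_ctx,
# width): c_qkv triples the channel dim, QKVMultiheadAttention folds it to
# (batch, n_ctx, heads, 3 * width // heads), splits Q/K/V on the last axis,
# and re-merges the heads at the end. Toy sizes:
import numpy as np
import mindspore as ms
from model.glide_text2im.model.xf import MultiheadAttention

attn = MultiheadAttention(n_ctx=8, width=32, heads=4, dtype=ms.float32)
x = ms.Tensor(np.random.rand(2, 8, 32).astype(np.float32))
print(attn(x).shape)  # (2, 8, 32)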
# Copyright 2022 Huawei Technologies Co., Ltd 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================ 15 | 16 | from model.glide_text2im.model.text2im_model import Text2ImUNet, SuperResText2ImUNet 17 | from model.glide_text2im.tokenizer.chinese_tokenizer import from_pretrained 18 | 19 | def model_and_diffusion_defaults(): 20 | return dict( 21 | image_size=64, 22 | num_channels=192, 23 | num_res_blocks=3, 24 | channel_mult="", 25 | num_heads=1, 26 | num_head_channels=64, 27 | num_heads_upsample=-1, 28 | attention_resolutions="32,16,8", 29 | dropout=0.0, 30 | text_ctx=128, 31 | xf_width=512, 32 | xf_layers=16, 33 | xf_heads=8, 34 | xf_final_ln=True, 35 | xf_padding=True, 36 | diffusion_steps=1000, 37 | noise_schedule="squaredcos_cap_v2", 38 | timestep_respacing="", 39 | use_scale_shift_norm=True, 40 | resblock_updown=True, 41 | use_fp16=True, 42 | cache_text_emb=False, 43 | inpaint=False, 44 | super_res=False, 45 | chinese=False, 46 | sketch=False, 47 | class_balanced=False, 48 | sketch_classes=0 49 | ) 50 | 51 | def create_model_and_diffusion(options): 52 | #print(options) 53 | return create_model(**options) 54 | 55 | def create_model( 56 | image_size, 57 | num_channels, 58 | num_res_blocks, 59 | channel_mult, 60 | num_heads, 61 | num_head_channels, 62 | num_heads_upsample, 63 | attention_resolutions, 64 | dropout, 65 | text_ctx, 66 | xf_width, 67 | xf_layers, 68 | xf_heads, 69 | xf_final_ln, 70 | n_vocab, 71 | xf_padding, 72 | diffusion_steps, 73 | noise_schedule, 74 | timestep_respacing, 75 | use_scale_shift_norm, 76 | resblock_updown, 77 | use_fp16, 78 | cache_text_emb, 79 | inpaint, 80 | super_res, 81 | chinese, 82 | sketch, 83 | class_balanced, 84 | sketch_classes, 85 | dtype): 86 | net = Text2ImUNet( 87 | text_ctx=text_ctx, 88 | xf_width=xf_width, 89 | xf_layers=xf_layers, 90 | xf_heads=xf_heads, 91 | xf_final_ln=xf_final_ln, 92 | n_vocab=n_vocab, 93 | xf_padding=xf_padding, 94 | in_channels=3, 95 | model_channels=num_channels, 96 | out_channels=6, 97 | num_res_blocks=num_res_blocks, 98 | attention_resolutions=attention_resolutions, 99 | dropout=dropout, 100 | channel_mult=channel_mult, 101 | use_fp16=use_fp16, 102 | num_heads=num_heads, 103 | num_head_channels=num_head_channels, 104 | num_heads_upsample=num_heads_upsample, 105 | use_scale_shift_norm=use_scale_shift_norm, 106 | resblock_updown=resblock_updown, 107 | cache_text_emb=cache_text_emb, 108 | dtype=dtype 109 | ) 110 | return net 111 | 112 | 113 | def create_upsample_model( 114 | image_size, 115 | num_channels, 116 | num_res_blocks, 117 | channel_mult, 118 | num_heads, 119 | num_head_channels, 120 | num_heads_upsample, 121 | attention_resolutions, 122 | dropout, 123 | text_ctx, 124 | xf_width, 125 | xf_layers, 126 | xf_heads, 127 | xf_final_ln, 128 | n_vocab, 129 | xf_padding, 130 | diffusion_steps, 131 | noise_schedule, 132 | timestep_respacing, 133 | use_scale_shift_norm, 134 | resblock_updown, 135 | use_fp16, 136 
| cache_text_emb, 137 | inpaint, 138 | super_res, 139 | chinese, 140 | sketch, 141 | class_balanced, 142 | sketch_classes, 143 | dtype): 144 | net = SuperResText2ImUNet( 145 | image_size = image_size, 146 | text_ctx=text_ctx, 147 | xf_width=xf_width, 148 | xf_layers=xf_layers, 149 | xf_heads=xf_heads, 150 | xf_final_ln=xf_final_ln, 151 | n_vocab=n_vocab, 152 | xf_padding=xf_padding, 153 | in_channels=6, 154 | model_channels=num_channels, 155 | out_channels=6, 156 | num_res_blocks=num_res_blocks, 157 | attention_resolutions=attention_resolutions, 158 | dropout=dropout, 159 | channel_mult=channel_mult, 160 | use_fp16=use_fp16, 161 | num_heads=num_heads, 162 | num_head_channels=num_head_channels, 163 | num_heads_upsample=num_heads_upsample, 164 | use_scale_shift_norm=use_scale_shift_norm, 165 | resblock_updown=resblock_updown, 166 | cache_text_emb=cache_text_emb, 167 | dtype=dtype 168 | ) 169 | return net 170 | 171 | def add_dict_to_argparser(parser, default_dict): 172 | for k, v in default_dict.items(): 173 | v_type = type(v) 174 | if v is None: 175 | v_type = str 176 | elif isinstance(v, bool): 177 | v_type = str2bool 178 | parser.add_argument(f"--{k}", default=v, type=v_type) 179 | 180 | 181 | def args_to_dict(args, keys): 182 | return {k: getattr(args, k) for k in keys} 183 | 184 | def str2bool(v): 185 | if isinstance(v, bool): 186 | return v 187 | if v.lower() in ("yes", "true", "t", "y", "1"): 188 | return True 189 | elif v.lower() in ("no", "false", "f", "n", "0"): 190 | return False 191 | else: 192 | raise argparse.ArgumentTypeError("boolean value expected") 193 | -------------------------------------------------------------------------------- /vision/Taichu-GLIDE/model/glide_text2im/model_creator.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 Huawei Technologies Co., Ltd 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
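# Illustrative CLI sketch (not from the original sources). add_dict_to_argparser()
# and str2bool() above turn every key of the defaults dict into a command-line
# flag; booleans go through str2bool so "--use_fp16 false" parses as False rather
# than bool("false") == True. The flag values below are hypothetical:
import argparse
from model.glide_text2im.model_creation import (
    add_dict_to_argparser, args_to_dict, model_and_diffusion_defaults)

defaults = model_and_diffusion_defaults()
parser = argparse.ArgumentParser()
add_dict_to_argparser(parser, defaults)
args = parser.parse_args(["--use_fp16", "false", "--image_size", "64"])
options = args_to_dict(args, defaults.keys())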
14 | # ============================================================================ 15 | 16 | from model.glide_text2im.model.text2im_model import Text2ImUNet, SuperResText2ImUNet 17 | 18 | 19 | def create_model( 20 | image_size, 21 | num_channels, 22 | num_res_blocks, 23 | channel_mult, 24 | num_heads, 25 | num_head_channels, 26 | num_heads_upsample, 27 | attention_resolutions, 28 | dropout, 29 | text_ctx, 30 | xf_width, 31 | xf_layers, 32 | xf_heads, 33 | xf_final_ln, 34 | n_vocab, 35 | xf_padding, 36 | diffusion_steps, 37 | noise_schedule, 38 | timestep_respacing, 39 | use_scale_shift_norm, 40 | resblock_updown, 41 | use_fp16, 42 | cache_text_emb, 43 | inpaint, 44 | super_res, 45 | chinese, 46 | sketch, 47 | class_balanced, 48 | sketch_classes, 49 | dtype): 50 | print("origin t2i net") 51 | net = Text2ImUNet( 52 | text_ctx=text_ctx, 53 | xf_width=xf_width, 54 | xf_layers=xf_layers, 55 | xf_heads=xf_heads, 56 | xf_final_ln=xf_final_ln, 57 | n_vocab=n_vocab, 58 | xf_padding=xf_padding, 59 | in_channels=3, 60 | model_channels=num_channels, 61 | out_channels=6, 62 | num_res_blocks=num_res_blocks, 63 | attention_resolutions=attention_resolutions, 64 | dropout=dropout, 65 | channel_mult=channel_mult, 66 | use_fp16=use_fp16, 67 | num_heads=num_heads, 68 | num_head_channels=num_head_channels, 69 | num_heads_upsample=num_heads_upsample, 70 | use_scale_shift_norm=use_scale_shift_norm, 71 | resblock_updown=resblock_updown, 72 | cache_text_emb=cache_text_emb, 73 | dtype=dtype 74 | ) 75 | return net 76 | 77 | 78 | def create_upsample_model( 79 | image_size, 80 | num_channels, 81 | num_res_blocks, 82 | channel_mult, 83 | num_heads, 84 | num_head_channels, 85 | num_heads_upsample, 86 | attention_resolutions, 87 | dropout, 88 | text_ctx, 89 | xf_width, 90 | xf_layers, 91 | xf_heads, 92 | xf_final_ln, 93 | n_vocab, 94 | xf_padding, 95 | diffusion_steps, 96 | noise_schedule, 97 | timestep_respacing, 98 | use_scale_shift_norm, 99 | resblock_updown, 100 | use_fp16, 101 | cache_text_emb, 102 | inpaint, 103 | super_res, 104 | chinese, 105 | sketch, 106 | class_balanced, 107 | sketch_classes, 108 | dtype): 109 | print("super res net") 110 | net = SuperResText2ImUNet( 111 | image_size=image_size, 112 | text_ctx=text_ctx, 113 | xf_width=xf_width, 114 | xf_layers=xf_layers, 115 | xf_heads=xf_heads, 116 | xf_final_ln=xf_final_ln, 117 | n_vocab=n_vocab, 118 | xf_padding=xf_padding, 119 | in_channels=6, 120 | model_channels=num_channels, 121 | out_channels=6, 122 | num_res_blocks=num_res_blocks, 123 | attention_resolutions=attention_resolutions, 124 | dropout=dropout, 125 | channel_mult=channel_mult, 126 | use_fp16=use_fp16, 127 | num_heads=num_heads, 128 | num_head_channels=num_head_channels, 129 | num_heads_upsample=num_heads_upsample, 130 | use_scale_shift_norm=use_scale_shift_norm, 131 | resblock_updown=resblock_updown, 132 | cache_text_emb=cache_text_emb, 133 | dtype=dtype 134 | ) 135 | return net 136 | -------------------------------------------------------------------------------- /vision/Taichu-GLIDE/model/glide_text2im/tokenizer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mindspore-lab/minddiffusion/5779fd04c17f60f277fa88e635287fcc1dd4ecc5/vision/Taichu-GLIDE/model/glide_text2im/tokenizer/__init__.py -------------------------------------------------------------------------------- /vision/Taichu-GLIDE/model/glide_text2im/tokenizer/caption_to_tokens.py: 
-------------------------------------------------------------------------------- 1 | # Copyright 2022 Huawei Technologies Co., Ltd 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================ 15 | 16 | import mindspore 17 | 18 | import model.glide_text2im.train.image_datasets as data_reader 19 | 20 | 21 | def convert_input_to_token_gen(input_line, pics_generated, text_ctx, tokenizer): 22 | tokens, mask = encode_and_pad(input_line, text_ctx, tokenizer) 23 | uncond_tokens, uncond_mask = tokenizer.padded_tokens_and_mask([], text_ctx) 24 | return ( 25 | mindspore.Tensor([tokens] * pics_generated + [uncond_tokens] * pics_generated, dtype=mindspore.int32), 26 | mindspore.Tensor([mask] * pics_generated + [uncond_mask] * pics_generated, dtype=mindspore.int32) 27 | ) 28 | 29 | 30 | def convert_input_to_token_super_res(input_line, pics_generated, text_ctx, tokenizer): 31 | tokens, mask = encode_and_pad(input_line, text_ctx, tokenizer) 32 | tokens = mindspore.Tensor([tokens] * pics_generated, dtype=mindspore.int32) 33 | mask = mindspore.Tensor([mask] * pics_generated, dtype=mindspore.int32) 34 | return tokens, mask 35 | 36 | 37 | def encode_and_pad(input_line, text_ctx, tokenizer): 38 | # Pack the tokens together into model kwargs. 39 | tokens = tokenizer.encode(input_line) 40 | tokens, mask = tokenizer.padded_tokens_and_mask(tokens, text_ctx) # text_ctx 128 41 | return tokens, mask 42 | -------------------------------------------------------------------------------- /vision/Taichu-GLIDE/model/glide_text2im/tokenizer/chinese_tokenizer.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 Huawei Technologies Co., Ltd 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
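# Illustrative padding example (not from the original sources). encode_and_pad()
# above right-pads the token ids to text_ctx entries and builds a boolean mask
# over the real tokens. A minimal stand-in mirroring the logic of
# Encoder_SP.padded_tokens_and_mask below (end_token=0 is a toy value; the real
# tokenizer uses n_vocab - 1):
def padded_tokens_and_mask(tokens, text_ctx, end_token=0):
    tokens = tokens[:text_ctx]
    padding = text_ctx - len(tokens)
    return tokens + [end_token] * padding, [True] * len(tokens) + [False] * padding

print(padded_tokens_and_mask([5, 9], 6))
# ([5, 9, 0, 0, 0, 0], [True, True, False, False, False, False])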
14 | # ============================================================================ 15 | 16 | import json 17 | import os 18 | import sentencepiece as spm 19 | from typing import List, Tuple 20 | 21 | 22 | def get_pairs(word): 23 | pairs = set() 24 | prev_char = word[0] 25 | for char in word[1:]: 26 | pairs.add((prev_char, char)) 27 | prev_char = char 28 | return pairs 29 | 30 | 31 | class Encoder: 32 | def __init__(self, encoder, bpe_merges): 33 | self.encoder = encoder 34 | self.decoder = {v: k for k, v in self.encoder.items()} 35 | self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) 36 | self.cache = {} 37 | self.max_len = 0 38 | 39 | def bpe(self, token): 40 | if token in self.cache: 41 | return self.cache[token] 42 | word = tuple(token) 43 | pairs = get_pairs(word) 44 | if not pairs: 45 | return token 46 | 47 | while True: 48 | bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf'))) 49 | if bigram not in self.bpe_ranks: 50 | break 51 | first, second = bigram 52 | new_word = [] 53 | i = 0 54 | while i < len(word): 55 | try: 56 | j = word.index(first, i) 57 | new_word.extend(word[i:j]) 58 | i = j 59 | except: 60 | new_word.extend(word[i:]) 61 | break 62 | 63 | if word[i] == first and i < len(word) - 1 and word[i + 1] == second: 64 | new_word.append(first + second) 65 | i += 2 66 | else: 67 | new_word.append(word[i]) 68 | i += 1 69 | new_word = tuple(new_word) 70 | word = new_word 71 | if len(word) == 1: 72 | break 73 | else: 74 | pairs = get_pairs(word) 75 | word = ' '.join(word) 76 | self.cache[token] = word 77 | return word 78 | 79 | def encode(self, text): 80 | return [self.encoder.get(token, 1) for token in self.tokenize(text)] 81 | 82 | def decode(self, tokens): 83 | text = ''.join([self.decoder[token] for token in tokens]) 84 | return text 85 | 86 | def tokenize(self, text): 87 | bpe_tokens = [] 88 | bpe_tokens.extend(bpe_token for bpe_token in self.bpe(text).split(' ')) 89 | return bpe_tokens 90 | 91 | def convert_tokens_to_ids(self, tokens): 92 | return [self.encoder.get(token, 1) for token in tokens] 93 | 94 | 95 | class Encoder_SP: 96 | def __init__(self, model_path): 97 | self.sp = spm.SentencePieceProcessor() 98 | self.sp.Load(model_path) 99 | self.num_tokens = self.sp.vocab_size() 100 | 101 | @property 102 | def n_vocab(self) -> int: 103 | # +1 for end token 104 | return self.num_tokens + 1 105 | 106 | @property 107 | def end_token(self) -> int: 108 | return self.n_vocab - 1 109 | 110 | def padded_tokens_and_mask( 111 | self, tokens: List[int], text_ctx: int 112 | ) -> Tuple[List[int], List[bool]]: 113 | tokens = tokens[:text_ctx] 114 | padding = text_ctx - len(tokens) 115 | padded_tokens = tokens + [self.end_token] * padding 116 | mask = [True] * len(tokens) + [False] * padding 117 | return padded_tokens, mask 118 | 119 | def encode(self, text): 120 | """ 121 | text="...." 122 | """ 123 | return self.sp.EncodeAsIds(text) 124 | 125 | def decode(self, tokens): 126 | """ 127 | tokens=[x1,x2,...] 
128 | """ 129 | text = [int(token) for token in tokens if int(token) != self.end_token] 130 | return self.sp.DecodeIds(text) 131 | 132 | def tokenize(self, text): 133 | return self.sp.EncodeAsPieces(text) 134 | 135 | def convert_tokens_to_ids(self, tokens): 136 | return [self.sp.PieceToId(token) for token in tokens] 137 | 138 | def convert_token_to_id(self, token): 139 | return self.sp.PieceToId(token) 140 | 141 | def convert_id_to_token(self, idx): 142 | return self.sp.IdToPiece(idx) 143 | 144 | 145 | def get_encoder(encoder_file, bpe_file): 146 | filepath, filename = os.path.split(encoder_file) 147 | shotname, extension = os.path.splitext(filename) 148 | 149 | if (".model" == extension) and (bpe_file == ""): 150 | return Encoder_SP(encoder_file) 151 | else: 152 | with open(encoder_file, 'r', encoding="utf-8") as f: 153 | encoder = json.load(f) 154 | with open(bpe_file, 'r', encoding="utf-8") as f: 155 | bpe_data = f.read() 156 | bpe_merges = [tuple(merge_str.split()) for merge_str in bpe_data.split('\n')[1:-1]] 157 | return Encoder( 158 | encoder=encoder, 159 | bpe_merges=bpe_merges, 160 | ) 161 | 162 | 163 | def from_pretrained(file): 164 | return get_encoder(file, "") -------------------------------------------------------------------------------- /vision/Taichu-GLIDE/model/glide_text2im/train/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mindspore-lab/minddiffusion/5779fd04c17f60f277fa88e635287fcc1dd4ecc5/vision/Taichu-GLIDE/model/glide_text2im/train/__init__.py -------------------------------------------------------------------------------- /vision/Taichu-GLIDE/model/glide_text2im/train/build_optimizer.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 Huawei Technologies Co., Ltd 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================ 15 | """ 16 | build optimizer for ms 17 | for params containing the words 'layernorm' and not containing 'bias', we choose the adam. 18 | for the other params, they are optimized by adam. 
19 | """ 20 | from mindspore.nn.optim.adam import Adam, AdamWeightDecay 21 | 22 | 23 | def build_optimizer(model, optim, betas, lr): 24 | """ 25 | 26 | :param model: 27 | :param opts: 28 | :param lr: 29 | :return: optimizer 30 | """ 31 | 32 | decay_filter = lambda x: 'layernorm' not in x.name.lower() and "bias" not in x.name.lower() 33 | param_optimizer = model.trainable_params() 34 | decay_params = list(filter(decay_filter, param_optimizer)) 35 | other_params = list(filter(lambda x: not decay_filter(x), param_optimizer)) 36 | group_params = [{ 37 | 'params': decay_params, 38 | 'weight_decay': 1e-2 39 | }, { 40 | 'params': other_params, 41 | 'weight_decay': 0.0 42 | }, { 43 | 'order_params': param_optimizer 44 | }] 45 | # currently Adam only 46 | if optim == 'adam': 47 | OptimCls = Adam 48 | elif optim == 'adamw': 49 | OptimCls = AdamWeightDecay 50 | else: 51 | raise ValueError('invalid optimizer') 52 | optimizer = OptimCls(group_params, 53 | learning_rate=lr, beta1=betas[0], beta2=betas[1]) 54 | return optimizer 55 | -------------------------------------------------------------------------------- /vision/Taichu-GLIDE/model/glide_text2im/train/config.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 Huawei Technologies Co., Ltd 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ============================================================================ 15 | 16 | MAX_TEXT_LEN = 48 17 | MAX_FULL_TEXT_LEN = 50 18 | MAX_IMG_LEN = 197 #448 19 | MAX_AUDIO_LEN = 50 20 | MAX_FULL_LEN = 297 #448 21 | MAX_DEFAULT_LEN = 50 22 | 23 | MAX_IMG_TEXT_LEN = 247 #448 24 | 25 | IMG_TOKEN_SIZE = 8192 26 | IMG_TOKEN_LEN = 64 27 | 28 | MAX_TEXT_GTS_LEN = 29 29 | MAX_IMG_GTS_LEN = 63 30 | 31 | MAX_MEL_LEN = 1289 32 | MAX_SRC_LEN = 89 33 | 34 | IMG_DIM = 768 35 | AUDIO_DIM = 1024 36 | 37 | IMG_LABEL_DIM = 1601 38 | AUDIO_LABEL_DIM = 1600 39 | 40 | MASK_SIZE=2 41 | N_NEGATIVES=10 42 | IMG_PATCH_SIZE=448 43 | 44 | MAX_TIME_STEPS=128 -------------------------------------------------------------------------------- /vision/Taichu-GLIDE/model/glide_text2im/train/config.yml: -------------------------------------------------------------------------------- 1 | model: 2 | name: 3 | num_hiddens: 256 4 | num_residual_layers: 4 5 | num_residual_hiddens: 256 6 | 7 | downsample: 8 8 | embedding_dim: 256 9 | num_embeddings: 8192 10 | commitment_cost: 0.25 11 | decay: 0.99 12 | 13 | dataset: 14 | name: Custom 15 | img_size: 256 16 | batchsize: 16 17 | buffersize: 5 18 | repeatsize: 1 19 | num_workers: 2 20 | data_dir: ./datasets/txt2img/mscoco/images 21 | train_ids_path: ./datasets/txt2img/mscoco/cocodata_zh/COCO_trainids_vqvae.json 22 | valid_ids_path: ./datasets/txt2img/mscoco/cocodata_zh/COCO_validids_vqvae.json 23 | 24 | loss: 25 | name: nMSE 26 | 27 | train: 28 | pretrain: ~ 29 | ckpt: ~ 30 | std_out: True 31 | 32 | lr: 0.0012 33 | num_epochs: 10 34 | num_workers: 2 35 | sink_size: 1000 36 | loss_scale: 4096 37 | optimize: ADAM 38 | keep_batchnorm_fp32: True 39 | exp_name: VQVAEwBN_MSCOCO 40 | exp_path: experiments/ 41 | 42 | num_log_steps: 10 43 | num_save_steps: 20000 44 | num_test_steps: 5000 45 | -------------------------------------------------------------------------------- /vision/Taichu-GLIDE/model/glide_text2im/train/data_loader.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 Huawei Technologies Co., Ltd 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
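config.yml above is a plain YAML document; a minimal sketch of reading it follows. PyYAML is an assumption here (it is not pinned in requirements.txt, and the training code may go through its own config helper instead):

```python
# Minimal sketch: load train/config.yml into nested dicts with PyYAML.
import yaml  # assumed dependency

with open("model/glide_text2im/train/config.yml", "r", encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

print(cfg["dataset"]["batchsize"])  # 16
print(cfg["train"]["lr"])           # 0.0012
print(cfg["train"]["pretrain"])     # None ('~' parses to null)
```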
14 | # ============================================================================ 15 | """ dataloader """ 16 | 17 | import os 18 | 19 | 20 | class DataLoader: 21 | """ DataLoader """ 22 | 23 | def __init__(self, dataset, batch_sampler, collate_fn, is_train=True, device_num=256, drop_last=True): 24 | self.dataset = dataset 25 | self.batch_sampler = batch_sampler 26 | self.collat_fn = collate_fn 27 | self.device_num = device_num 28 | rank_id_str = os.getenv('RANK_ID', '0') 29 | self.rank_id = int(rank_id_str[rank_id_str.rfind('-') + 1:]) # 'RANK_ID': 'job24535502-job-facereidtome-hn-0/1' 30 | self.is_train = is_train 31 | self.drop_last = drop_last 32 | self.batch_size = len(next(iter(self.batch_sampler))) 33 | 34 | def __iter__(self): 35 | self.step_index = 0 36 | self.batch_indices = iter(self.batch_sampler) 37 | 38 | return self 39 | 40 | def __next__(self): 41 | 42 | if self.is_train: 43 | try: 44 | indices = next(self.batch_indices) 45 | if len(indices) != self.batch_size and self.drop_last: 46 | return self.__next__() 47 | except StopIteration: 48 | self.batch_indices = iter(self.batch_sampler) 49 | indices = next(self.batch_indices) 50 | data = [] 51 | per_batch = len(indices) // self.device_num 52 | index = indices[self.rank_id * per_batch:(self.rank_id + 1) * per_batch] 53 | for idx in index: 54 | data.append(self.dataset[idx]) 55 | 56 | data = self.collat_fn(data) 57 | return data 58 | else: 59 | indices = next(self.batch_indices) 60 | data = [] 61 | per_batch = len(indices) // self.device_num 62 | index = indices[self.rank_id * per_batch:(self.rank_id + 1) * per_batch] 63 | for idx in index: 64 | data.append(self.dataset[idx]) 65 | 66 | data = self.collat_fn(data) 67 | 68 | return data 69 | 70 | 71 | -------------------------------------------------------------------------------- /vision/Taichu-GLIDE/model/glide_text2im/train/generator.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 Huawei Technologies Co., Ltd 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
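The key behavior of DataLoader.__next__ above is that every rank sees the same global batch of indices from the sampler and keeps only its own contiguous slice; a small arithmetic sketch with illustrative numbers:

```python
# Minimal sketch of the per-rank slicing inside DataLoader.__next__.
indices = list(range(16))  # one global batch of sample indices
device_num = 4             # number of ranks

per_batch = len(indices) // device_num
for rank_id in range(device_num):
    shard = indices[rank_id * per_batch:(rank_id + 1) * per_batch]
    print(rank_id, shard)
# 0 [0, 1, 2, 3]
# 1 [4, 5, 6, 7]
# 2 [8, 9, 10, 11]
# 3 [12, 13, 14, 15]
```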
14 | # ============================================================================ 15 | """ generator """ 16 | from collections import defaultdict 17 | import numpy as np 18 | from mindspore import Tensor 19 | 20 | data_column = [ 21 | 'input_ids', 22 | 'input_mask', 23 | 'img', 24 | 't', 25 | 'weights' 26 | ] 27 | 28 | data_column_supres = [ 29 | 'input_ids', 30 | 'input_mask', 31 | 'img', 32 | 't', 33 | 'weights', 34 | 'low_res' 35 | ] 36 | 37 | data_column_audio = [ 38 | 'input_ids', 39 | 'position_ids', 40 | 'attention_mask', 41 | 'mel_targets', 42 | 'duration_targets', 43 | 'speakers', 44 | 'texts', 45 | 'src_lens', 46 | 'mel_lens', 47 | 'audio_max_text_len', 48 | 'audio_max_mel_len', 49 | 'pitch_targets', 50 | 'energy_targets' 51 | ] 52 | 53 | task2id = { 54 | 'mlmThree': 0, 55 | 'mrcThree': 1, 56 | 'mrfrThree': 2, 57 | 'mafrThree': 3, 58 | 'macThree': 4, 59 | "itmThree": 5, 60 | 'mrctThree': 6, 61 | "tdThree": 7, 62 | "idThree": 8, 63 | "adThree": 9, 64 | "ret": 10, 65 | "ftRet": 11 66 | } 67 | 68 | 69 | -------------------------------------------------------------------------------- /vision/Taichu-GLIDE/model/glide_text2im/train/resample.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 Huawei Technologies Co., Ltd 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================ 15 | 16 | from abc import ABC, abstractmethod 17 | import numpy as np 18 | 19 | 20 | def create_named_schedule_sampler(name,timesteps): 21 | """ 22 | Create a ScheduleSampler from a library of pre-defined samplers. 23 | 24 | :param name: the name of the sampler. 25 | :param diffusion: the diffusion object to sample for. 26 | """ 27 | 28 | if name == "uniform": 29 | return UniformSampler(timesteps) 30 | else: 31 | raise NotImplementedError(f"unknown schedule sampler: {name}") 32 | 33 | 34 | class ScheduleSampler(ABC): 35 | """ 36 | A distribution over timesteps in the diffusion process, intended to reduce 37 | variance of the objective. 38 | 39 | By default, samplers perform unbiased importance sampling, in which the 40 | objective's mean is unchanged. 41 | However, subclasses may override sample() to change how the resampled 42 | terms are reweighted, allowing for actual changes in the objective. 43 | """ 44 | 45 | @abstractmethod 46 | def weights(self): 47 | """ 48 | Get a numpy array of weights, one per diffusion step. 49 | 50 | The weights needn't be normalized, but must be positive. 51 | """ 52 | 53 | def sample(self, batch_size): 54 | """ 55 | Importance-sample timesteps for a batch. 56 | 57 | :param batch_size: the number of timesteps. 58 | :param device: the torch device to save to. 59 | :return: a tuple (timesteps, weights): 60 | - timesteps: a tensor of timestep indices. 61 | - weights: a tensor of weights to scale the resulting losses. 
62 | """ 63 | w = self.weights() 64 | p = w / np.sum(w) 65 | indices = np.random.choice(len(p), size=(batch_size,), p=p) 66 | weights = 1 / (len(p) * p[indices]) 67 | return indices, weights 68 | 69 | 70 | class UniformSampler(ScheduleSampler): 71 | def __init__(self, timesteps): 72 | #self.diffusion = diffusion 73 | self._weights = np.ones([timesteps]) 74 | 75 | def weights(self): 76 | return self._weights 77 | -------------------------------------------------------------------------------- /vision/Taichu-GLIDE/model/glide_text2im/train/sampler.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 Huawei Technologies Co., Ltd 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================ 15 | """ 16 | sampler for length bucketing (batch by tokens) 17 | """ 18 | import random 19 | import gc 20 | 21 | class EasySampler: 22 | """ 23 | Sampler for token bucket path 24 | """ 25 | 26 | def __init__(self, dataset, batch_size, device_num=1): 27 | self.dataset = dataset 28 | self.per_batch = batch_size * device_num 29 | 30 | def _create_ids(self): 31 | return list(range(len(self.dataset))) 32 | 33 | def __iter__(self): 34 | ids = self._create_ids() 35 | random.shuffle(ids) 36 | batches = [ids[i:i + self.per_batch] for i in range(0, len(ids) - self.per_batch, self.per_batch)] 37 | return iter(batches) 38 | 39 | def __len__(self): 40 | raise ValueError("NOT supported. " 41 | "This has some randomness across epochs") 42 | 43 | 44 | class BatchSampler: 45 | """ 46 | Batch Sampler 47 | """ 48 | 49 | def __init__(self, lens, batch_size, device_num): 50 | self._lens = lens 51 | self._batch_size = batch_size * device_num 52 | 53 | def _create_ids(self): 54 | return list(range(self._lens)) 55 | 56 | def __iter__(self): 57 | ids = self._create_ids() 58 | batches = [ids[i:i + self._batch_size] for i in range(0, len(ids), self._batch_size)] 59 | gc.collect() 60 | return iter(batches) 61 | 62 | def __len__(self): 63 | raise ValueError("NOT supported. " 64 | "This has some randomness across epochs") 65 | 66 | -------------------------------------------------------------------------------- /vision/Taichu-GLIDE/model/glide_text2im/train/t2ids.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 Huawei Technologies Co., Ltd 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ============================================================================ 15 | """ 16 | TextToImage Datasets 17 | """ 18 | from toolz.sandbox import unzip 19 | import os 20 | import json 21 | import numpy as np 22 | import mindspore.dataset.vision.c_transforms as C 23 | from mindspore.dataset.vision.utils import Inter 24 | from PIL import Image 25 | 26 | 27 | def pad_tensors(tensors, lens=None, pad=0, max_len=50): 28 | """B x [T, ...]""" 29 | if lens is None: 30 | lens = [t.shape[0] for t in tensors] 31 | if max_len == -1: 32 | max_len = max(lens) 33 | bs = len(tensors) 34 | hid = tensors[0].shape[-1] 35 | dtype = tensors[0].dtype 36 | output = np.zeros((bs, max_len, hid), dtype=dtype) 37 | if pad: 38 | output.fill(pad) 39 | for i, (t, l) in enumerate(zip(tensors, lens)): 40 | output[i, :l, ...] = t 41 | return output 42 | 43 | def pad_tensors_pos(tensors, lens, feat, max_len=50): 44 | """ pad_tensors_pos """ 45 | if tensors is None or tensors[0] is None: 46 | return np.expand_dims(np.arange(0, feat.shape[1], dtype=np.int64), 0) 47 | return pad_tensors(tensors, lens, max_len=max_len) 48 | 49 | def get_ids_three(ids_path): 50 | ids = json.load(open(ids_path)) 51 | size, rank = get_size_rank() 52 | return ids[rank::size] 53 | 54 | def get_size_rank(): 55 | size, rank = 1, 0 56 | return size, rank 57 | 58 | def pad_sequence(sequences, batch_first=True, padding_value=0.0, max_lens=50): 59 | """pad_sequence""" 60 | lens = [len(x) for x in sequences] 61 | if max_lens == -1: 62 | max_lens = max(lens) 63 | 64 | padded_seq = [] 65 | for x in sequences: 66 | pad_width = [(0, max_lens - len(x))] 67 | padded_seq.append(np.pad(x, pad_width, constant_values=(padding_value, padding_value))) 68 | 69 | sequences = np.stack(padded_seq, axis=0 if batch_first else 1) 70 | return sequences 71 | 72 | 73 | def pad_sequence_(sequences, batch_first=False, padding_value=0.0, max_lens=50): 74 | """pad_sequence""" 75 | if sequences[0] is None: 76 | return None 77 | return pad_sequence(sequences, batch_first, padding_value, max_lens) 78 | 79 | def t2i_collate(inputs): 80 | """ 81 | Return: 82 | :input_ids (n, max_L) padded with 0 83 | :position_ids (n, max_L) padded with 0 84 | :txt_lens list of [txt_len] 85 | :img_feat (n, max_num_bb, feat_dim) 86 | :img_pos_feat (n, max_num_bb, 7) 87 | :num_bbs list of [num_bb] 88 | :attn_masks (n, max_{L + num_bb}) padded with 0 89 | :txt_labels (n, max_L) padded with -1 90 | :audio_feat (n, audio_size, audio_dim) 91 | """ 92 | img_feat, input_ids, input_mask, t, weights = map(list, unzip(inputs)) 93 | 94 | 95 | batch = { 96 | 'input_ids': input_ids, 97 | #'position_ids': position_ids, 98 | 'input_mask': input_mask, 99 | 'img_feat': img_feat, 100 | 't': t, 101 | 'weights': weights 102 | } 103 | return batch 104 | 105 | def t2i_collate_supres(inputs): 106 | """ 107 | Return: 108 | datas 109 | """ 110 | img_feat, input_ids, input_mask, t, weights, low_res = map(list, unzip(inputs)) 111 | 112 | 113 | 114 | batch = { 115 | 'input_ids': input_ids, 116 | 'input_mask': input_mask, 117 | 'img_feat': img_feat, 118 | 't': t, 119 | 'weights': weights, 120 | 'low_res': low_res, 121 | } 122 | return batch -------------------------------------------------------------------------------- /vision/Taichu-GLIDE/model/glide_text2im/train/train_util.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import functools 3 | 4 | import blobfile as bf 5 | from PIL import Image 6 | 7 | 8 | def save_images(batch: th.Tensor, path: str): 9 | """ 
Display a batch of images inline. """ 10 | scaled = ((batch + 1) * 127.5).round().clamp(0, 255).to(th.uint8).cpu() 11 | reshaped = scaled.permute(2, 0, 3, 1).reshape([batch.shape[2], -1, 3]) 12 | Image.fromarray(reshaped.numpy()).save(path, quality=100) 13 | 14 | # For ImageNet experiments, this was a good default value. 15 | # We found that the lg_loss_scale quickly climbed to 16 | # 20-21 within the first ~1K steps of training. 17 | INITIAL_LOG_LOSS_SCALE = 20.0 18 | 19 | class TrainLoop: 20 | def __init__( 21 | self, 22 | *, 23 | model, 24 | diffusion, 25 | data, 26 | batch_size, 27 | microbatch, 28 | lr, 29 | ema_rate, 30 | log_interval, 31 | save_interval, 32 | resume_checkpoint, 33 | use_fp16=False, 34 | fp16_scale_growth=1e-3, 35 | schedule_sampler=None, 36 | weight_decay=0.0, 37 | lr_anneal_steps=0, 38 | ): 39 | self.model = model 40 | self.diffusion = diffusion 41 | self.data = data 42 | self.batch_size = batch_size 43 | 44 | self.microbatch = microbatch if microbatch > 0 else batch_size 45 | self.lr = lr 46 | 47 | self.ema_rate = ( 48 | [ema_rate] 49 | if isinstance(ema_rate, float) 50 | else [float(x) for x in ema_rate.split(",")] 51 | ) 52 | 53 | self.log_interval = log_interval 54 | self.save_interval = save_interval 55 | self.resume_checkpoint = resume_checkpoint 56 | 57 | self.use_fp16 = use_fp16 58 | self.fp16_scale_growth = fp16_scale_growth 59 | self.schedule_sampler = schedule_sampler or UniformSampler(diffusion) 60 | 61 | self.weight_decay = weight_decay 62 | self.lr_anneal_steps = lr_anneal_steps 63 | 64 | self.step = 0 65 | self.resume_step = 0 66 | self.global_batch = self.batch_size 67 | 68 | 69 | self.sync_cuda = th.cuda.is_available() 70 | 71 | self._load_and_sync_parameters() 72 | self.mp_trainer = MixedPrecisionTrainer( 73 | model=self.model, 74 | use_fp16=self.use_fp16, 75 | fp16_scale_growth=fp16_scale_growth, 76 | ) 77 | 78 | self.opt = AdamW( 79 | self.mp_trainer.master_params, lr=self.lr, weight_decay=self.weight_decay 80 | ) 81 | 82 | if self.resume_step: 83 | self._load_optimizer_state() 84 | # Model was resumed, either due to a restart or a checkpoint 85 | # being specified at the command line. 86 | self.ema_params = [ 87 | self._load_ema_parameters(rate) for rate in self.ema_rate 88 | ] 89 | else: 90 | self.ema_params = [ 91 | copy.deepcopy(self.mp_trainer.master_params) 92 | for _ in range(len(self.ema_rate)) 93 | ] 94 | 95 | if th.cuda.is_available(): 96 | self.use_ddp = True 97 | self.ddp_model = DDP( 98 | self.model, 99 | device_ids=[dist_util.dev()], 100 | output_device=dist_util.dev(), 101 | broadcast_buffers=False, 102 | bucket_cap_mb=128, 103 | find_unused_parameters=False, 104 | ) 105 | else: 106 | self.use_ddp = False 107 | self.ddp_model = self.model 108 | -------------------------------------------------------------------------------- /vision/Taichu-GLIDE/model/glide_utils/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | curPath = os.path.abspath(os.path.dirname(__file__)) 4 | sys.path.append(curPath) 5 | -------------------------------------------------------------------------------- /vision/Taichu-GLIDE/model/glide_utils/callbackConfig.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 Huawei Technologies Co., Ltd 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
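Note that train_util.py above is PyTorch-era code carried over from OpenAI's GLIDE/guided-diffusion training loop: it references th, AdamW, DDP, UniformSampler, MixedPrecisionTrainer and dist_util, none of which its import block provides, and it is not exercised by the MindSpore entry points in this repo. A sketch of the import header it appears to assume (the guided-diffusion module paths are assumptions):

```python
# Sketch of the imports TrainLoop appears to assume (PyTorch + OpenAI's
# guided-diffusion layout). All paths below are assumptions for illustration.
import torch as th
from torch.nn.parallel.distributed import DistributedDataParallel as DDP
from torch.optim import AdamW

# from guided_diffusion import dist_util                        # assumed
# from guided_diffusion.fp16_util import MixedPrecisionTrainer  # assumed
# from guided_diffusion.resample import UniformSampler          # assumed
```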
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ============================================================================
15 |
16 | import os
17 |
18 | import mindspore as ms
19 |
20 | class StopAtStep(ms.Callback):
21 |     def __init__(self, start_step, stop_step, profiler):
22 |         super(StopAtStep, self).__init__()
23 |         self.start_step = start_step
24 |         self.stop_step = stop_step
25 |         self.profiler = profiler
26 |     def step_begin(self, run_context):
27 |         cb_params = run_context.original_args()
28 |         step_num = cb_params.cur_step_num
29 |         if step_num == self.start_step:
30 |             self.profiler.start()
31 |     def step_end(self, run_context):
32 |         cb_params = run_context.original_args()
33 |         step_num = cb_params.cur_step_num
34 |         if step_num == self.stop_step:
35 |             self.profiler.stop()
36 |     def end(self, run_context):
37 |         self.profiler.analyse()
38 |
39 | class StopAtEpoch(ms.Callback):
40 |     def __init__(self, start_epoch, stop_epoch, profiler):
41 |         super(StopAtEpoch, self).__init__()
42 |         self.start_epoch = start_epoch
43 |         self.stop_epoch = stop_epoch
44 |         self.profiler = profiler
45 |     def epoch_begin(self, run_context):
46 |         cb_params = run_context.original_args()
47 |         epoch_num = cb_params.cur_epoch_num
48 |         if epoch_num == self.start_epoch:
49 |             self.profiler.start()
50 |     def epoch_end(self, run_context):
51 |         cb_params = run_context.original_args()
52 |         epoch_num = cb_params.cur_epoch_num
53 |         if epoch_num == self.stop_epoch:
54 |             self.profiler.stop()
55 |     def end(self, run_context):
56 |         self.profiler.analyse()
57 |
58 |
59 | # moxing callback
60 | class UploadObs(ms.Callback):
61 |     def __init__(self, ckpt_dir, upload_url, ckpt_prefix="") -> None:
62 |         super(UploadObs, self).__init__()
63 |         self.ckpt_dir = ckpt_dir
64 |         self.upload_url = upload_url
65 |         self.ckpt_prefix = ckpt_prefix
66 |
67 |     def epoch_end(self, run_context):
68 |         import moxing  # only available inside ModelArts environments
69 |         cb_params = run_context.original_args()
70 |         print("cb_params", cb_params)
71 |         cur_epoch_num = cb_params.get("cur_epoch_num", 1)
72 |         cur_step_in_epoch = (cb_params.cur_step_num - 1) % cb_params.batch_num + 1
73 |         ckpt_name = self.ckpt_prefix + "-" + str(cur_epoch_num) + "_" + str(cur_step_in_epoch) + ".ckpt"
74 |         ckpt_path = os.path.join(self.ckpt_dir, ckpt_name)
75 |         moxing.file.copy(ckpt_path, self.upload_url)
76 |
77 |
78 | class GetParametersEpoch(ms.Callback):
79 |     def __init__(self) -> None:
80 |         super(GetParametersEpoch, self).__init__()
81 |
82 |     def epoch_end(self, run_context):
83 |         cb_params = run_context.original_args()
84 |         train_net = cb_params.get("train_net")
85 |
86 | class OverflowMonitor(ms.Callback):
87 |     def step_end(self, run_context):
88 |         cb_params = run_context.original_args()
89 |         cur_epoch_num = cb_params.get("cur_epoch_num", 1)
90 |         cur_step_in_epoch = (cb_params.cur_step_num - 1) % cb_params.batch_num + 1
91 |         overflow = cb_params.net_outputs[1]
92 |         if overflow:
93 |             print(f"overflow detected in epoch {cur_epoch_num} step {cur_step_in_epoch}")
94 |         return super().step_end(run_context)
--------------------------------------------------------------------------------
/vision/Taichu-GLIDE/model/glide_utils/img_utils.py:
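The profiling callbacks above are meant to bracket a window of steps or epochs; a minimal sketch of wiring StopAtStep into model.train (network and dataset are placeholders, and the start_profile flag is an assumption about the MindSpore Profiler version in use):

```python
# Minimal sketch: profile steps 10-20 of a training run.
import mindspore as ms
from mindspore.profiler import Profiler

from model.glide_utils.callbackConfig import StopAtStep, OverflowMonitor  # assumed path

profiler = Profiler(start_profile=False)  # defer; StopAtStep starts/stops it
callbacks = [StopAtStep(start_step=10, stop_step=20, profiler=profiler),
             OverflowMonitor()]

# model = ms.train.Model(net, loss_fn, optimizer)  # placeholders
# model.train(epoch=1, train_dataset=dataset, callbacks=callbacks)
```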
-------------------------------------------------------------------------------- 1 | # Copyright 2022 Huawei Technologies Co., Ltd 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================ 15 | 16 | import mindspore 17 | from typing import Tuple 18 | import numpy as np 19 | from PIL import Image 20 | from IPython.display import display 21 | 22 | 23 | def read_image(path: str, size: int = 256) -> Tuple[mindspore.Tensor, mindspore.Tensor]: 24 | pil_img = Image.open(path).convert('RGB') 25 | pil_img = pil_img.resize((size, size), resample=Image.BICUBIC) 26 | img = np.array(pil_img) 27 | # print("img.shape", img.shape) [64, 64, 3] 28 | dimmed = mindspore.Tensor(img)[None] 29 | reshaped = dimmed.transpose((0, 3, 1, 2)) 30 | reshaped = mindspore.ops.Cast()(reshaped, mindspore.float32) 31 | scaled = mindspore.ops.Add()(mindspore.ops.Div()(reshaped, 127.5), -1) 32 | return scaled 33 | 34 | 35 | def get_img(batch: mindspore.Tensor): 36 | batch_plus = mindspore.ops.Add()(batch, 1) 37 | scaled = mindspore.ops.Mul()(batch_plus, 127.5) 38 | rounded_scaled = mindspore.ops.Rint()(scaled) 39 | clipped_scaled = mindspore.ops.clip_by_value(rounded_scaled, mindspore.Tensor(0), mindspore.Tensor(255)) 40 | clipped_scaled = clipped_scaled.transpose((2, 0, 3, 1)) 41 | clipped_scaled = mindspore.ops.Cast()(clipped_scaled, mindspore.uint8) 42 | reshaped = clipped_scaled.reshape(([batch.shape[2], -1, 3])) 43 | return reshaped 44 | 45 | 46 | def show_images(batch: mindspore.Tensor): 47 | """ Display a batch of images inline. """ 48 | display(Image.fromarray(get_img(batch).asnumpy())) 49 | 50 | 51 | def save_images(batch: mindspore.Tensor, path: str): 52 | """ Display a batch of images inline. """ 53 | batch_32 = mindspore.ops.Cast()(batch, mindspore.float32) 54 | Image.fromarray(get_img(batch_32).asnumpy()).save(path, quality=100, subsampling=0) 55 | -------------------------------------------------------------------------------- /vision/Taichu-GLIDE/model/glide_utils/moxing_adapter.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Huawei Technologies Co., Ltd 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
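read_image and get_img above implement a simple [-1, 1] pixel-range round trip, with get_img additionally tiling the batch into one horizontal strip ((B, C, H, W) -> (H, B*W, 3)) before saving; the arithmetic in NumPy terms:

```python
# Minimal sketch of the value scaling in read_image()/get_img().
import numpy as np

pixels = np.array([0.0, 127.5, 255.0])
scaled = pixels / 127.5 - 1      # read_image: uint8 range -> [-1, 1]
restored = (scaled + 1) * 127.5  # get_img: back to [0, 255]
assert np.allclose(pixels, restored)

# get_img's transpose((2, 0, 3, 1)) turns (B, C, H, W) into (H, B, W, C);
# reshaping to (H, -1, 3) then lays the B images side by side.
```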
14 | # ============================================================================ 15 | 16 | """Moxing adapter for ModelArts""" 17 | 18 | import os 19 | import functools 20 | from mindspore import context 21 | from mindspore.profiler import Profiler 22 | from src.model_utils.config import config 23 | 24 | _global_sync_count = 0 25 | 26 | def get_device_id(): 27 | device_id = os.getenv('DEVICE_ID', '0') 28 | return int(device_id) 29 | 30 | 31 | def get_device_num(): 32 | device_num = os.getenv('RANK_SIZE', '1') 33 | return int(device_num) 34 | 35 | 36 | def get_rank_id(): 37 | global_rank_id = os.getenv('RANK_ID', '0') 38 | return int(global_rank_id) 39 | 40 | 41 | def get_job_id(): 42 | job_id = os.getenv('JOB_ID') 43 | job_id = job_id if job_id != "" else "default" 44 | return job_id 45 | 46 | def sync_data(from_path, to_path): 47 | """ 48 | Download data from remote obs to local directory if the first url is remote url and the second one is local path 49 | Upload data from local directory to remote obs in contrast. 50 | """ 51 | import moxing as mox 52 | import time 53 | global _global_sync_count 54 | sync_lock = "/tmp/copy_sync.lock" + str(_global_sync_count) 55 | _global_sync_count += 1 56 | 57 | # Each server contains 8 devices as most. 58 | if get_device_id() % min(get_device_num(), 8) == 0 and not os.path.exists(sync_lock): 59 | print("from path: ", from_path) 60 | print("to path: ", to_path) 61 | mox.file.copy_parallel(from_path, to_path) 62 | print("===finish data synchronization===") 63 | try: 64 | os.mknod(sync_lock) 65 | # print("os.mknod({}) success".format(sync_lock)) 66 | except IOError: 67 | pass 68 | print("===save flag===") 69 | 70 | while True: 71 | if os.path.exists(sync_lock): 72 | break 73 | time.sleep(1) 74 | 75 | print("Finish sync data from {} to {}.".format(from_path, to_path)) 76 | 77 | 78 | def moxing_wrapper(pre_process=None, post_process=None): 79 | """ 80 | Moxing wrapper to download dataset and upload outputs. 
81 | """ 82 | def wrapper(run_func): 83 | @functools.wraps(run_func) 84 | def wrapped_func(*args, **kwargs): 85 | # Download data from data_url 86 | if config.enable_modelarts: 87 | if config.data_url: 88 | sync_data(config.data_url, config.data_path) 89 | print("Dataset downloaded: ", os.listdir(config.data_path)) 90 | if config.checkpoint_url: 91 | sync_data(config.checkpoint_url, config.load_path) 92 | print("Preload downloaded: ", os.listdir(config.load_path)) 93 | if config.train_url: 94 | sync_data(config.train_url, config.output_path) 95 | print("Workspace downloaded: ", os.listdir(config.output_path)) 96 | 97 | context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) 98 | config.device_num = get_device_num() 99 | config.device_id = get_device_id() 100 | if not os.path.exists(config.output_path): 101 | os.makedirs(config.output_path) 102 | 103 | if pre_process: 104 | pre_process() 105 | 106 | if config.enable_profiling: 107 | profiler = Profiler() 108 | 109 | run_func(*args, **kwargs) 110 | 111 | if config.enable_profiling: 112 | profiler.analyse() 113 | 114 | # Upload data to train_url 115 | if config.enable_modelarts: 116 | if post_process: 117 | post_process() 118 | 119 | if config.train_url: 120 | print("Start to copy output directory") 121 | sync_data(config.output_path, config.train_url) 122 | return wrapped_func 123 | return wrapper 124 | -------------------------------------------------------------------------------- /vision/Taichu-GLIDE/model/glide_utils/parallelConfig.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 Huawei Technologies Co., Ltd 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================ 15 | 16 | import numpy as np 17 | import mindspore.common.dtype as mstype 18 | from mindspore import context 19 | from mindspore import nn 20 | from mindspore._checkparam import Validator 21 | from mindspore._extends import cell_attr_register 22 | from mindspore.common.initializer import initializer 23 | from mindspore.common.parameter import Parameter 24 | from mindspore.common.seed import _get_graph_seed 25 | from mindspore.common.tensor import Tensor 26 | from mindspore.context import ParallelMode 27 | 28 | 29 | class ParallelConfig: 30 | r""" 31 | ParallelConfig for the setting the global data parallel, model parallel and fusion group. 
32 | """ 33 | dp = 8 34 | mp = 1 35 | pipeline_stage = 1 36 | recompute = False 37 | optimizer_shard = False 38 | fusion_group = 1 39 | parallel_mode = ParallelMode.SEMI_AUTO_PARALLEL 40 | vocab_emb_dp = False 41 | ep = dp 42 | capacity_factor = 1.5 43 | expert_num = 32 44 | aux_loss_factor = 0.01 45 | 46 | @staticmethod 47 | def set_global_parallel_config(dp=1, 48 | mp=1, 49 | recompute=True, 50 | stages=1, 51 | optimizer_shard=True, 52 | fusion_group=4, 53 | parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL, 54 | vocab_emb_dp=True): 55 | r""" 56 | The parallel configure setting 57 | 58 | Args: 59 | dp (int): The data parallel way. Default: 1 60 | mp (int): The model parallel way. Default: 1 61 | stages (int): The number of the pipeline stage. Should be a positive value. Default: 1. 62 | optimizer_shard (bool): Enable optimizer state sharding or not. Default: True. 63 | fusion_group (int): The fusion group size of the optimizer state sharding. Default: 4. 64 | recompute (bool): Enable recomputation of the transformer block or not. Default: False. 65 | parallel_mode (ParallelMode): Can be SEMI_AUTO_PARALLEL, DATA_AUTO_PARALLEL or AUTO_PARALLEL. 66 | vocab_emb_dp (bool): Shard embedding in model parallel or data parallel. Default: True 67 | 68 | Supported Platforms: 69 | ``Ascend`` ``GPU`` 70 | 71 | Examples: 72 | >>> ParallelConfig(dp=1, mp=1) 73 | >>> ParallelConfig(stages=4) 74 | """ 75 | ParallelConfig.dp = dp 76 | ParallelConfig.mp = mp 77 | ParallelConfig.pipeline_stage = stages 78 | ParallelConfig.optimizer_shard = optimizer_shard 79 | ParallelConfig.fusion_group = fusion_group 80 | ParallelConfig.recompute = recompute 81 | ParallelConfig.parallel_mode = parallel_mode 82 | ParallelConfig.vocab_emb_dp = vocab_emb_dp -------------------------------------------------------------------------------- /vision/Taichu-GLIDE/model_configs/model_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "image_size":64, 3 | "num_channels":192, 4 | "num_res_blocks":3, 5 | "channel_mult":[1, 2, 3, 4], 6 | "num_heads":1, 7 | "num_head_channels":64, 8 | "num_heads_upsample":-1, 9 | "attention_resolutions":[2, 4, 8], 10 | "dropout":0.9, 11 | "text_ctx":128, 12 | "xf_width":512, 13 | "xf_layers":16, 14 | "xf_heads":8, 15 | "xf_final_ln":true, 16 | "n_vocab":50001, 17 | "xf_padding":true, 18 | "diffusion_steps":1000, 19 | "noise_schedule":"squaredcos_cap_v2", 20 | "timestep_respacing":"60", 21 | "use_scale_shift_norm":true, 22 | "resblock_updown":true, 23 | "use_fp16":true, 24 | "cache_text_emb":false, 25 | "inpaint":false, 26 | "super_res":false, 27 | "chinese":true, 28 | "sketch":false, 29 | "class_balanced":false, 30 | "sketch_classes":0 31 | } -------------------------------------------------------------------------------- /vision/Taichu-GLIDE/model_configs/supres_model_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "image_size":256, 3 | "num_channels":192, 4 | "num_res_blocks":2, 5 | "channel_mult":[1,1,2,2,4,4], 6 | "num_heads":1, 7 | "num_head_channels":64, 8 | "num_heads_upsample":-1, 9 | "attention_resolutions":[32, 16, 8], 10 | "dropout":0.0, 11 | "text_ctx":128, 12 | "xf_width":512, 13 | "xf_layers":16, 14 | "xf_heads":8, 15 | "xf_final_ln":true, 16 | "n_vocab":50257, 17 | "xf_padding":true, 18 | "diffusion_steps":1000, 19 | "noise_schedule":"linear", 20 | "timestep_respacing":"fast27", 21 | "use_scale_shift_norm":true, 22 | "resblock_updown":true, 23 | "use_fp16":true, 24 | 
"cache_text_emb":false, 25 | "inpaint":false, 26 | "super_res":false, 27 | "chinese":false, 28 | "sketch":false, 29 | "class_balanced":false, 30 | "sketch_classes":0 31 | } -------------------------------------------------------------------------------- /vision/Taichu-GLIDE/requirements.txt: -------------------------------------------------------------------------------- 1 | ipython 2 | regex 3 | sentencepiece 4 | blobfile 5 | toolz 6 | tqdm 7 | pathlib2 -------------------------------------------------------------------------------- /vision/Taichu-GLIDE/scripts/run_gen_finetune_dist.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | output_dir="output/" 4 | task_name="text2image_parallel" 5 | 6 | if [ $# != 3 ] 7 | then 8 | echo "Usage: 9 | bash scripts/train_caption_parallel.sh [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [RANK_TABLE_FILE]" 10 | exit 1 11 | fi 12 | 13 | if [ $1 -lt 1 ] || [ $1 -gt 8 ] 14 | then 15 | echo "error: DEVICE_NUM=$1 is not in [1,8]" 16 | exit 1 17 | fi 18 | 19 | VISIABLE_DEVICES=$2 20 | IFS="," read -r -a CANDIDATE_DEVICE <<< "$VISIABLE_DEVICES" 21 | if [ ${#CANDIDATE_DEVICE[@]} -ne $1 ] 22 | then 23 | echo "error: DEVICE_NUM=$1 is not matched with VISIABLE_DEVICES=$2" 24 | exit 1 25 | fi 26 | 27 | if [ ! -f $3 ] 28 | then 29 | echo "error: RANK_TABLE_FILE=$3 is not a file" 30 | exit 1 31 | fi 32 | 33 | export GLOG_v=3 34 | export ASCEND_GLOBAL_LOG_LEVEL=3 35 | export ASCEND_GLOBAL_EVENT_ENABLE=0 36 | export ASCEND_SLOG_PRINT_TO_STDOUT=1 37 | export HCCL_CONNECT_TIMEOUT=600 38 | 39 | #ulimit -u unlimited 40 | ulimit -SHn 65535 41 | export DEVICE_NUM=$1 42 | export RANK_SIZE=$1 43 | RANK_TABLE_FILE=$(realpath $3) 44 | export RANK_TABLE_FILE 45 | echo "RANK_TABLE_FILE=${RANK_TABLE_FILE}" 46 | 47 | rm -rf ${output_dir:?}/${task_name:?} 48 | mkdir -p ${output_dir:?}/${task_name:?} 49 | export MS_COMPILER_CACHE_PATH=${output_dir:?}/${task_name:?} 50 | export SERVER_ID=0 51 | rank_start=$((DEVICE_NUM * SERVER_ID)) 52 | 53 | for((i=0; i<${RANK_SIZE}; i++)) 54 | do 55 | export RANK_ID=$((rank_start + i)) 56 | export DEVICE_ID=${CANDIDATE_DEVICE[i]} 57 | mkdir -p ${output_dir:?}/${task_name:?}/rank_$i 58 | echo "start training for rank $RANK_ID, device $DEVICE_ID" 59 | nohup python -u src/train_txt2img.py \ 60 | --data_path=/glide/dataset/ \ 61 | --output_path=/glide/output/ \ 62 | --pretrained_model_path=/glide/pretraind_models/ \ 63 | --is_chinese=True \ 64 | --use_parallel=True \ 65 | --pretrained_model="glide_gen.ckpt" \ 66 | --cog_model="cog-pretrain.model" \ 67 | --model_config=./model_configs/model_config.json \ 68 | --image_caption_path_file="image_caption_path_file.txt" \ 69 | --save_checkpoint_steps=1000 \ 70 | --batch_size=2 \ 71 | --epochs=10 \ 72 | --start_learning_rate=1e-4 \ 73 | --end_learning_rate=1e-9 \ 74 | > $output_dir/$task_name/rank_$i/log_train 2>&1 & 75 | done 76 | -------------------------------------------------------------------------------- /vision/Taichu-GLIDE/scripts/run_infer.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # -*- coding: UTF-8 -*- 3 | # Copyright 2022 Huawei Technologies Co., Ltd 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================ 17 | 18 | output_path=/glide/glide/output/ 19 | ckpt_path=/glide/pretraind_models/ 20 | model_config_path=/glide/configs/infer_model_config_glide.yaml 21 | is_chinese=True 22 | denoise_steps=60 23 | super_res_step=27 24 | pics_generated=4 25 | tokenizer_model="cog-pretrain.model" 26 | gen_ckpt="glide_gen.ckpt" 27 | super_ckpt="glide_super_res.ckpt" 28 | srgan_ckpt="srgan.ckpt" 29 | prompts_file=./data/prompts.txt 30 | 31 | python src/txt2img.py \ 32 | --output_path=$output_path \ 33 | --ckpt_path=$ckpt_path \ 34 | --model_config_path=$model_config_path \ 35 | --is_chinese=$is_chinese \ 36 | --denoise_steps=$denoise_steps \ 37 | --super_res_step=$super_res_step \ 38 | --pics_generated=$pics_generated \ 39 | --tokenizer_model=$tokenizer_model \ 40 | --gen_ckpt=$gen_ckpt \ 41 | --super_ckpt=$super_ckpt \ 42 | --srgan_ckpt=$srgan_ckpt \ 43 | --prompts_file=$prompts_file \ 44 | 45 | 46 | -------------------------------------------------------------------------------- /vision/Taichu-GLIDE/scripts/run_super_res_finetune_dist.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | output_dir="output/" 4 | task_name="text2image_parallel" 5 | 6 | if [ $# != 3 ] 7 | then 8 | echo "Usage: 9 | bash scripts/train_caption_parallel.sh [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [RANK_TABLE_FILE]" 10 | exit 1 11 | fi 12 | 13 | if [ $1 -lt 1 ] || [ $1 -gt 8 ] 14 | then 15 | echo "error: DEVICE_NUM=$1 is not in [1,8]" 16 | exit 1 17 | fi 18 | 19 | VISIABLE_DEVICES=$2 20 | IFS="," read -r -a CANDIDATE_DEVICE <<< "$VISIABLE_DEVICES" 21 | if [ ${#CANDIDATE_DEVICE[@]} -ne $1 ] 22 | then 23 | echo "error: DEVICE_NUM=$1 is not matched with VISIABLE_DEVICES=$2" 24 | exit 1 25 | fi 26 | 27 | if [ ! 
-f $3 ]
28 | then
29 |     echo "error: RANK_TABLE_FILE=$3 is not a file"
30 |     exit 1
31 | fi
32 |
33 | export GLOG_v=3
34 | export ASCEND_GLOBAL_LOG_LEVEL=3
35 | export ASCEND_GLOBAL_EVENT_ENABLE=0
36 | export ASCEND_SLOG_PRINT_TO_STDOUT=1
37 | export HCCL_CONNECT_TIMEOUT=600
38 |
39 | #ulimit -u unlimited
40 | ulimit -SHn 65535
41 | export DEVICE_NUM=$1
42 | export RANK_SIZE=$1
43 | RANK_TABLE_FILE=$(realpath $3)
44 | export RANK_TABLE_FILE
45 | echo "RANK_TABLE_FILE=${RANK_TABLE_FILE}"
46 |
47 | rm -rf ${output_dir:?}/${task_name:?}
48 | mkdir -p ${output_dir:?}/${task_name:?}
49 | export MS_COMPILER_CACHE_PATH=${output_dir:?}/${task_name:?}
50 | export SERVER_ID=0
51 | rank_start=$((DEVICE_NUM * SERVER_ID))
52 |
53 | for((i=0; i<${RANK_SIZE}; i++))
54 | do
55 |     export RANK_ID=$((rank_start + i))
56 |     export DEVICE_ID=${CANDIDATE_DEVICE[i]}
57 |     mkdir -p ${output_dir:?}/${task_name:?}/rank_$i
58 |     echo "start training for rank $RANK_ID, device $DEVICE_ID"
59 |     nohup python -u src/train_txt2img.py \
60 |         --data_path=/glide/dataset/ \
61 |         --output_path=/glide/output/ \
62 |         --pretrained_model_path=/glide/pretraind_models/ \
63 |         --is_super_res=True \
64 |         --is_chinese=True \
65 |         --use_parallel=True \
66 |         --pretrained_model="glide_super_res.ckpt" \
67 |         --cog_model="cog-pretrain.model" \
68 |         --model_config=./model_configs/supres_model_config.json \
69 |         --image_caption_path_file="image_caption_path_file.txt" \
70 |         --save_checkpoint_steps=1000 \
71 |         --batch_size=2 \
72 |         --epochs=2 \
73 |         --start_learning_rate=1e-4 \
74 |         --end_learning_rate=1e-9 \
75 |         > $output_dir/$task_name/rank_$i/log_train 2>&1 &
76 | done
77 |
--------------------------------------------------------------------------------
/vision/stablediffusionv2/README.md:
--------------------------------------------------------------------------------
1 |
2 | # Stablediffusionv2
3 |
4 | ## Contents
5 |
6 | - [The Stablediffusionv2 Model](#the-stablediffusionv2-model)
7 | - [Requirements](#requirements)
8 | - [Quick Start](#quick-start)
9 | - [Prepare the checkpoint](#prepare-the-checkpoint)
10 | - [Text-to-image generation](#text-to-image-generation)
11 | - [Generated samples](#generated-samples)
12 |
13 | ## The Stablediffusionv2 Model
14 |
15 | The Stablediffusionv2 model was developed by the **Stability-AI** team and adapted to the MindSpore + Ascend environment by the **Huawei Ascend** team.
16 |
17 | ## Requirements
18 |
19 | 1. **Ascend hardware and software stack (driver + firmware + CANN)**
20 |
21 |    Visit the [Ascend community](), then download and install it as instructed.
22 |
23 | 2. AI framework - **MindSpore** == 1.9
24 |
25 |    Visit the [MindSpore website](), then download and install it as instructed.
26 |
27 |    For more help, see the following resources:
28 |
29 |    - [MindSpore tutorials](https://www.mindspore.cn/tutorials/zh-CN/master/index.html)
30 |    - [MindSpore Python API](https://www.mindspore.cn/docs/zh-CN/master/index.html)
31 |
32 | 3. **Third-party dependencies**
33 |
34 |    ```shell
35 |    pip install -r requirements.txt
36 |    ```
37 |
38 | ## Quick Start
39 |
40 | ### Prepare the checkpoint
41 |
42 | Download the pretrained stablediffusionv2 weights [stablediffusionv2_512.ckpt](https://download.mindspore.cn/toolkits/minddiffusion/stablediffusion/stablediffusionv2_512.ckpt) to the stablediffusionv2/models/ directory.
43 |
44 | ### Text-to-image generation
45 |
46 | To generate images from text, run txt2img.py, or run infer.sh directly with the default arguments.
47 |
48 | ```shell
49 | python txt2img.py --prompt [input text] --ckpt_path [ckpt_path] --ckpt_name [ckpt_name] \
50 |     --H [image_height] --W [image_width] --output_path [image save folder] \
51 |     --n_samples [number of images to generate]
52 | ```
53 | or
54 | ```shell
55 | bash scripts/infer.sh
56 | ```
57 |
58 | Higher resolutions require more device memory.
On an Ascend 910 chip, we can generate eight 512x512 images at the same time.
59 |
60 |
61 | ### Generated samples
62 |
63 | Below are some samples generated by our Stablediffusionv2 model, together with the corresponding `[input text]`.
64 |
65 | ```
66 | A Van Gogh style oil painting of sunflower
67 | ```
68 |
69 | ![A Van Gogh style oil painting of sunflower](demo/sunflower1.png)
70 |
71 | ```
72 | A Van Gogh style oil painting of sunflower
73 | ```
74 |
75 | ![A Van Gogh style oil painting of sunflower](demo/sunflower2.png)
76 |
77 | ```
78 | a professional photograph of an astronaut riding a horse
79 | ```
80 |
81 | ![a professional photograph of an astronaut riding a horse](demo/horse1.png)
82 |
83 | ```
84 | a professional photograph of an astronaut riding a horse
85 | ```
86 |
87 | ![a professional photograph of an astronaut riding a horse](demo/horse2.png)
88 |
89 | ```
90 | The beautiful night view of the city has various buildings, traffic flow, and lights.
91 | ```
92 |
93 | ![The beautiful night view of the city has various buildings, traffic flow, and lights.](demo/city1.png)
94 |
95 | ```
96 | The beautiful night view of the city has various buildings, traffic flow, and lights.
97 | ```
98 |
99 | ![The beautiful night view of the city has various buildings, traffic flow, and lights.](demo/city2.png)
100 |
101 | ```
102 | Modernist style, sunset, withered vines, old trees, and mountains
103 | ```
104 |
105 | ![Modernist style, sunset, withered vines, old trees, and mountains](demo/tree1.png)
106 |
107 | ```
108 | Modernist style, sunset, withered vines, old trees, and mountains
109 | ```
110 |
111 | ![Modernist style, sunset, withered vines, old trees, and mountains](demo/tree2.png)
112 |
113 |
114 |
115 |
116 |
117 |
--------------------------------------------------------------------------------
/vision/stablediffusionv2/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mindspore-lab/minddiffusion/5779fd04c17f60f277fa88e635287fcc1dd4ecc5/vision/stablediffusionv2/__init__.py
--------------------------------------------------------------------------------
/vision/stablediffusionv2/configs/train_config.json:
--------------------------------------------------------------------------------
1 | {
2 |     "model_config": "configs/v1-train-chinese.yaml",
3 |     "pretrained_model_path": "models/",
4 |     "pretrained_model_file":"wukong-huahua-ms.ckpt",
5 |     "data_path": "/secHome/FFHQ",
6 |     "train_batch_size": 3,
7 |     "gradient_accumulation_steps": 1,
8 |     "optim": "adamw",
9 |     "patch_size":32,
10 |     "epochs": 20,
11 |     "betas": [
12 |         0.9,
13 |         0.98
14 |     ],
15 |     "dropout": 0.1,
16 |     "weight_decay": 0.01,
17 |     "warmup_steps": 1000,
18 |     "seed": 3407,
19 |     "image_size": 512,
20 |     "image_filter_size": 256,
21 |     "random_crop": false,
22 |     "filter_small_size": true,
23 |     "start_learning_rate": 1e-5,
24 |     "end_learning_rate": 1e-7,
25 |     "decay_steps": 0,
26 |     "save_checkpoint_steps": 10000
27 | }
28 |
--------------------------------------------------------------------------------
/vision/stablediffusionv2/configs/v2-inference.yaml:
--------------------------------------------------------------------------------
1 | model:
2 |   base_learning_rate: 1.0e-04
3 |   target: ldm.models.diffusion.ddpm.LatentDiffusion
4 |   params:
5 |     linear_start: 0.00085
6 |     linear_end: 0.0120
7 |     num_timesteps_cond: 1
8 |     log_every_t: 200
9 |     timesteps: 1000
10 |     first_stage_key: "jpg"
11 |     cond_stage_key: "txt"
12 |     image_size: 64
13 |     channels: 4
14 |     cond_stage_trainable: false # Note: different from the one we trained before
15 |     conditioning_key: crossattn
16 |     monitor: val/loss_simple_ema
17 |     scale_factor: 0.18215
18 |     use_ema: False
19 |     use_fp16: True
20 |
21 |     unet_config:
22 |       target:
ldm.modules.diffusionmodules.openaimodel.UNetModel 23 | params: 24 | image_size: 32 # unused 25 | in_channels: 4 26 | out_channels: 4 27 | model_channels: 320 28 | attention_resolutions: [ 4, 2, 1 ] 29 | num_res_blocks: 2 30 | channel_mult: [ 1, 2, 4, 4 ] 31 | num_head_channels: 64 32 | use_spatial_transformer: True 33 | use_linear_in_transformer: True 34 | transformer_depth: 1 35 | context_dim: 1024 36 | use_checkpoint: True 37 | legacy: False 38 | use_fp16: True 39 | 40 | first_stage_config: 41 | target: ldm.models.autoencoder.AutoencoderKL 42 | params: 43 | embed_dim: 4 44 | monitor: val/rec_loss 45 | use_fp16: True 46 | ddconfig: 47 | double_z: true 48 | z_channels: 4 49 | resolution: 256 50 | in_channels: 3 51 | out_ch: 3 52 | ch: 128 53 | ch_mult: 54 | - 1 55 | - 2 56 | - 4 57 | - 4 58 | num_res_blocks: 2 59 | attn_resolutions: [] 60 | dropout: 0.0 61 | 62 | cond_stage_config: 63 | target: ldm.modules.encoders.modules.FrozenCLIPEmbedder_ZH 64 | params: 65 | use_fp16: True 66 | -------------------------------------------------------------------------------- /vision/stablediffusionv2/configs/v2-train.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | base_learning_rate: 1.0e-05 3 | target: ldm.models.diffusion.ddpm.LatentDiffusion 4 | params: 5 | linear_start: 0.00085 6 | linear_end: 0.0120 7 | num_timesteps_cond: 1 8 | log_every_t: 200 9 | timesteps: 1000 10 | first_stage_key: "image" 11 | cond_stage_key: "caption" 12 | image_size: 64 13 | channels: 4 14 | conditioning_key: crossattn 15 | monitor: val/loss_simple_ema 16 | scale_factor: 0.18215 17 | use_ema: False 18 | use_fp16: True 19 | 20 | unet_config: 21 | target: ldm.modules.diffusionmodules.openaimodel.UNetModel 22 | params: 23 | image_size: 32 # unused 24 | in_channels: 4 25 | out_channels: 4 26 | model_channels: 320 27 | attention_resolutions: [ 4, 2, 1 ] 28 | num_res_blocks: 2 29 | channel_mult: [ 1, 2, 4, 4 ] 30 | num_heads: 8 31 | use_spatial_transformer: True 32 | transformer_depth: 1 33 | context_dim: 768 34 | use_checkpoint: True 35 | legacy: False 36 | use_fp16: True 37 | dropout: 0.1 38 | 39 | first_stage_config: 40 | target: ldm.models.autoencoder.AutoencoderKL 41 | params: 42 | embed_dim: 4 43 | monitor: val/rec_loss 44 | use_fp16: True 45 | ddconfig: 46 | double_z: true 47 | z_channels: 4 48 | resolution: 256 49 | in_channels: 3 50 | out_ch: 3 51 | ch: 128 52 | ch_mult: 53 | - 1 54 | - 2 55 | - 4 56 | - 4 57 | num_res_blocks: 2 58 | attn_resolutions: [] 59 | 60 | cond_stage_config: 61 | target: ldm.modules.encoders.modules.FrozenCLIPEmbedder_ZH 62 | params: 63 | use_fp16: True 64 | -------------------------------------------------------------------------------- /vision/stablediffusionv2/demo/city1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mindspore-lab/minddiffusion/5779fd04c17f60f277fa88e635287fcc1dd4ecc5/vision/stablediffusionv2/demo/city1.png -------------------------------------------------------------------------------- /vision/stablediffusionv2/demo/city2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mindspore-lab/minddiffusion/5779fd04c17f60f277fa88e635287fcc1dd4ecc5/vision/stablediffusionv2/demo/city2.png -------------------------------------------------------------------------------- /vision/stablediffusionv2/demo/horse1.png: -------------------------------------------------------------------------------- 
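Both YAML files above use the latent-diffusion `target:` + `params:` convention, where target names a class by dotted path and params are its constructor kwargs. Codebases in this family resolve that with a small importlib helper; this repo's ldm/util.py very likely ships an equivalent instantiate_from_config, so the sketch below is a generic reconstruction, not its verbatim API:

```python
# Generic sketch of the "target + params" resolution used by the YAML configs.
import importlib


def get_obj_from_str(string):
    module, cls = string.rsplit(".", 1)
    return getattr(importlib.import_module(module), cls)


def instantiate_from_config(config):
    return get_obj_from_str(config["target"])(**config.get("params", dict()))


cond_cfg = {
    "target": "ldm.modules.encoders.modules.FrozenCLIPEmbedder_ZH",
    "params": {"use_fp16": True},
}
# encoder = instantiate_from_config(cond_cfg)  # resolves and constructs the class
```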
https://raw.githubusercontent.com/mindspore-lab/minddiffusion/5779fd04c17f60f277fa88e635287fcc1dd4ecc5/vision/stablediffusionv2/demo/horse1.png -------------------------------------------------------------------------------- /vision/stablediffusionv2/demo/horse2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mindspore-lab/minddiffusion/5779fd04c17f60f277fa88e635287fcc1dd4ecc5/vision/stablediffusionv2/demo/horse2.png -------------------------------------------------------------------------------- /vision/stablediffusionv2/demo/sunflower1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mindspore-lab/minddiffusion/5779fd04c17f60f277fa88e635287fcc1dd4ecc5/vision/stablediffusionv2/demo/sunflower1.png -------------------------------------------------------------------------------- /vision/stablediffusionv2/demo/sunflower2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mindspore-lab/minddiffusion/5779fd04c17f60f277fa88e635287fcc1dd4ecc5/vision/stablediffusionv2/demo/sunflower2.png -------------------------------------------------------------------------------- /vision/stablediffusionv2/demo/tree1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mindspore-lab/minddiffusion/5779fd04c17f60f277fa88e635287fcc1dd4ecc5/vision/stablediffusionv2/demo/tree1.png -------------------------------------------------------------------------------- /vision/stablediffusionv2/demo/tree2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mindspore-lab/minddiffusion/5779fd04c17f60f277fa88e635287fcc1dd4ecc5/vision/stablediffusionv2/demo/tree2.png -------------------------------------------------------------------------------- /vision/stablediffusionv2/ldm/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mindspore-lab/minddiffusion/5779fd04c17f60f277fa88e635287fcc1dd4ecc5/vision/stablediffusionv2/ldm/data/__init__.py -------------------------------------------------------------------------------- /vision/stablediffusionv2/ldm/data/t2i_collate.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Huawei Technologies Co., Ltd 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ============================================================================ 15 | 16 | 17 | from toolz.sandbox import unzip 18 | 19 | 20 | data_column = [ 21 | 'img_feat', 22 | 'txt_tokens' 23 | ] 24 | 25 | 26 | def t2i_collate(inputs): 27 | """ 28 | Return: 29 | :img_feat (batch_size, height, width, 3) 30 | :txt_tokens (n, max_txt_len) 31 | """ 32 | img_feat, txt_tokens = map(list, unzip(inputs)) 33 | batch = { 34 | 'img_feat': img_feat, 35 | 'txt_tokens': txt_tokens, 36 | } 37 | return batch -------------------------------------------------------------------------------- /vision/stablediffusionv2/ldm/models/autoencoder.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 Huawei Technologies Co., Ltd 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================ 15 | import mindspore as ms 16 | import mindspore.nn as nn 17 | import mindspore.ops as P 18 | 19 | from ldm.modules.diffusionmodules.model import Encoder, Decoder 20 | from ldm.modules.distributions.distributions import DiagonalGaussianDistribution 21 | 22 | class AutoencoderKL(nn.Cell): 23 | def __init__(self, 24 | ddconfig, 25 | embed_dim, 26 | ckpt_path=None, 27 | ignore_keys=[], 28 | image_key="image", 29 | colorize_nlabels=None, 30 | monitor=None, 31 | use_fp16=False 32 | ): 33 | super().__init__() 34 | self.dtype = ms.float16 if use_fp16 else ms.float32 35 | self.image_key = image_key 36 | self.encoder = Encoder(dtype=self.dtype, **ddconfig) 37 | self.decoder = Decoder(dtype=self.dtype, **ddconfig) 38 | assert ddconfig["double_z"] 39 | self.quant_conv = nn.Conv2d(2*ddconfig["z_channels"], 2*embed_dim, 1, pad_mode="valid", has_bias=True).to_float(self.dtype) 40 | self.post_quant_conv = nn.Conv2d(embed_dim, ddconfig["z_channels"], 1, pad_mode="valid", has_bias=True).to_float(self.dtype) 41 | self.embed_dim = embed_dim 42 | if colorize_nlabels is not None: 43 | assert type(colorize_nlabels)==int 44 | self.colorize = ms.Parameter(ms.ops.standard_normal((3, colorize_nlabels, 1, 1)), name="colorize", requires_grad=False)  # nn.Cell has no register_buffer; keep as a frozen Parameter 45 | if monitor is not None: 46 | self.monitor = monitor 47 | if ckpt_path is not None: 48 | self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys) 49 | 50 | self.split = P.Split(axis=1, output_num=2) 51 | self.exp = P.Exp() 52 | self.stdnormal = P.StandardNormal() 53 | 54 | def init_from_ckpt(self, path, ignore_keys=list()): 55 | sd = ms.load_checkpoint(path)["state_dict"] 56 | keys = list(sd.keys()) 57 | for k in keys: 58 | for ik in ignore_keys: 59 | if k.startswith(ik): 60 | print("Deleting key {} from state_dict.".format(k)) 61 | del sd[k] 62 | ms.load_param_into_net(self, sd, strict_load=False) 63 | print(f"Restored from {path}") 64 | 65 | def decode(self, z): 66 | z = self.post_quant_conv(z) 67 | dec = self.decoder(z) 68 | return dec 69 | 70 | def encode(self, x): 71 | h = self.encoder(x) 72 | moments = self.quant_conv(h) 73 | mean, logvar = self.split(moments) 74 | logvar =
P.clip_by_value(logvar, -30.0, 20.0) 75 | std = self.exp(0.5 * logvar) 76 | x = mean + std * self.stdnormal(mean.shape) 77 | return x 78 | -------------------------------------------------------------------------------- /vision/stablediffusionv2/ldm/models/clip_zh/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mindspore-lab/minddiffusion/5779fd04c17f60f277fa88e635287fcc1dd4ecc5/vision/stablediffusionv2/ldm/models/clip_zh/__init__.py -------------------------------------------------------------------------------- /vision/stablediffusionv2/ldm/models/clip_zh/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mindspore-lab/minddiffusion/5779fd04c17f60f277fa88e635287fcc1dd4ecc5/vision/stablediffusionv2/ldm/models/clip_zh/bpe_simple_vocab_16e6.txt.gz -------------------------------------------------------------------------------- /vision/stablediffusionv2/ldm/models/diffusion/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mindspore-lab/minddiffusion/5779fd04c17f60f277fa88e635287fcc1dd4ecc5/vision/stablediffusionv2/ldm/models/diffusion/__init__.py -------------------------------------------------------------------------------- /vision/stablediffusionv2/ldm/models/diffusion/dpm_solver/__init__.py: -------------------------------------------------------------------------------- 1 | from .sampler import DPMSolverSampler -------------------------------------------------------------------------------- /vision/stablediffusionv2/ldm/models/diffusion/dpm_solver/sampler.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Huawei Technologies Co., Ltd 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================ 15 | """SAMPLING ONLY.""" 16 | 17 | import mindspore as ms 18 | from mindspore import ops 19 | from .dpm_solver import NoiseScheduleVP, model_wrapper, DPM_Solver 20 | 21 | 22 | class DPMSolverSampler(object): 23 | def __init__(self, model, **kwargs): 24 | super().__init__() 25 | self.model = model 26 | self.register_buffer('alphas_cumprod', model.alphas_cumprod) 27 | 28 | def register_buffer(self, name, attr): 29 | setattr(self, name, attr) 30 | 31 | def sample(self, 32 | S, 33 | batch_size, 34 | shape, 35 | conditioning=None, 36 | callback=None, 37 | normals_sequence=None, 38 | img_callback=None, 39 | quantize_x0=False, 40 | eta=0., 41 | mask=None, 42 | x0=None, 43 | temperature=1., 44 | noise_dropout=0., 45 | score_corrector=None, 46 | corrector_kwargs=None, 47 | verbose=True, 48 | x_T=None, 49 | log_every_t=100, 50 | unconditional_guidance_scale=1., 51 | unconditional_conditioning=None, 52 | # this has to come in the same format as the conditioning, # e.g. as encoded tokens, ... 
53 | **kwargs 54 | ): 55 | if conditioning is not None: 56 | if isinstance(conditioning, dict): 57 | cbs = conditioning[list(conditioning.keys())[0]].shape[0] 58 | if cbs != batch_size: 59 | print(f"Warning: Got {cbs} conditionings but batch-size is {batch_size}") 60 | else: 61 | if conditioning.shape[0] != batch_size: 62 | print(f"Warning: Got {conditioning.shape[0]} conditionings but batch-size is {batch_size}") 63 | 64 | # sampling 65 | C, H, W = shape 66 | size = (batch_size, C, H, W) 67 | 68 | # print(f'Data shape for DPM-Solver sampling is {size}, sampling steps {S}') 69 | 70 | if x_T is None: 71 | img = ops.standard_normal(size) 72 | else: 73 | img = x_T 74 | 75 | ns = NoiseScheduleVP('discrete', alphas_cumprod=self.alphas_cumprod) 76 | 77 | model_fn = model_wrapper( 78 | lambda x, t, c: self.model.apply_model(x, t, c), 79 | ns, 80 | model_type="noise", 81 | guidance_type="classifier-free", 82 | condition=conditioning, 83 | unconditional_condition=unconditional_conditioning, 84 | guidance_scale=unconditional_guidance_scale, 85 | ) 86 | 87 | dpm_solver = DPM_Solver(model_fn, ns, predict_x0=True, thresholding=False) 88 | 89 | x = dpm_solver.sample(ops.Cast()(img, ms.float16), steps=S, skip_type="time_uniform", 90 | method="multistep", order=2, lower_order_final=True) 91 | 92 | return x, None -------------------------------------------------------------------------------- /vision/stablediffusionv2/ldm/modules/diffusionmodules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mindspore-lab/minddiffusion/5779fd04c17f60f277fa88e635287fcc1dd4ecc5/vision/stablediffusionv2/ldm/modules/diffusionmodules/__init__.py -------------------------------------------------------------------------------- /vision/stablediffusionv2/ldm/modules/diffusionmodules/upscaling.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mindspore-lab/minddiffusion/5779fd04c17f60f277fa88e635287fcc1dd4ecc5/vision/stablediffusionv2/ldm/modules/diffusionmodules/upscaling.py -------------------------------------------------------------------------------- /vision/stablediffusionv2/ldm/modules/distributions/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mindspore-lab/minddiffusion/5779fd04c17f60f277fa88e635287fcc1dd4ecc5/vision/stablediffusionv2/ldm/modules/distributions/__init__.py -------------------------------------------------------------------------------- /vision/stablediffusionv2/ldm/modules/distributions/distributions.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Huawei Technologies Co., Ltd 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ============================================================================ 15 | import mindspore as ms 16 | import mindspore.ops as ops 17 | 18 | class DiagonalGaussianDistribution(object): 19 | def __init__(self, parameters, deterministic=False): 20 | 21 | self.mean, self.logvar = ops.Split(axis=1, output_num=2)(parameters) 22 | self.logvar = ops.clip_by_value(self.logvar, -30.0, 20.0) 23 | self.deterministic = deterministic 24 | self.std = ops.exp(0.5 * self.logvar) 25 | self.stdnormal = ops.StandardNormal() 26 | 27 | def sample(self): 28 | x = self.mean + self.std * self.stdnormal(self.mean.shape) 29 | return x -------------------------------------------------------------------------------- /vision/stablediffusionv2/ldm/modules/encoders/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mindspore-lab/minddiffusion/5779fd04c17f60f277fa88e635287fcc1dd4ecc5/vision/stablediffusionv2/ldm/modules/encoders/__init__.py -------------------------------------------------------------------------------- /vision/stablediffusionv2/ldm/modules/encoders/modules.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 Huawei Technologies Co., Ltd 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================ 15 | import mindspore as ms 16 | import mindspore.nn as nn 17 | import mindspore.ops as ops 18 | from mindspore import Tensor 19 | from ldm.models.clip_zh.simple_tokenizer import tokenize 20 | from .text_encoder import TextEncoder 21 | 22 | 23 | class FrozenCLIPEmbedder_ZH(nn.Cell): 24 | def __init__(self, max_length=77, use_fp16=False): 25 | super(FrozenCLIPEmbedder_ZH, self).__init__() 26 | self.dtype = ms.float16 if use_fp16 else ms.float32 27 | self.max_length = max_length 28 | self.tokenizer = tokenize 29 | self.transformer = TextEncoder(context_length=77, vocab_size=49408, output_dim=1024, width=1024, layers=23, heads=16, dtype=self.dtype) 30 | 31 | def tokenize(self, texts): 32 | return self.tokenizer(texts) 33 | 34 | def encode(self, text): 35 | batch_encoding = self.tokenize(text) 36 | outputs = self.transformer(batch_encoding) 37 | return outputs 38 | 39 | def construct(self, c): 40 | outputs = self.transformer(c) 41 | return outputs 42 | -------------------------------------------------------------------------------- /vision/stablediffusionv2/ldm/modules/train/callback.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 Huawei Technologies Co., Ltd 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================ 15 | 16 | import mindspore as ms 17 | 18 | 19 | class OverflowMonitor(ms.Callback): 20 | def step_end(self, run_context): 21 | cb_params = run_context.original_args() 22 | cur_epoch_num = cb_params.get("cur_epoch_num", 1) 23 | cur_step_in_epoch = (cb_params.cur_step_num - 1) % cb_params.batch_num + 1 24 | overflow = cb_params.net_outputs[1] 25 | if overflow: 26 | print(f"overflow detected in epoch {cur_epoch_num} step {cur_step_in_epoch}") 27 | return super().step_end(run_context) -------------------------------------------------------------------------------- /vision/stablediffusionv2/ldm/modules/train/learningrate.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 Huawei Technologies Co., Ltd 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================ 15 | """ 16 | Learning rate schedules for training. 17 | This is an experimental interface that is subject to change and/or deletion.
18 | """ 19 | 20 | from mindspore.ops import operations as P 21 | import mindspore.common.dtype as mstype 22 | from mindspore.common.tensor import Tensor 23 | from mindspore.nn.learning_rate_schedule import LearningRateSchedule, PolynomialDecayLR, WarmUpLR, CosineDecayLR 24 | import numpy as np 25 | 26 | 27 | class LearningRate(LearningRateSchedule): 28 | """ 29 | Learning rate scheduler 30 | """ 31 | 32 | def __init__(self, 33 | start_learning_rate, 34 | end_learning_rate, 35 | warmup_steps, 36 | decay_steps, 37 | power=1.0, 38 | use_cosine=True): 39 | super(LearningRate, self).__init__() 40 | self.warmup_flag = False 41 | if warmup_steps > 0: 42 | self.warmup_flag = True 43 | self.warmup_lr = WarmUpLR(start_learning_rate, warmup_steps) 44 | self.decay_lr = PolynomialDecayLR(start_learning_rate, end_learning_rate, decay_steps, power) 45 | self.cosine_decay_lr = CosineDecayLR(end_learning_rate, start_learning_rate, decay_steps) 46 | self.warmup_steps = Tensor(np.array([warmup_steps]).astype(np.float32)) 47 | self.greater = P.Greater() 48 | self.one = Tensor(np.array([1.0]).astype(np.float32)) 49 | self.cast = P.Cast() 50 | self.use_cosine = use_cosine 51 | 52 | def construct(self, global_step): 53 | """Learning rate scheduler construct""" 54 | if not self.use_cosine: 55 | decay_lr = self.decay_lr(global_step) 56 | else: 57 | decay_lr = self.cosine_decay_lr(global_step) 58 | if self.warmup_flag: 59 | is_warmup = self.cast(self.greater(self.warmup_steps, global_step), mstype.float32) 60 | warmup_lr = self.warmup_lr(global_step) 61 | lr = (self.one - is_warmup) * decay_lr + is_warmup * warmup_lr 62 | else: 63 | lr = decay_lr 64 | return lr 65 | 66 | -------------------------------------------------------------------------------- /vision/stablediffusionv2/ldm/modules/train/optim.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 Huawei Technologies Co., Ltd 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
14 | # ============================================================================ 15 | """ 16 | Build the optimizer for MindSpore. 17 | """ 18 | from mindspore.nn.optim.adam import Adam, AdamWeightDecay 19 | 20 | 21 | def build_optimizer(model, opts, lr): 22 | """ 23 | 24 | :param model: network whose trainable parameters will be optimized 25 | :param opts: options object carrying `optim` ('adam' or 'adamw') and `betas` 26 | :param lr: learning rate value or schedule 27 | :return: optimizer 28 | """ 29 | 30 | decay_filter = lambda x: 'layernorm' not in x.name.lower() and "bias" not in x.name.lower() 31 | param_optimizer = model.trainable_params() 32 | decay_params = list(filter(decay_filter, param_optimizer)) 33 | other_params = list(filter(lambda x: not decay_filter(x), param_optimizer)) 34 | group_params = [{ 35 | 'params': decay_params, 36 | 'weight_decay': 1e-6 37 | }, { 38 | 'params': other_params, 39 | 'weight_decay': 0.0 40 | }, { 41 | 'order_params': param_optimizer 42 | }] 43 | if opts.optim == 'adam': 44 | OptimCls = Adam 45 | elif opts.optim == 'adamw': 46 | OptimCls = AdamWeightDecay 47 | else: 48 | raise ValueError('invalid optimizer') 49 | optimizer = OptimCls(group_params, 50 | learning_rate=lr, beta1=opts.betas[0], beta2=opts.betas[1]) 51 | return optimizer 52 | -------------------------------------------------------------------------------- /vision/stablediffusionv2/ldm/modules/train/parallel_config.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 Huawei Technologies Co., Ltd 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================ 15 | """Parallel configuration.""" 16 | 17 | import math 18 | 19 | import numpy as np 20 | import mindspore.common.dtype as mstype 21 | from mindspore.context import ParallelMode 22 | 23 | class ParallelConfig: 24 | r""" 25 | ParallelConfig for setting the global data parallel, model parallel and fusion group. 26 | """ 27 | dp = 8 28 | mp = 1 29 | pipeline_stage = 1 30 | recompute = False 31 | optimizer_shard = False 32 | fusion_group = 1 33 | parallel_mode = ParallelMode.SEMI_AUTO_PARALLEL 34 | vocab_emb_dp = False 35 | ep = dp 36 | capacity_factor = 1.5 37 | expert_num = 32 38 | aux_loss_factor = 0.01 39 | 40 | @staticmethod 41 | def set_global_parallel_config(dp=1, 42 | mp=1, 43 | recompute=True, 44 | stages=1, 45 | optimizer_shard=True, 46 | fusion_group=4, 47 | parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL, 48 | vocab_emb_dp=True): 49 | r""" 50 | The parallel configuration setting 51 | 52 | Args: 53 | dp (int): The data parallel way. Default: 1 54 | mp (int): The model parallel way. Default: 1 55 | stages (int): The number of the pipeline stage. Should be a positive value. Default: 1. 56 | optimizer_shard (bool): Enable optimizer state sharding or not. Default: True. 57 | fusion_group (int): The fusion group size of the optimizer state sharding. Default: 4. 58 | recompute (bool): Enable recomputation of the transformer block or not. Default: True.
59 | parallel_mode (ParallelMode): Can be SEMI_AUTO_PARALLEL, DATA_PARALLEL or AUTO_PARALLEL. 60 | vocab_emb_dp (bool): Shard embedding in model parallel or data parallel. Default: True 61 | 62 | Supported Platforms: 63 | ``Ascend`` ``GPU`` 64 | 65 | Examples: 66 | >>> ParallelConfig.set_global_parallel_config(dp=1, mp=1) 67 | >>> ParallelConfig.set_global_parallel_config(stages=4) 68 | """ 69 | ParallelConfig.dp = dp 70 | ParallelConfig.mp = mp 71 | ParallelConfig.pipeline_stage = stages 72 | ParallelConfig.optimizer_shard = optimizer_shard 73 | ParallelConfig.fusion_group = fusion_group 74 | ParallelConfig.recompute = recompute 75 | ParallelConfig.parallel_mode = parallel_mode 76 | ParallelConfig.vocab_emb_dp = vocab_emb_dp 77 | -------------------------------------------------------------------------------- /vision/stablediffusionv2/ldm/modules/train/tools.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Huawei Technologies Co., Ltd 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================ 15 | """ 16 | Copyright (c) Microsoft Corporation. 17 | Licensed under the MIT license. 18 | 19 | Misc utilities 20 | """ 21 | import json 22 | import os 23 | import sys 24 | import random 25 | import numpy as np 26 | import mindspore as ms 27 | 28 | class NoOp: 29 | """ useful for distributed training No-Ops """ 30 | 31 | def __getattr__(self, name): 32 | return self.noop 33 | 34 | def noop(self, *args, **kwargs): 35 | return 36 | 37 | 38 | def parse_with_config(args): 39 | """Parse With Config""" 40 | if args.train_config is not None: 41 | abs_path = os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), "../../..")) 42 | args.train_config = os.path.join(abs_path, args.train_config) 43 | config_args = json.load(open(args.train_config)) 44 | override_keys = {arg[2:].split('=')[0] for arg in sys.argv[1:] 45 | if arg.startswith('--')} 46 | for k, v in config_args.items(): 47 | if k not in override_keys: 48 | setattr(args, k, v) 49 | return args 50 | 51 | 52 | def set_random_seed(seed): 53 | """Set Random Seed""" 54 | print("random seed: ", seed) 55 | random.seed(seed) 56 | np.random.seed(seed) 57 | ms.set_seed(seed) 58 | 59 | class Struct: 60 | def __init__(self, dict_): 61 | self.__dict__.update(dict_) 62 | -------------------------------------------------------------------------------- /vision/stablediffusionv2/ldm/util.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 Huawei Technologies Co., Ltd 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================ 15 | import importlib 16 | from inspect import isfunction 17 | import mindspore.ops as ops 18 | 19 | 20 | def exists(x): 21 | return x is not None 22 | 23 | 24 | def default(val, d): 25 | if exists(val): 26 | return val 27 | return d() if isfunction(d) else d 28 | 29 | 30 | def count_params(model, verbose=False): 31 | total_params = sum(p.size for p in model.get_parameters())  # MindSpore Cells expose get_parameters(); Tensor.size is the element count 32 | if verbose: 33 | print(f"{model.__class__.__name__} has {total_params * 1.e-6:.2f} M params.") 34 | return total_params 35 | 36 | 37 | def instantiate_from_config(config): 38 | if "target" not in config: 39 | if config == '__is_first_stage__': 40 | return None 41 | elif config == "__is_unconditional__": 42 | return None 43 | raise KeyError("Expected key `target` to instantiate.") 44 | return get_obj_from_str(config["target"])(**config.get("params", dict())) 45 | 46 | 47 | def get_obj_from_str(string, reload=False): 48 | module, cls = string.rsplit(".", 1) 49 | if reload: 50 | module_imp = importlib.import_module(module) 51 | importlib.reload(module_imp) 52 | return getattr(importlib.import_module(module, package=None), cls) 53 | 54 | def extract_into_tensor(a, t, x_shape): 55 | b = t.shape[0] 56 | out = ops.GatherD()(a, -1, t) 57 | return out.reshape(b, *((1,) * (len(x_shape) - 1))) 58 | -------------------------------------------------------------------------------- /vision/stablediffusionv2/requirements.txt: -------------------------------------------------------------------------------- 1 | opencv-python 2 | omegaconf 3 | einops 4 | ftfy 5 | regex 6 | albumentations 7 | pandas 8 | imagesize 9 | toolz 10 | pillow 11 | -------------------------------------------------------------------------------- /vision/stablediffusionv2/scripts/infer.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2022 Huawei Technologies Co., Ltd 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # --prompt "A Van Gogh style oil painting of sunflower" \ 16 | # --prompt "a professional photograph of an astronaut riding a horse" \ 17 | # --prompt "The beautiful night view of the city has various buildings, traffic flow, and lights."
\ 18 | # ============================================================================ 19 | 20 | export GLOG_v=3 21 | export ASCEND_GLOBAL_LOG_LEVEL=3 22 | export ASCEND_SLOG_PRINT_TO_STDOUT=0 23 | export DEVICE_ID=0 24 | 25 | python txt2img.py \ 26 | --prompt "Modernist style, sunset, withered vines, old trees, and mountains" \ 27 | --config configs/v2-inference.yaml \ 28 | --output_path ./output/ \ 29 | --seed 42 \ 30 | --n_iter 4 \ 31 | --n_samples 8 \ 32 | --W 512 \ 33 | --H 512 \ 34 | --ddim_steps 50 \ 35 | -------------------------------------------------------------------------------- /vision/wukong-huahua/README_EN.md: -------------------------------------------------------------------------------- 1 | # Wukong-Huahua 2 | ## Contents 3 | 4 | [View in Chinese](./README.md) 5 | 6 | - [Wukong-Huahua Model](#wukong-huahua-model) 7 | - [Environment Requirements](#environment-requirements) 8 | - [Quick Start](#quick-start) 9 | - [Prepare Checkpoint](#prepare-checkpoint) 10 | - [Text to Image Generation](#text-to-image-generation) 11 | - [Fine-tuning](#fine-tuning) 12 | - [Demos](#demos) 13 | 14 | ## Wukong-Huahua Model 15 | 16 | Wukong-Huahua is a diffusion-based model that performs the text-to-image task in Chinese. It was developed by the **Huawei Noah's Ark Lab** in cooperation with the **Distributed & Parallel Software Lab** and the **Ascend Product Development Unit**, trained on the [Wukong dataset](https://wukong-dataset.github.io/wukong-dataset/), and implemented with [MindSpore](https://www.mindspore.cn/en) + Ascend, a combined software and hardware solution. You are welcome to try Wukong-Huahua on [Our Online Platform](https://xihe.mindspore.cn/modelzoo/wukong). 17 | 18 | ## Environment Requirements 19 | 20 | 1. **Ascend** Software + Hardware Solution (Driver + Firmware + CANN) 21 | 22 | Go to the [Ascend website](). Follow the instructions to download and install. 23 | 2. AI Framework - **MindSpore** == 1.9 24 | 25 | Go to the [MindSpore website](https://www.mindspore.cn/en "MindSpore") and install version 1.9 following the instructions. 26 | 27 | If you need more help with MindSpore, please check: 28 | - [MindSpore Tutorial](https://www.mindspore.cn/tutorials/en/master/index.html) 29 | - [MindSpore Python API](https://www.mindspore.cn/docs/en/master/index.html) 30 | 31 | 3. Third-party dependencies 32 | ```shell 33 | pip install -r requirements.txt 34 | ``` 35 | 36 | 37 | ## Quick Start 38 | 39 | ### Prepare Checkpoint 40 | 41 | Download the Wukong-Huahua pretrained checkpoint [wukong-huahua-ms.ckpt](https://download.mindspore.cn/toolkits/minddiffusion/wukong-huahua/wukong-huahua-ms.ckpt) and place it under the wukong-huahua/models/ folder. 42 | 43 | For the fine-tuning task, we provide example datasets to show the expected format; please download them [here](https://opt-release.obs.cn-central-221.ovaijisuan.com/wukonghuahua/dataset.tar.gz). 44 | 45 | ### Text to Image Generation 46 | 47 | To generate images according to input text, run txt2img.py or simply run infer.sh with default arguments. 48 | 49 | ```shell 50 | python txt2img.py --prompt [input text] --ckpt_path [ckpt_path] --ckpt_name [ckpt_name] \ 51 | --H [image_height] --W [image_width] --output_path [image save folder] \ 52 | --n_samples [number of images to generate] 53 | ``` 54 | or 55 | ```shell 56 | bash scripts/infer.sh 57 | ``` 58 | 59 | Generating higher-resolution images requires more memory. On the Ascend 910 chip, we can generate 2 1024x768 images or 16 512x512 images at the same time.
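In other words, resolution can be traded against batch size on a single chip. A minimal usage sketch built from the flags documented above (the checkpoint location, output folder, and prompt are assumptions based on the defaults and demos in this README):

```shell
# 2 images at 1024x768 (higher resolution, smaller batch)
python txt2img.py --prompt "海上日出时候的奔跑者" \
    --ckpt_path models/ --ckpt_name wukong-huahua-ms.ckpt \
    --H 768 --W 1024 --output_path ./output/ --n_samples 2

# 16 images at 512x512 (lower resolution, larger batch)
python txt2img.py --prompt "海上日出时候的奔跑者" \
    --ckpt_path models/ --ckpt_name wukong-huahua-ms.ckpt \
    --H 512 --W 512 --output_path ./output/ --n_samples 16
```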
60 | 61 | ### Fine-tuning 62 | 63 | - Single-card fine-tuning: 64 | 65 | Modify the related configs in scripts/run_train.sh 66 | 67 | ``` 68 | bash scripts/run_train.sh 69 | ``` 70 | 71 | - Multi-card fine-tuning: 72 | 73 | Modify the related configs in scripts/run_train_parallel.sh 74 | 75 | ``` 76 | bash scripts/run_train_parallel.sh [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [RANK_TABLE_FILE] 77 | ``` 78 | 79 | ### Demos 80 | 81 | Below are some of the images generated by our Wukong-Huahua model and the corresponding `[input text]`: 82 | 83 | ``` 84 | 城市夜景 赛博朋克 格雷格·鲁特科夫斯基 85 | ``` 86 | 87 | ![城市夜景 赛博朋克 格雷格·鲁特科夫斯基](demo/城市夜景%20赛博朋克%20格雷格·鲁特科夫斯基.png) 88 | 89 | ``` 90 | 莫奈 撑阳伞的女人 月亮 梦幻 91 | ``` 92 | 93 | ![莫奈 撑阳伞的女人 月亮 梦幻](demo/莫奈%20撑阳伞的女人%20月亮%20梦幻.png) 94 | 95 | ``` 96 | 海上日出时候的奔跑者 97 | ``` 98 | 99 | ![海上日出时候的奔跑者](demo/海上日出时候的奔跑者.png) 100 | 101 | ``` 102 | 诺亚方舟在世界末日起航 科幻插画 103 | ``` 104 | 105 | ![诺亚方舟在世界末日起航 科幻插画](demo/诺亚方舟在世界末日起航%20科幻插画.png) 106 | 107 | ``` 108 | 时空 黑洞 辐射 109 | ``` 110 | 111 | ![时空 黑洞 辐射](demo/时空%20黑洞%20辐射.png) 112 | 113 | ``` 114 | 乡村 田野 屏保 115 | ``` 116 | 117 | ![乡村 田野 屏保](demo/乡村%20田野%20屏保.png) 118 | 119 | ``` 120 | 来自深渊 风景 绘画 写实风格 121 | ``` 122 | 123 | ![来自深渊 风景 绘画 写实风格](demo/来自深渊%20风景%20绘画%20写实风格.png) 124 | -------------------------------------------------------------------------------- /vision/wukong-huahua/configs/clip-vit-l-14-zh/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "_name_or_path": "clip-vit-large-patch14/", 3 | "architectures": [ 4 | "CLIPModel" 5 | ], 6 | "initializer_factor": 1.0, 7 | "logit_scale_init_value": 2.6592, 8 | "model_type": "clip", 9 | "projection_dim": 768, 10 | "text_config": { 11 | "_name_or_path": "", 12 | "add_cross_attention": false, 13 | "architectures": null, 14 | "attention_dropout": 0.0, 15 | "bad_words_ids": null, 16 | "bos_token_id": 0, 17 | "chunk_size_feed_forward": 0, 18 | "cross_attention_hidden_size": null, 19 | "decoder_start_token_id": null, 20 | "diversity_penalty": 0.0, 21 | "do_sample": false, 22 | "dropout": 0.0, 23 | "early_stopping": false, 24 | "encoder_no_repeat_ngram_size": 0, 25 | "eos_token_id": 2, 26 | "finetuning_task": null, 27 | "forced_bos_token_id": null, 28 | "forced_eos_token_id": null, 29 | "hidden_act": "quick_gelu", 30 | "hidden_size": 768, 31 | "id2label": { 32 | "0": "LABEL_0", 33 | "1": "LABEL_1" 34 | }, 35 | "initializer_factor": 1.0, 36 | "initializer_range": 0.02, 37 | "intermediate_size": 3072, 38 | "is_decoder": false, 39 | "is_encoder_decoder": false, 40 | "label2id": { 41 | "LABEL_0": 0, 42 | "LABEL_1": 1 43 | }, 44 | "layer_norm_eps": 1e-05, 45 | "length_penalty": 1.0, 46 | "max_length": 20, 47 | "max_position_embeddings": 77, 48 | "min_length": 0, 49 | "model_type": "clip_text_model", 50 | "no_repeat_ngram_size": 0, 51 | "num_attention_heads": 12, 52 | "num_beam_groups": 1, 53 | "num_beams": 1, 54 | "num_hidden_layers": 12, 55 | "num_return_sequences": 1, 56 | "output_attentions": false, 57 | "output_hidden_states": false, 58 | "output_scores": false, 59 | "pad_token_id": 1, 60 | "prefix": null, 61 | "problem_type": null, 62 | "projection_dim" : 768, 63 | "pruned_heads": {}, 64 | "remove_invalid_values": false, 65 | "repetition_penalty": 1.0, 66 | "return_dict": true, 67 | "return_dict_in_generate": false, 68 | "sep_token_id": null, 69 | "task_specific_params": null, 70 | "temperature": 1.0, 71 | "tie_encoder_decoder": false, 72 | "tie_word_embeddings": true, 73 | "tokenizer_class": null, 74 | "top_k": 50, 75 | "top_p": 1.0, 76 |
"torch_dtype": null, 77 | "torchscript": false, 78 | "transformers_version": "4.16.0.dev0", 79 | "use_bfloat16": false, 80 | "vocab_size": 49408 81 | }, 82 | "text_config_dict": { 83 | "hidden_size": 768, 84 | "intermediate_size": 3072, 85 | "num_attention_heads": 12, 86 | "num_hidden_layers": 12, 87 | "projection_dim": 768 88 | }, 89 | "torch_dtype": "float32", 90 | "transformers_version": null, 91 | "vision_config": { 92 | "_name_or_path": "", 93 | "add_cross_attention": false, 94 | "architectures": null, 95 | "attention_dropout": 0.0, 96 | "bad_words_ids": null, 97 | "bos_token_id": null, 98 | "chunk_size_feed_forward": 0, 99 | "cross_attention_hidden_size": null, 100 | "decoder_start_token_id": null, 101 | "diversity_penalty": 0.0, 102 | "do_sample": false, 103 | "dropout": 0.0, 104 | "early_stopping": false, 105 | "encoder_no_repeat_ngram_size": 0, 106 | "eos_token_id": null, 107 | "finetuning_task": null, 108 | "forced_bos_token_id": null, 109 | "forced_eos_token_id": null, 110 | "hidden_act": "quick_gelu", 111 | "hidden_size": 1024, 112 | "id2label": { 113 | "0": "LABEL_0", 114 | "1": "LABEL_1" 115 | }, 116 | "image_size": 224, 117 | "initializer_factor": 1.0, 118 | "initializer_range": 0.02, 119 | "intermediate_size": 4096, 120 | "is_decoder": false, 121 | "is_encoder_decoder": false, 122 | "label2id": { 123 | "LABEL_0": 0, 124 | "LABEL_1": 1 125 | }, 126 | "layer_norm_eps": 1e-05, 127 | "length_penalty": 1.0, 128 | "max_length": 20, 129 | "min_length": 0, 130 | "model_type": "clip_vision_model", 131 | "no_repeat_ngram_size": 0, 132 | "num_attention_heads": 16, 133 | "num_beam_groups": 1, 134 | "num_beams": 1, 135 | "num_hidden_layers": 24, 136 | "num_return_sequences": 1, 137 | "output_attentions": false, 138 | "output_hidden_states": false, 139 | "output_scores": false, 140 | "pad_token_id": null, 141 | "patch_size": 14, 142 | "prefix": null, 143 | "problem_type": null, 144 | "projection_dim" : 768, 145 | "pruned_heads": {}, 146 | "remove_invalid_values": false, 147 | "repetition_penalty": 1.0, 148 | "return_dict": true, 149 | "return_dict_in_generate": false, 150 | "sep_token_id": null, 151 | "task_specific_params": null, 152 | "temperature": 1.0, 153 | "tie_encoder_decoder": false, 154 | "tie_word_embeddings": true, 155 | "tokenizer_class": null, 156 | "top_k": 50, 157 | "top_p": 1.0, 158 | "torch_dtype": null, 159 | "torchscript": false, 160 | "transformers_version": "4.16.0.dev0", 161 | "use_bfloat16": false 162 | }, 163 | "vision_config_dict": { 164 | "hidden_size": 1024, 165 | "intermediate_size": 4096, 166 | "num_attention_heads": 16, 167 | "num_hidden_layers": 24, 168 | "patch_size": 14, 169 | "projection_dim": 768 170 | } 171 | } 172 | -------------------------------------------------------------------------------- /vision/wukong-huahua/configs/train_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_config": "configs/v1-train-chinese.yaml", 3 | "pretrained_model_path": "models/", 4 | "pretrained_model_file":"wukong-huahua-ms.ckpt", 5 | "data_path": "/secHome/FFHQ", 6 | "train_batch_size": 3, 7 | "gradient_accumulation_steps": 1, 8 | "optim": "adamw", 9 | "patch_size":32, 10 | "epochs": 20, 11 | "betas": [ 12 | 0.9, 13 | 0.98 14 | ], 15 | "dropout": 0.1, 16 | "weight_decay": 0.01, 17 | "warmup_steps": 1000, 18 | "seed": 3407, 19 | "image_size": 512, 20 | "image_filter_size": 256, 21 | "random_crop": false, 22 | "filter_small_size": true, 23 | "start_learning_rate": 1e-5, 24 | "end_learning_rate": 1e-7, 25 | 
"decay_steps": 0, 26 | "save_checkpoint_steps": 10000 27 | } 28 | -------------------------------------------------------------------------------- /vision/wukong-huahua/configs/train_db_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_config": "configs/v1-train-db-chinese.yaml", 3 | "pretrained_model_path": "models/", 4 | "pretrained_model_file":"wukong-huahua-ms.ckpt", 5 | "data_path": "/secHome/FFHQ", 6 | "train_data_path": "dataset/train_cat", 7 | "reg_data_path": "dataset/reg_cat", 8 | "train_data_repeats": 100, 9 | "class_word": "猫", 10 | "token": "α", 11 | "train_batch_size": 1, 12 | "gradient_accumulation_steps": 1, 13 | "optim": "adamw", 14 | "patch_size":32, 15 | "epochs": 5, 16 | "betas": [ 17 | 0.9, 18 | 0.98 19 | ], 20 | "dropout": 0.1, 21 | "weight_decay": 0.01, 22 | "warmup_steps": 100, 23 | "seed": 3407, 24 | "image_size": 512, 25 | "image_filter_size": 256, 26 | "random_crop": false, 27 | "filter_small_size": true, 28 | "start_learning_rate": 1e-6, 29 | "end_learning_rate": 1e-7, 30 | "decay_steps": 0, 31 | "save_checkpoint_steps": 1000 32 | } 33 | -------------------------------------------------------------------------------- /vision/wukong-huahua/configs/v1-inference-chinese-lora.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | base_learning_rate: 1.0e-04 3 | target: ldm.models.diffusion.ddpm.LatentDiffusion 4 | params: 5 | linear_start: 0.00085 6 | linear_end: 0.0120 7 | num_timesteps_cond: 1 8 | log_every_t: 200 9 | timesteps: 1000 10 | first_stage_key: "jpg" 11 | cond_stage_key: "txt" 12 | image_size: 64 13 | channels: 4 14 | cond_stage_trainable: false # Note: different from the one we trained before 15 | conditioning_key: crossattn 16 | monitor: val/loss_simple_ema 17 | scale_factor: 0.18215 18 | use_ema: False 19 | use_fp16: True 20 | 21 | unet_config: 22 | target: ldm.modules.diffusionmodules.openaimodel.UNetModel 23 | params: 24 | image_size: 32 # unused 25 | in_channels: 4 26 | out_channels: 4 27 | model_channels: 320 28 | attention_resolutions: [ 4, 2, 1 ] 29 | num_res_blocks: 2 30 | channel_mult: [ 1, 2, 4, 4 ] 31 | num_heads: 8 32 | use_spatial_transformer: True 33 | transformer_depth: 1 34 | context_dim: 768 35 | use_checkpoint: True 36 | legacy: False 37 | use_fp16: True 38 | enable_lora: True 39 | lora_rank: 4 40 | lora_alpha: 4 41 | 42 | first_stage_config: 43 | target: ldm.models.autoencoder.AutoencoderKL 44 | params: 45 | embed_dim: 4 46 | monitor: val/rec_loss 47 | use_fp16: True 48 | ddconfig: 49 | double_z: true 50 | z_channels: 4 51 | resolution: 256 52 | in_channels: 3 53 | out_ch: 3 54 | ch: 128 55 | ch_mult: 56 | - 1 57 | - 2 58 | - 4 59 | - 4 60 | num_res_blocks: 2 61 | attn_resolutions: [] 62 | dropout: 0.0 63 | 64 | cond_stage_config: 65 | target: ldm.modules.encoders.modules.FrozenCLIPEmbedder_ZH 66 | params: 67 | use_fp16: True 68 | -------------------------------------------------------------------------------- /vision/wukong-huahua/configs/v1-inference-chinese.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | base_learning_rate: 1.0e-04 3 | target: ldm.models.diffusion.ddpm.LatentDiffusion 4 | params: 5 | linear_start: 0.00085 6 | linear_end: 0.0120 7 | num_timesteps_cond: 1 8 | log_every_t: 200 9 | timesteps: 1000 10 | first_stage_key: "jpg" 11 | cond_stage_key: "txt" 12 | image_size: 64 13 | channels: 4 14 | cond_stage_trainable: false # Note: different from the one we 
trained before 15 | conditioning_key: crossattn 16 | monitor: val/loss_simple_ema 17 | scale_factor: 0.18215 18 | use_ema: False 19 | use_fp16: True 20 | 21 | unet_config: 22 | target: ldm.modules.diffusionmodules.openaimodel.UNetModel 23 | params: 24 | image_size: 32 # unused 25 | in_channels: 4 26 | out_channels: 4 27 | model_channels: 320 28 | attention_resolutions: [ 4, 2, 1 ] 29 | num_res_blocks: 2 30 | channel_mult: [ 1, 2, 4, 4 ] 31 | num_heads: 8 32 | use_spatial_transformer: True 33 | transformer_depth: 1 34 | context_dim: 768 35 | use_checkpoint: True 36 | legacy: False 37 | use_fp16: True 38 | 39 | first_stage_config: 40 | target: ldm.models.autoencoder.AutoencoderKL 41 | params: 42 | embed_dim: 4 43 | monitor: val/rec_loss 44 | use_fp16: True 45 | ddconfig: 46 | double_z: true 47 | z_channels: 4 48 | resolution: 256 49 | in_channels: 3 50 | out_ch: 3 51 | ch: 128 52 | ch_mult: 53 | - 1 54 | - 2 55 | - 4 56 | - 4 57 | num_res_blocks: 2 58 | attn_resolutions: [] 59 | dropout: 0.0 60 | 61 | cond_stage_config: 62 | target: ldm.modules.encoders.modules.FrozenCLIPEmbedder_ZH 63 | params: 64 | use_fp16: True 65 | -------------------------------------------------------------------------------- /vision/wukong-huahua/configs/v1-train-chinese-lora.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | base_learning_rate: 1.0e-05 3 | target: ldm.models.diffusion.ddpm.LatentDiffusion 4 | params: 5 | linear_start: 0.00085 6 | linear_end: 0.0120 7 | num_timesteps_cond: 1 8 | log_every_t: 200 9 | timesteps: 1000 10 | first_stage_key: "image" 11 | cond_stage_key: "caption" 12 | image_size: 64 13 | channels: 4 14 | conditioning_key: crossattn 15 | monitor: val/loss_simple_ema 16 | scale_factor: 0.18215 17 | use_ema: False 18 | use_fp16: True 19 | 20 | unet_config: 21 | target: ldm.modules.diffusionmodules.openaimodel.UNetModel 22 | params: 23 | image_size: 32 # unused 24 | in_channels: 4 25 | out_channels: 4 26 | model_channels: 320 27 | attention_resolutions: [ 4, 2, 1 ] 28 | num_res_blocks: 2 29 | channel_mult: [ 1, 2, 4, 4 ] 30 | num_heads: 8 31 | use_spatial_transformer: True 32 | transformer_depth: 1 33 | context_dim: 768 34 | use_checkpoint: True 35 | legacy: False 36 | use_fp16: True 37 | dropout: 0.1 38 | enable_lora: True 39 | lora_rank: 4 40 | lora_alpha: 4 41 | 42 | first_stage_config: 43 | target: ldm.models.autoencoder.AutoencoderKL 44 | params: 45 | embed_dim: 4 46 | monitor: val/rec_loss 47 | use_fp16: True 48 | ddconfig: 49 | double_z: true 50 | z_channels: 4 51 | resolution: 256 52 | in_channels: 3 53 | out_ch: 3 54 | ch: 128 55 | ch_mult: 56 | - 1 57 | - 2 58 | - 4 59 | - 4 60 | num_res_blocks: 2 61 | attn_resolutions: [] 62 | 63 | cond_stage_config: 64 | target: ldm.modules.encoders.modules.FrozenCLIPEmbedder_ZH 65 | params: 66 | use_fp16: True 67 | -------------------------------------------------------------------------------- /vision/wukong-huahua/configs/v1-train-chinese.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | base_learning_rate: 1.0e-05 3 | target: ldm.models.diffusion.ddpm.LatentDiffusion 4 | params: 5 | linear_start: 0.00085 6 | linear_end: 0.0120 7 | num_timesteps_cond: 1 8 | log_every_t: 200 9 | timesteps: 1000 10 | first_stage_key: "image" 11 | cond_stage_key: "caption" 12 | image_size: 64 13 | channels: 4 14 | conditioning_key: crossattn 15 | monitor: val/loss_simple_ema 16 | scale_factor: 0.18215 17 | use_ema: False 18 | use_fp16: True 19 | 20 | unet_config: 
21 | target: ldm.modules.diffusionmodules.openaimodel.UNetModel 22 | params: 23 | image_size: 32 # unused 24 | in_channels: 4 25 | out_channels: 4 26 | model_channels: 320 27 | attention_resolutions: [ 4, 2, 1 ] 28 | num_res_blocks: 2 29 | channel_mult: [ 1, 2, 4, 4 ] 30 | num_heads: 8 31 | use_spatial_transformer: True 32 | transformer_depth: 1 33 | context_dim: 768 34 | use_checkpoint: True 35 | legacy: False 36 | use_fp16: True 37 | dropout: 0.1 38 | 39 | first_stage_config: 40 | target: ldm.models.autoencoder.AutoencoderKL 41 | params: 42 | embed_dim: 4 43 | monitor: val/rec_loss 44 | use_fp16: True 45 | ddconfig: 46 | double_z: true 47 | z_channels: 4 48 | resolution: 256 49 | in_channels: 3 50 | out_ch: 3 51 | ch: 128 52 | ch_mult: 53 | - 1 54 | - 2 55 | - 4 56 | - 4 57 | num_res_blocks: 2 58 | attn_resolutions: [] 59 | 60 | cond_stage_config: 61 | target: ldm.modules.encoders.modules.FrozenCLIPEmbedder_ZH 62 | params: 63 | use_fp16: True 64 | -------------------------------------------------------------------------------- /vision/wukong-huahua/configs/v1-train-db-chinese.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | base_learning_rate: 1.0e-05 3 | target: ldm.models.diffusion.ddpm.LatentDiffusionDB 4 | params: 5 | linear_start: 0.00085 6 | linear_end: 0.0120 7 | num_timesteps_cond: 1 8 | log_every_t: 200 9 | timesteps: 1000 10 | first_stage_key: "image" 11 | cond_stage_key: "caption" 12 | image_size: 64 13 | channels: 4 14 | conditioning_key: crossattn 15 | monitor: val/loss_simple_ema 16 | scale_factor: 0.18215 17 | use_ema: False 18 | use_fp16: True 19 | reg_weight: 0.5 20 | 21 | unet_config: 22 | target: ldm.modules.diffusionmodules.openaimodel.UNetModel 23 | params: 24 | image_size: 32 # unused 25 | in_channels: 4 26 | out_channels: 4 27 | model_channels: 320 28 | attention_resolutions: [ 4, 2, 1 ] 29 | num_res_blocks: 2 30 | channel_mult: [ 1, 2, 4, 4 ] 31 | num_heads: 8 32 | use_spatial_transformer: True 33 | transformer_depth: 1 34 | context_dim: 768 35 | use_checkpoint: True 36 | legacy: False 37 | use_fp16: True 38 | dropout: 0.1 39 | 40 | first_stage_config: 41 | target: ldm.models.autoencoder.AutoencoderKL 42 | params: 43 | embed_dim: 4 44 | monitor: val/rec_loss 45 | use_fp16: True 46 | ddconfig: 47 | double_z: true 48 | z_channels: 4 49 | resolution: 256 50 | in_channels: 3 51 | out_ch: 3 52 | ch: 128 53 | ch_mult: 54 | - 1 55 | - 2 56 | - 4 57 | - 4 58 | num_res_blocks: 2 59 | attn_resolutions: [] 60 | 61 | cond_stage_config: 62 | target: ldm.modules.encoders.modules.FrozenCLIPEmbedder_ZH 63 | params: 64 | use_fp16: True 65 | -------------------------------------------------------------------------------- /vision/wukong-huahua/configs/wukong-huahua_inpaint_inference.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | target: ldm.models.diffusion.ddpm.LatentInpaintDiffusion 3 | params: 4 | linear_start: 0.00085 5 | linear_end: 0.0120 6 | num_timesteps_cond: 1 7 | log_every_t: 200 8 | timesteps: 1000 9 | first_stage_key: "image" 10 | cond_stage_key: "caption" 11 | image_size: 64 12 | channels: 4 13 | cond_stage_trainable: false # Note: different from the one we trained before 14 | conditioning_key: hybrid # important 15 | monitor: val/loss_simple_ema 16 | scale_factor: 0.18215 17 | finetune_keys: null 18 | use_ema: false 19 | 20 | unet_config: 21 | target: ldm.modules.diffusionmodules.openaimodel.UNetModel 22 | params: 23 | image_size: 64 # unused 24 | in_channels: 9 
# 4 data + 4 downscaled image + 1 mask 25 | out_channels: 4 26 | model_channels: 320 27 | attention_resolutions: [ 4, 2, 1 ] 28 | num_res_blocks: 2 29 | channel_mult: [ 1, 2, 4, 4 ] 30 | num_heads: 8 31 | use_spatial_transformer: true 32 | transformer_depth: 1 33 | context_dim: 768 34 | use_checkpoint: true 35 | legacy: false 36 | use_fp16: True 37 | 38 | first_stage_config: 39 | target: ldm.models.autoencoder.AutoencoderKL 40 | params: 41 | embed_dim: 4 42 | monitor: val/rec_loss 43 | use_fp16: True 44 | ddconfig: 45 | double_z: true 46 | z_channels: 4 47 | resolution: 512 48 | in_channels: 3 49 | out_ch: 3 50 | ch: 128 51 | ch_mult: 52 | - 1 53 | - 2 54 | - 4 55 | - 4 56 | num_res_blocks: 2 57 | attn_resolutions: [] 58 | dropout: 0.0 59 | 60 | cond_stage_config: 61 | target: ldm.modules.encoders.modules.FrozenCLIPEmbedder_ZH 62 | params: 63 | use_fp16: True 64 | -------------------------------------------------------------------------------- /vision/wukong-huahua/demo/inpaint/overture-creations-5sI6fQgYIuo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mindspore-lab/minddiffusion/5779fd04c17f60f277fa88e635287fcc1dd4ecc5/vision/wukong-huahua/demo/inpaint/overture-creations-5sI6fQgYIuo.png -------------------------------------------------------------------------------- /vision/wukong-huahua/demo/inpaint/overture-creations-5sI6fQgYIuo_mask.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mindspore-lab/minddiffusion/5779fd04c17f60f277fa88e635287fcc1dd4ecc5/vision/wukong-huahua/demo/inpaint/overture-creations-5sI6fQgYIuo_mask.png -------------------------------------------------------------------------------- /vision/wukong-huahua/demo/inpaint/一只红色的狐狸坐在长椅上.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mindspore-lab/minddiffusion/5779fd04c17f60f277fa88e635287fcc1dd4ecc5/vision/wukong-huahua/demo/inpaint/一只红色的狐狸坐在长椅上.png -------------------------------------------------------------------------------- /vision/wukong-huahua/demo/个性化生成效果-猫.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mindspore-lab/minddiffusion/5779fd04c17f60f277fa88e635287fcc1dd4ecc5/vision/wukong-huahua/demo/个性化生成效果-猫.jpg -------------------------------------------------------------------------------- /vision/wukong-huahua/demo/个性化训练数据-猫.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mindspore-lab/minddiffusion/5779fd04c17f60f277fa88e635287fcc1dd4ecc5/vision/wukong-huahua/demo/个性化训练数据-猫.jpg -------------------------------------------------------------------------------- /vision/wukong-huahua/demo/乡村 田野 屏保.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mindspore-lab/minddiffusion/5779fd04c17f60f277fa88e635287fcc1dd4ecc5/vision/wukong-huahua/demo/乡村 田野 屏保.png -------------------------------------------------------------------------------- /vision/wukong-huahua/demo/城市夜景 赛博朋克 格雷格·鲁特科夫斯基.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mindspore-lab/minddiffusion/5779fd04c17f60f277fa88e635287fcc1dd4ecc5/vision/wukong-huahua/demo/城市夜景 赛博朋克 格雷格·鲁特科夫斯基.png -------------------------------------------------------------------------------- 
/vision/wukong-huahua/demo/效果图合集.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mindspore-lab/minddiffusion/5779fd04c17f60f277fa88e635287fcc1dd4ecc5/vision/wukong-huahua/demo/效果图合集.jpg -------------------------------------------------------------------------------- /vision/wukong-huahua/demo/时空 黑洞 辐射.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mindspore-lab/minddiffusion/5779fd04c17f60f277fa88e635287fcc1dd4ecc5/vision/wukong-huahua/demo/时空 黑洞 辐射.png -------------------------------------------------------------------------------- /vision/wukong-huahua/demo/来自深渊 风景 绘画 写实风格.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mindspore-lab/minddiffusion/5779fd04c17f60f277fa88e635287fcc1dd4ecc5/vision/wukong-huahua/demo/来自深渊 风景 绘画 写实风格.png -------------------------------------------------------------------------------- /vision/wukong-huahua/demo/海上日出时候的奔跑者.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mindspore-lab/minddiffusion/5779fd04c17f60f277fa88e635287fcc1dd4ecc5/vision/wukong-huahua/demo/海上日出时候的奔跑者.png -------------------------------------------------------------------------------- /vision/wukong-huahua/demo/莫奈 撑阳伞的女人 月亮 梦幻.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mindspore-lab/minddiffusion/5779fd04c17f60f277fa88e635287fcc1dd4ecc5/vision/wukong-huahua/demo/莫奈 撑阳伞的女人 月亮 梦幻.png -------------------------------------------------------------------------------- /vision/wukong-huahua/demo/诺亚方舟在世界末日起航 科幻插画.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mindspore-lab/minddiffusion/5779fd04c17f60f277fa88e635287fcc1dd4ecc5/vision/wukong-huahua/demo/诺亚方舟在世界末日起航 科幻插画.png -------------------------------------------------------------------------------- /vision/wukong-huahua/ldm/data/t2i_collate.py: -------------------------------------------------------------------------------- 1 | from toolz.sandbox import unzip 2 | 3 | 4 | data_column = [ 5 | 'img_feat', 6 | 'txt_tokens' 7 | ] 8 | 9 | 10 | def t2i_collate(inputs): 11 | """ 12 | Return: 13 | :img_feat (batch_size, height, width, 3) 14 | :txt_tokens (n, max_txt_len) 15 | """ 16 | img_feat, txt_tokens = map(list, unzip(inputs)) 17 | batch = { 18 | 'img_feat': img_feat, 19 | 'txt_tokens': txt_tokens, 20 | } 21 | return batch 22 | 23 | 24 | data_column_db = [ 25 | 'train_img_feat', 26 | 'train_txt_tokens', 27 | 'reg_img_feat', 28 | 'reg_txt_tokens' 29 | ] 30 | 31 | 32 | def t2i_collate_db(inputs): 33 | """ 34 | Return: 35 | :train_img_feat (batch_size, height, width, 3) 36 | :train_txt_tokens (n, max_txt_len) 37 | :reg_img_feat (batch_size, height, width, 3) 38 | :reg_txt_tokens (n, max_txt_len) 39 | """ 40 | train_img_feat, train_txt_tokens, reg_img_feat, reg_txt_tokens = map(list, unzip(inputs)) 41 | batch = { 42 | 'train_img_feat': train_img_feat, 43 | 'train_txt_tokens': train_txt_tokens, 44 | 'reg_img_feat': reg_img_feat, 45 | 'reg_txt_tokens': reg_txt_tokens, 46 | } 47 | return batch -------------------------------------------------------------------------------- /vision/wukong-huahua/ldm/models/autoencoder.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022
Huawei Technologies Co., Ltd 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================ 15 | import mindspore as ms 16 | import mindspore.nn as nn 17 | import mindspore.ops as P 18 | 19 | from ldm.modules.diffusionmodules.model import Encoder, Decoder 20 | from ldm.modules.distributions.distributions import DiagonalGaussianDistribution 21 | 22 | class AutoencoderKL(nn.Cell): 23 | def __init__(self, 24 | ddconfig, 25 | embed_dim, 26 | ckpt_path=None, 27 | ignore_keys=[], 28 | image_key="image", 29 | colorize_nlabels=None, 30 | monitor=None, 31 | use_fp16=False 32 | ): 33 | super().__init__() 34 | self.dtype = ms.float16 if use_fp16 else ms.float32 35 | self.image_key = image_key 36 | self.encoder = Encoder(dtype=self.dtype, **ddconfig) 37 | self.decoder = Decoder(dtype=self.dtype, **ddconfig) 38 | assert ddconfig["double_z"] 39 | self.quant_conv = nn.Conv2d(2*ddconfig["z_channels"], 2*embed_dim, 1, pad_mode="valid", has_bias=True).to_float(self.dtype) 40 | self.post_quant_conv = nn.Conv2d(embed_dim, ddconfig["z_channels"], 1, pad_mode="valid", has_bias=True).to_float(self.dtype) 41 | self.embed_dim = embed_dim 42 | if colorize_nlabels is not None: 43 | assert type(colorize_nlabels)==int 44 | self.register_buffer("colorize", ms.ops.standard_normal((3, colorize_nlabels, 1, 1))) 45 | if monitor is not None: 46 | self.monitor = monitor 47 | if ckpt_path is not None: 48 | self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys) 49 | 50 | self.split = P.Split(axis=1, output_num=2) 51 | self.exp = P.Exp() 52 | self.stdnormal = P.StandardNormal() 53 | 54 | def init_from_ckpt(self, path, ignore_keys=list()): 55 | sd = ms.load_checkpoint(path)["state_dict"] 56 | keys = list(sd.keys()) 57 | for k in keys: 58 | for ik in ignore_keys: 59 | if k.startswith(ik): 60 | print("Deleting key {} from state_dict.".format(k)) 61 | del sd[k] 62 | ms.load_param_into_net(self, sd, strict_load=False) 63 | print(f"Restored from {path}") 64 | 65 | def decode(self, z): 66 | z = self.post_quant_conv(z) 67 | dec = self.decoder(z) 68 | return dec 69 | 70 | def encode(self, x): 71 | h = self.encoder(x) 72 | moments = self.quant_conv(h) 73 | mean, logvar = self.split(moments) 74 | logvar = P.clip_by_value(logvar, -30.0, 20.0) 75 | std = self.exp(0.5 * logvar) 76 | x = mean + std * self.stdnormal(mean.shape) 77 | return x 78 | -------------------------------------------------------------------------------- /vision/wukong-huahua/ldm/models/clip_zh/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mindspore-lab/minddiffusion/5779fd04c17f60f277fa88e635287fcc1dd4ecc5/vision/wukong-huahua/ldm/models/clip_zh/bpe_simple_vocab_16e6.txt.gz -------------------------------------------------------------------------------- /vision/wukong-huahua/ldm/models/clip_zh/utils.py:
-------------------------------------------------------------------------------- 1 | # Copyright 2022 Huawei Technologies Co., Ltd 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================ 15 | import os 16 | import unicodedata 17 | 18 | def abs_root_dir(cfg, data_root=None): 19 | def get_abs_path(data_dir, data_root): 20 | if os.path.isabs(data_dir): 21 | return data_dir 22 | return os.path.join(data_root, data_dir) 23 | 24 | if isinstance(cfg, dict): 25 | for key, value in cfg.items(): 26 | if key == 'root_dir': 27 | cfg[key] = get_abs_path(value, data_root) 28 | break 29 | abs_root_dir(value, data_root=data_root) 30 | elif isinstance(cfg, list): 31 | for item in cfg: 32 | abs_root_dir(item, data_root=data_root) 33 | else: 34 | return 35 | 36 | 37 | def is_control(char): 38 | """Checks whether `char` is a control character.""" 39 | # These are technically control characters but we count them as whitespace 40 | # characters. 41 | if char == "\t" or char == "\n" or char == "\r": 42 | return False 43 | cat = unicodedata.category(char) 44 | if cat.startswith("C"): 45 | return True 46 | return False 47 | 48 | 49 | def is_whitespace(char): 50 | """Checks whether `char` is a whitespace character.""" 51 | # \t, \n, and \r are technically control characters but we treat them 52 | # as whitespace since they are generally considered as such. 53 | if char == " " or char == "\t" or char == "\n" or char == "\r": 54 | return True 55 | cat = unicodedata.category(char) 56 | if cat == "Zs": 57 | return True 58 | return False 59 | 60 | 61 | def is_chinese_char(cp): 62 | """Checks whether `cp` is the codepoint of a CJK character.""" 63 | # This defines a "chinese character" as anything in the CJK Unicode block: 64 | # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) 65 | # 66 | # Note that the CJK Unicode block is NOT all Japanese and Korean characters, 67 | # despite its name. The modern Korean Hangul alphabet is a different block, 68 | # as is Japanese Hiragana and Katakana. Those alphabets are used to write 69 | # space-separated words, so they are not treated specially and are handled 70 | # like all of the other languages. 71 | if ( 72 | (0x4E00 <= cp <= 0x9FFF) 73 | or (0x3400 <= cp <= 0x4DBF) 74 | or (0x20000 <= cp <= 0x2A6DF) 75 | or (0x2A700 <= cp <= 0x2B73F) 76 | or (0x2B740 <= cp <= 0x2B81F) 77 | or (0x2B820 <= cp <= 0x2CEAF) 78 | or (0xF900 <= cp <= 0xFAFF) 79 | or (0x2F800 <= cp <= 0x2FA1F) 80 | ): 81 | return True 82 | 83 | return False 84 | 85 | 86 | def is_punctuation(char): 87 | """Checks whether `char` is a punctuation character.""" 88 | cp = ord(char) 89 | # We treat all non-letter/number ASCII as punctuation. 90 | # Characters such as "^", "$", and "`" are not in the Unicode 91 | # Punctuation class but we treat them as punctuation anyways, for 92 | # consistency.
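# For example, ord("^") is 94, which falls in the 91-96 range below, so "^" is treated as punctuation here even though its Unicode category is Sk rather than a P* class.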
93 | if (33 <= cp <= 47) or (58 <= cp <= 64) \ 94 | or (91 <= cp <= 96) or (123 <= cp <= 126): 95 | return True 96 | cat = unicodedata.category(char) 97 | if cat.startswith("P"): 98 | return True 99 | return False 100 | 101 | 102 | def strip_accents(text): 103 | """Strips accents from a piece of text.""" 104 | text = unicodedata.normalize("NFD", text) 105 | output = [] 106 | for char in text: 107 | cat = unicodedata.category(char) 108 | if cat == "Mn": 109 | continue 110 | output.append(char) 111 | return "".join(output) 112 | -------------------------------------------------------------------------------- /vision/wukong-huahua/ldm/models/diffusion/dpm_solver/__init__.py: -------------------------------------------------------------------------------- 1 | from .sampler import DPMSolverSampler -------------------------------------------------------------------------------- /vision/wukong-huahua/ldm/models/diffusion/dpm_solver/sampler.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Huawei Technologies Co., Ltd 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================ 15 | """SAMPLING ONLY.""" 16 | 17 | import mindspore as ms 18 | from mindspore import ops 19 | from .dpm_solver import NoiseScheduleVP, model_wrapper, DPM_Solver 20 | 21 | 22 | class DPMSolverSampler(object): 23 | def __init__(self, model, **kwargs): 24 | super().__init__() 25 | self.model = model 26 | self.register_buffer('alphas_cumprod', model.alphas_cumprod) 27 | 28 | def register_buffer(self, name, attr): 29 | setattr(self, name, attr) 30 | 31 | def sample(self, 32 | S, 33 | batch_size, 34 | shape, 35 | conditioning=None, 36 | callback=None, 37 | normals_sequence=None, 38 | img_callback=None, 39 | quantize_x0=False, 40 | eta=0., 41 | mask=None, 42 | x0=None, 43 | temperature=1., 44 | noise_dropout=0., 45 | score_corrector=None, 46 | corrector_kwargs=None, 47 | verbose=True, 48 | x_T=None, 49 | log_every_t=100, 50 | unconditional_guidance_scale=1., 51 | unconditional_conditioning=None, 52 | # this has to come in the same format as the conditioning, # e.g. as encoded tokens, ... 
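# (for classifier-free guidance this is typically the text encoding of an empty prompt)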
53 | **kwargs 54 | ): 55 | if conditioning is not None: 56 | if isinstance(conditioning, dict): 57 | cbs = conditioning[list(conditioning.keys())[0]].shape[0] 58 | if cbs != batch_size: 59 | print(f"Warning: Got {cbs} conditionings but batch-size is {batch_size}") 60 | else: 61 | if conditioning.shape[0] != batch_size: 62 | print(f"Warning: Got {conditioning.shape[0]} conditionings but batch-size is {batch_size}") 63 | 64 | # sampling 65 | C, H, W = shape 66 | size = (batch_size, C, H, W) 67 | 68 | # print(f'Data shape for DPM-Solver sampling is {size}, sampling steps {S}') 69 | 70 | if x_T is None: 71 | img = ops.standard_normal(size) 72 | else: 73 | img = x_T 74 | 75 | ns = NoiseScheduleVP('discrete', alphas_cumprod=self.alphas_cumprod) 76 | model_fn = model_wrapper( 77 | lambda x, t, c: self.model.apply_model(x, t, 78 | c_concat=c if self.model.model.conditioning_key == 'concat' else None, 79 | c_crossattn=c if self.model.model.conditioning_key == 'crossattn' else None), 80 | ns, 81 | model_type="noise", 82 | guidance_type="classifier-free", 83 | condition=conditioning, 84 | unconditional_condition=unconditional_conditioning, 85 | guidance_scale=unconditional_guidance_scale, 86 | ) 87 | 88 | dpm_solver = DPM_Solver(model_fn, ns, predict_x0=True, thresholding=False) 89 | 90 | x = dpm_solver.sample(ops.Cast()(img, ms.float16), steps=S, skip_type="time_uniform", 91 | method="multistep", order=2, lower_order_final=True) 92 | 93 | return x, None -------------------------------------------------------------------------------- /vision/wukong-huahua/ldm/modules/distributions/distributions.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Huawei Technologies Co., Ltd 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================ 15 | import mindspore as ms 16 | import mindspore.ops as ops 17 | 18 | class DiagonalGaussianDistribution(object): 19 | def __init__(self, parameters, deterministic=False): 20 | 21 | self.mean, self.logvar = ops.Split(axis=1, output_num=2)(parameters) 22 | self.logvar = ops.clip_by_value(self.logvar, -30.0, 20.0) 23 | self.deterministic = deterministic 24 | self.std = ops.exp(0.5 * self.logvar) 25 | if self.deterministic: 26 | self.std = ops.zeros_like(self.mean) # deterministic mode: sample() returns the mean 27 | self.stdnormal = ops.StandardNormal() 28 | 29 | def sample(self): 30 | x = self.mean + self.std * self.stdnormal(self.mean.shape) 31 | return x -------------------------------------------------------------------------------- /vision/wukong-huahua/ldm/modules/encoders/modules.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 Huawei Technologies Co., Ltd 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================ 15 | import mindspore as ms 16 | import mindspore.nn as nn 17 | import mindspore.ops as ops 18 | from mindspore import Tensor 19 | from ldm.models.clip_zh.simple_tokenizer import WordpieceTokenizer 20 | from .text_encoder import TextEncoder 21 | 22 | 23 | class FrozenCLIPEmbedder_ZH(nn.Cell): 24 | def __init__(self, max_length=77, use_fp16=False): 25 | super(FrozenCLIPEmbedder_ZH, self).__init__() 26 | self.dtype = ms.float16 if use_fp16 else ms.float32 27 | self.max_length = max_length 28 | self.tokenizer = WordpieceTokenizer() 29 | self.transformer = TextEncoder(context_length=77, vocab_size=49408, output_dim=768, width=768, layers=12, heads=12, dtype=self.dtype) 30 | 31 | def tokenize(self, texts): 32 | SOT_TEXT = "[CLS]" 33 | EOT_TEXT = "[SEP]" 34 | CONTEXT_LEN = 77 35 | 36 | if isinstance(texts, str): 37 | texts = [texts] 38 | 39 | sot_token = self.tokenizer.encoder[SOT_TEXT] 40 | eot_token = self.tokenizer.encoder[EOT_TEXT] 41 | all_tokens = [[sot_token] + self.tokenizer.encode(text) + [eot_token] for text in texts] 42 | result = ops.Zeros()((len(all_tokens), CONTEXT_LEN), ms.int64) 43 | 44 | for i, tokens in enumerate(all_tokens): 45 | if len(tokens) > CONTEXT_LEN: 46 | tokens = tokens[:CONTEXT_LEN - 1] + [eot_token] 47 | 48 | result[i, : len(tokens)] = Tensor(tokens) 49 | 50 | return result 51 | 52 | def encode(self, text): 53 | batch_encoding = self.tokenize(text) 54 | outputs = self.transformer(batch_encoding) 55 | return outputs 56 | 57 | def construct(self, c): 58 | outputs = self.transformer(c) 59 | return outputs 60 | -------------------------------------------------------------------------------- /vision/wukong-huahua/ldm/modules/train/callback.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 Huawei Technologies Co., Ltd 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ============================================================================ 15 | 16 | import mindspore as ms 17 | 18 | 19 | class OverflowMonitor(ms.Callback): 20 | def step_end(self, run_context): 21 | cb_params = run_context.original_args() 22 | cur_epoch_num = cb_params.get("cur_epoch_num", 1) 23 | cur_step_in_epoch = (cb_params.cur_step_num - 1) % cb_params.batch_num + 1 24 | overflow = cb_params.net_outputs[1] 25 | if overflow: 26 | print(f"overflow detected in epoch {cur_epoch_num} step {cur_step_in_epoch}") 27 | return super().step_end(run_context) -------------------------------------------------------------------------------- /vision/wukong-huahua/ldm/modules/train/learningrate.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 Huawei Technologies Co., Ltd 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================ 15 | """ 16 | Utility functions for parallel training. 17 | This is an experimental interface that is subject to change and/or deletion. 18 | """ 19 | 20 | from mindspore.ops import operations as P 21 | import mindspore.common.dtype as mstype 22 | from mindspore.common.tensor import Tensor 23 | from mindspore.nn.learning_rate_schedule import LearningRateSchedule, PolynomialDecayLR, WarmUpLR, CosineDecayLR 24 | import numpy as np 25 | 26 | 27 | class LearningRate(LearningRateSchedule): 28 | """ 29 | Learning rate scheduler 30 | """ 31 | 32 | def __init__(self, 33 | start_learning_rate, 34 | end_learning_rate, 35 | warmup_steps, 36 | decay_steps, 37 | power=1.0, 38 | use_cosine=True): 39 | super(LearningRate, self).__init__() 40 | self.warmup_flag = False 41 | if warmup_steps > 0: 42 | self.warmup_flag = True 43 | self.warmup_lr = WarmUpLR(start_learning_rate, warmup_steps) 44 | self.decay_lr = PolynomialDecayLR(start_learning_rate, end_learning_rate, decay_steps, power) 45 | self.cosine_decay_lr = CosineDecayLR(end_learning_rate, start_learning_rate, decay_steps) 46 | self.warmup_steps = Tensor(np.array([warmup_steps]).astype(np.float32)) 47 | self.greater = P.Greater() 48 | self.one = Tensor(np.array([1.0]).astype(np.float32)) 49 | self.cast = P.Cast() 50 | self.use_cosine = use_cosine 51 | 52 | def construct(self, global_step): 53 | """Learning rate scheduler construct""" 54 | if not self.use_cosine: 55 | decay_lr = self.decay_lr(global_step) 56 | else: 57 | decay_lr = self.cosine_decay_lr(global_step) 58 | if self.warmup_flag: 59 | is_warmup = self.cast(self.greater(self.warmup_steps, global_step), mstype.float32) 60 | warmup_lr = self.warmup_lr(global_step) 61 | lr = (self.one - is_warmup) * decay_lr + is_warmup * warmup_lr 62 | else: 63 | lr = decay_lr 64 | return lr 65 | 66 | -------------------------------------------------------------------------------- /vision/wukong-huahua/ldm/modules/train/optim.py: -------------------------------------------------------------------------------- 1 | #
Copyright 2022 Huawei Technologies Co., Ltd 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================ 15 | """ 16 | Build the optimizer for MindSpore. 17 | """ 18 | from mindspore.nn.optim.adam import Adam, AdamWeightDecay 19 | 20 | 21 | def build_optimizer(model, opts, lr, enable_lora=False): 22 | """ 23 | Build the training optimizer with weight-decay parameter groups. 24 | 25 | :param model: network whose trainable parameters are optimized 26 | :param opts: options providing `optim` ('adam' or 'adamw') and `betas` 27 | :param lr: learning rate value or schedule 28 | :param enable_lora: when True, only LoRA parameters are trainable 29 | :return: optimizer 30 | """ 31 | 32 | decay_filter = lambda x: 'layernorm' not in x.name.lower() and "bias" not in x.name.lower() 33 | param_optimizer = model.trainable_params() 34 | decay_params = list(filter(decay_filter, param_optimizer)) 35 | other_params = list(filter(lambda x: not decay_filter(x), param_optimizer)) 36 | group_params = [{ 37 | 'params': decay_params, 38 | 'weight_decay': 1e-6 39 | }, { 40 | 'order_params': param_optimizer 41 | }] 42 | 43 | # With LoRA enabled, other_params is empty, so it does not need to be added to group_params 44 | if not enable_lora: 45 | group_params.append({ 46 | 'params': other_params, 47 | 'weight_decay': 0.0 48 | }) 49 | 50 | if opts.optim == 'adam': 51 | OptimCls = Adam 52 | elif opts.optim == 'adamw': 53 | OptimCls = AdamWeightDecay 54 | else: 55 | raise ValueError('invalid optimizer') 56 | optimizer = OptimCls(group_params, 57 | learning_rate=lr, beta1=opts.betas[0], beta2=opts.betas[1]) 58 | return optimizer 59 | -------------------------------------------------------------------------------- /vision/wukong-huahua/ldm/modules/train/parallel_config.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 Huawei Technologies Co., Ltd 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================ 15 | """Parallel configuration for training.""" 16 | 17 | import math 18 | 19 | import numpy as np 20 | import mindspore.common.dtype as mstype 21 | from mindspore.context import ParallelMode 22 | 23 | class ParallelConfig: 24 | r""" 25 | ParallelConfig for setting the global data parallel, model parallel and fusion group.
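For example, an 8-device data-parallel job could call ParallelConfig.set_global_parallel_config(dp=8, mp=1, stages=1) before constructing the network (illustrative values, not a repository default).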
26 | """ 27 | dp = 8 28 | mp = 1 29 | pipeline_stage = 1 30 | recompute = False 31 | optimizer_shard = False 32 | fusion_group = 1 33 | parallel_mode = ParallelMode.SEMI_AUTO_PARALLEL 34 | vocab_emb_dp = False 35 | ep = dp 36 | capacity_factor = 1.5 37 | expert_num = 32 38 | aux_loss_factor = 0.01 39 | 40 | @staticmethod 41 | def set_global_parallel_config(dp=1, 42 | mp=1, 43 | recompute=True, 44 | stages=1, 45 | optimizer_shard=True, 46 | fusion_group=4, 47 | parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL, 48 | vocab_emb_dp=True): 49 | r""" 50 | Set the global parallel configuration. 51 | 52 | Args: 53 | dp (int): The data parallel way. Default: 1 54 | mp (int): The model parallel way. Default: 1 55 | stages (int): The number of the pipeline stage. Should be a positive value. Default: 1. 56 | optimizer_shard (bool): Enable optimizer state sharding or not. Default: True. 57 | fusion_group (int): The fusion group size of the optimizer state sharding. Default: 4. 58 | recompute (bool): Enable recomputation of the transformer block or not. Default: True. 59 | parallel_mode (ParallelMode): Can be SEMI_AUTO_PARALLEL, DATA_PARALLEL or AUTO_PARALLEL. 60 | vocab_emb_dp (bool): Shard embedding in model parallel or data parallel. Default: True 61 | 62 | Supported Platforms: 63 | ``Ascend`` ``GPU`` 64 | 65 | Examples: 66 | >>> ParallelConfig.set_global_parallel_config(dp=1, mp=1) 67 | >>> ParallelConfig.set_global_parallel_config(stages=4) 68 | """ 69 | ParallelConfig.dp = dp 70 | ParallelConfig.mp = mp 71 | ParallelConfig.pipeline_stage = stages 72 | ParallelConfig.optimizer_shard = optimizer_shard 73 | ParallelConfig.fusion_group = fusion_group 74 | ParallelConfig.recompute = recompute 75 | ParallelConfig.parallel_mode = parallel_mode 76 | ParallelConfig.vocab_emb_dp = vocab_emb_dp 77 | -------------------------------------------------------------------------------- /vision/wukong-huahua/ldm/modules/train/tools.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Huawei Technologies Co., Ltd 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================ 15 | """ 16 | Copyright (c) Microsoft Corporation. 17 | Licensed under the MIT license.
18 | 19 | Misc utilities 20 | """ 21 | import json 22 | import os 23 | import sys 24 | import random 25 | import numpy as np 26 | import mindspore as ms 27 | 28 | class NoOp: 29 | """ useful for distributed training No-Ops """ 30 | 31 | def __getattr__(self, name): 32 | return self.noop 33 | 34 | def noop(self, *args, **kwargs): 35 | return 36 | 37 | 38 | def parse_with_config(args): 39 | """Parse With Config""" 40 | if args.train_config is not None: 41 | abs_path = os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), "../../..")) 42 | args.train_config = os.path.join(abs_path, args.train_config) 43 | config_args = json.load(open(args.train_config)) 44 | override_keys = {arg[2:].split('=')[0] for arg in sys.argv[1:] 45 | if arg.startswith('--')} 46 | for k, v in config_args.items(): 47 | if k not in override_keys: 48 | setattr(args, k, v) 49 | return args 50 | 51 | 52 | def set_random_seed(seed): 53 | """Set Random Seed""" 54 | print("random seed: ", seed) 55 | random.seed(seed) 56 | np.random.seed(seed) 57 | ms.set_seed(seed) 58 | 59 | class Struct: 60 | def __init__(self, dict_): 61 | self.__dict__.update(dict_) 62 | -------------------------------------------------------------------------------- /vision/wukong-huahua/ldm/util.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 Huawei Technologies Co., Ltd 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ============================================================================ 15 | import importlib 16 | from inspect import isfunction 17 | import mindspore.ops as ops 18 | 19 | 20 | def exists(x): 21 | return x is not None 22 | 23 | 24 | def default(val, d): 25 | if exists(val): 26 | return val 27 | return d() if isfunction(d) else d 28 | 29 | 30 | def count_params(model, verbose=False): 31 | total_params = sum(p.size for p in model.get_parameters()) # nn.Cell exposes get_parameters(); Parameter.size is the element count 32 | if verbose: 33 | print(f"{model.__class__.__name__} has {total_params * 1.e-6:.2f} M params.") 34 | return total_params 35 | 36 | 37 | def instantiate_from_config(config): 38 | if "target" not in config: 39 | if config == '__is_first_stage__': 40 | return None 41 | elif config == "__is_unconditional__": 42 | return None 43 | raise KeyError("Expected key `target` to instantiate.") 44 | return get_obj_from_str(config["target"])(**config.get("params", dict())) 45 | 46 | 47 | def get_obj_from_str(string, reload=False): 48 | module, cls = string.rsplit(".", 1) 49 | if reload: 50 | module_imp = importlib.import_module(module) 51 | importlib.reload(module_imp) 52 | return getattr(importlib.import_module(module, package=None), cls) 53 | 54 | def extract_into_tensor(a, t, x_shape): 55 | b = t.shape[0] 56 | out = ops.GatherD()(a, -1, t) 57 | return out.reshape(b, *((1,) * (len(x_shape) - 1))) 58 | -------------------------------------------------------------------------------- /vision/wukong-huahua/requirements.txt: -------------------------------------------------------------------------------- 1 | opencv-python 2 | omegaconf 3 | einops 4 | ftfy 5 | regex 6 | albumentations 7 | pandas 8 | imagesize 9 | toolz 10 | pillow 11 | -------------------------------------------------------------------------------- /vision/wukong-huahua/scripts/run_db_train.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2023 Huawei Technologies Co., Ltd 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License.
15 | # ============================================================================ 16 | 17 | export GLOG_v=3 18 | export HCCL_CONNECT_TIMEOUT=600 19 | export ASCEND_GLOBAL_LOG_LEVEL=3 20 | export ASCEND_SLOG_PRINT_TO_STDOUT=0 21 | device_id=3 22 | 23 | output_path=output 24 | task_name=α猫 25 | train_data_path=dataset/train_cat 26 | reg_data_path=dataset/reg_cat 27 | class_word=猫 28 | pretrained_model_path=models 29 | train_config_file=configs/train_db_config.json 30 | token=α 31 | 32 | rm -rf ${output_path:?}/${task_name:?} 33 | mkdir -p ${output_path:?}/${task_name:?} 34 | export RANK_SIZE=1;export DEVICE_ID=$device_id;export MS_COMPILER_CACHE_PATH=${output_path:?}/${task_name:?}; \ 35 | nohup python -u run_db_train.py \ 36 | --train_data_path=$train_data_path \ 37 | --reg_data_path=$reg_data_path \ 38 | --class_word=$class_word \ 39 | --token=$token \ 40 | --train_config=$train_config_file \ 41 | --output_path=$output_path/$task_name \ 42 | --use_parallel=False \ 43 | --pretrained_model_path=$pretrained_model_path \ 44 | > $output_path/$task_name/log_train 2>&1 & -------------------------------------------------------------------------------- /vision/wukong-huahua/scripts/run_inpaint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2022 Huawei Technologies Co., Ltd 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ============================================================================ 16 | 17 | export GLOG_v=3 18 | export ASCEND_GLOBAL_LOG_LEVEL=3 19 | export ASCEND_SLOG_PRINT_TO_STDOUT=0 20 | 21 | export DEVICE_ID=0; \ 22 | python inpaint.py \ 23 | --prompt "一只红色的狐狸坐在长椅上" \ 24 | --img demo/inpaint/overture-creations-5sI6fQgYIuo.png \ 25 | --mask demo/inpaint/overture-creations-5sI6fQgYIuo_mask.png \ 26 | --config configs/wukong-huahua_inpaint_inference.yaml \ 27 | --ckpt_name wukong-huahua-inpaint-ms.ckpt -------------------------------------------------------------------------------- /vision/wukong-huahua/scripts/run_train.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2022 Huawei Technologies Co., Ltd 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # ============================================================================ 16 | 17 | export GLOG_v=3 18 | export HCCL_CONNECT_TIMEOUT=600 19 | export ASCEND_GLOBAL_LOG_LEVEL=3 20 | export ASCEND_SLOG_PRINT_TO_STDOUT=0 21 | device_id=2 22 | 23 | output_path=output/ 24 | task_name=txt2img 25 | data_path=dataset/ 26 | pretrained_model_path=models/ 27 | train_config_file=configs/train_config.json 28 | 29 | rm -rf ${output_path:?}/${task_name:?} 30 | mkdir -p ${output_path:?}/${task_name:?} 31 | export RANK_SIZE=1;export DEVICE_ID=$device_id;export MS_COMPILER_CACHE_PATH=${output_path:?}/${task_name:?}; \ 32 | nohup python -u run_train.py \ 33 | --data_path=$data_path \ 34 | --train_config=$train_config_file \ 35 | --output_path=$output_path/$task_name \ 36 | --use_parallel=False \ 37 | --pretrained_model_path=$pretrained_model_path \ 38 | > $output_path/$task_name/log_train 2>&1 & -------------------------------------------------------------------------------- /vision/wukong-huahua/scripts/run_train_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2022 Huawei Technologies Co., Ltd 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ============================================================================ 16 | 17 | export GLOG_v=3 18 | export HCCL_CONNECT_TIMEOUT=600 19 | export ASCEND_GLOBAL_LOG_LEVEL=3 20 | export ASCEND_SLOG_PRINT_TO_STDOUT=0 21 | device_id=6 22 | 23 | output_path=output/ 24 | task_name=txt2img 25 | data_path=dataset 26 | pretrained_model_path=models/ 27 | train_config_file=configs/train_config.json 28 | 29 | rm -rf ${output_path:?}/${task_name:?} 30 | mkdir -p ${output_path:?}/${task_name:?} 31 | export RANK_SIZE=1;export DEVICE_ID=$device_id;export MS_COMPILER_CACHE_PATH=${output_path:?}/${task_name:?}; \ 32 | nohup python -u run_train.py \ 33 | --data_path=$data_path \ 34 | --train_config=$train_config_file \ 35 | --output_path=$output_path/$task_name \ 36 | --use_parallel=False \ 37 | --pretrained_model_path=$pretrained_model_path \ 38 | --model_config=configs/v1-train-chinese-lora.yaml \ 39 | --start_learning_rate=1e-4 \ 40 | --end_learning_rate=1e-6 \ 41 | --enable_lora=True \ 42 | > $output_path/$task_name/train_1p_lora.log 2>&1 & -------------------------------------------------------------------------------- /vision/wukong-huahua/scripts/run_train_parallel.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # -*- coding: UTF-8 -*- 3 | # Copyright 2022 Huawei Technologies Co., Ltd 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================ 17 | 18 | output_path=output/ 19 | task_name=txt2img 20 | data_path=dataset/ 21 | pretrained_model_path=models/ 22 | train_config_file=configs/train_config.json 23 | 24 | if [ $# != 3 ] 25 | then 26 | echo "Usage: 27 | bash scripts/run_train_parallel.sh [DEVICE_NUM] [VISIBLE_DEVICES(0,1,2,3,4,5,6,7)] [RANK_TABLE_FILE]" 28 | exit 1 29 | fi 30 | 31 | if [ $1 -lt 1 ] || [ $1 -gt 8 ] 32 | then 33 | echo "error: DEVICE_NUM=$1 is not in [1,8]" 34 | exit 1 35 | fi 36 | 37 | VISIBLE_DEVICES=$2 38 | IFS="," read -r -a CANDIDATE_DEVICE <<< "$VISIBLE_DEVICES" 39 | if [ ${#CANDIDATE_DEVICE[@]} -ne $1 ] 40 | then 41 | echo "error: DEVICE_NUM=$1 is not matched with VISIBLE_DEVICES=$2" 42 | exit 1 43 | fi 44 | 45 | if [ ! -f $3 ] 46 | then 47 | echo "error: RANK_TABLE_FILE=$3 is not a file" 48 | exit 1 49 | fi 50 | 51 | export GLOG_v=3 52 | export ASCEND_GLOBAL_LOG_LEVEL=3 53 | export ASCEND_GLOBAL_EVENT_ENABLE=0 54 | export ASCEND_SLOG_PRINT_TO_STDOUT=1 55 | export HCCL_CONNECT_TIMEOUT=600 56 | 57 | ulimit -u unlimited 58 | ulimit -SHn 65535 59 | export DEVICE_NUM=$1 60 | export RANK_SIZE=$1 61 | RANK_TABLE_FILE=$(realpath $3) 62 | export RANK_TABLE_FILE 63 | echo "RANK_TABLE_FILE=${RANK_TABLE_FILE}" 64 | 65 | rm -rf ${output_path:?}/${task_name:?} 66 | mkdir -p ${output_path:?}/${task_name:?} 67 | export MS_COMPILER_CACHE_PATH=${output_path:?}/${task_name:?} 68 | export SERVER_ID=0 69 | rank_start=$((DEVICE_NUM * SERVER_ID)) 70 | for((i=0; i<${RANK_SIZE}; i++)) 71 | do 72 | export RANK_ID=$((rank_start + i)) 73 | export DEVICE_ID=${CANDIDATE_DEVICE[i]} 74 | mkdir -p ${output_path:?}/${task_name:?}/rank_$i 75 | echo "start training for rank $RANK_ID, device $DEVICE_ID" 76 | nohup python -u run_train.py \ 77 | --use_parallel=True \ 78 | --data_path=$data_path \ 79 | --train_config=$train_config_file \ 80 | --output_path=$output_path/$task_name \ 81 | --pretrained_model_path=$pretrained_model_path \ 82 | > $output_path/$task_name/rank_$i/log_train 2>&1 & 83 | done -------------------------------------------------------------------------------- /vision/wukong-huahua/scripts/run_train_parallel_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # -*- coding: UTF-8 -*- 3 | # Copyright 2022 Huawei Technologies Co., Ltd 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License.
16 | # ============================================================================ 17 | output_path=output/ 18 | task_name=txt2img 19 | data_path=dataset 20 | pretrained_model_path=models/ 21 | train_config_file=configs/train_config.json 22 | 23 | if [ $# != 3 ] 24 | then 25 | echo "Usage: 26 | bash scripts/run_train_parallel_lora.sh [DEVICE_NUM] [VISIBLE_DEVICES(0,1,2,3,4,5,6,7)] [RANK_TABLE_FILE]" 27 | exit 1 28 | fi 29 | 30 | if [ $1 -lt 1 ] || [ $1 -gt 8 ] 31 | then 32 | echo "error: DEVICE_NUM=$1 is not in [1,8]" 33 | exit 1 34 | fi 35 | 36 | VISIBLE_DEVICES=$2 37 | IFS="," read -r -a CANDIDATE_DEVICE <<< "$VISIBLE_DEVICES" 38 | if [ ${#CANDIDATE_DEVICE[@]} -ne $1 ] 39 | then 40 | echo "error: DEVICE_NUM=$1 is not matched with VISIBLE_DEVICES=$2" 41 | exit 1 42 | fi 43 | 44 | if [ ! -f $3 ] 45 | then 46 | echo "error: RANK_TABLE_FILE=$3 is not a file" 47 | exit 1 48 | fi 49 | 50 | export GLOG_v=3 51 | export ASCEND_GLOBAL_LOG_LEVEL=3 52 | export ASCEND_GLOBAL_EVENT_ENABLE=0 53 | export ASCEND_SLOG_PRINT_TO_STDOUT=1 54 | export HCCL_CONNECT_TIMEOUT=600 55 | 56 | ulimit -u unlimited 57 | ulimit -SHn 65535 58 | export DEVICE_NUM=$1 59 | export RANK_SIZE=$1 60 | RANK_TABLE_FILE=$(realpath $3) 61 | export RANK_TABLE_FILE 62 | echo "RANK_TABLE_FILE=${RANK_TABLE_FILE}" 63 | 64 | rm -rf ${output_path:?}/${task_name:?} 65 | mkdir -p ${output_path:?}/${task_name:?} 66 | export MS_COMPILER_CACHE_PATH=${output_path:?}/${task_name:?} 67 | export SERVER_ID=0 68 | rank_start=$((DEVICE_NUM * SERVER_ID)) 69 | for((i=0; i<${RANK_SIZE}; i++)) 70 | do 71 | export RANK_ID=$((rank_start + i)) 72 | export DEVICE_ID=${CANDIDATE_DEVICE[i]} 73 | mkdir -p ${output_path:?}/${task_name:?}/rank_$i 74 | echo "start training for rank $RANK_ID, device $DEVICE_ID" 75 | nohup python -u run_train.py \ 76 | --use_parallel=True \ 77 | --data_path=$data_path \ 78 | --train_config=$train_config_file \ 79 | --output_path=$output_path/$task_name \ 80 | --pretrained_model_path=$pretrained_model_path \ 81 | --model_config=configs/v1-train-chinese-lora.yaml \ 82 | --start_learning_rate=1e-4 \ 83 | --end_learning_rate=1e-6 \ 84 | --enable_lora=True \ 85 | > $output_path/$task_name/rank_$i/train_lora_8p.log 2>&1 & 86 | done -------------------------------------------------------------------------------- /vision/wukong-huahua/scripts/run_txt2img.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2022 Huawei Technologies Co., Ltd 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ============================================================================ 16 | 17 | export GLOG_v=3 18 | export ASCEND_GLOBAL_LOG_LEVEL=3 19 | export ASCEND_SLOG_PRINT_TO_STDOUT=0 20 | 21 | export DEVICE_ID=0; \ 22 | python txt2img.py \ 23 | --prompt "来自深渊 风景 绘画 写实风格" \ 24 | --config configs/v1-inference-chinese.yaml \ 25 | --output_path ./output/ \ 26 | --seed 42 \ 27 | --dpm_solver \ 28 | --n_iter 4 \ 29 | --n_samples 4 \ 30 | --W 512 \ 31 | --H 512 \ 32 | --ddim_steps 15 33 | -------------------------------------------------------------------------------- /vision/wukong-huahua/scripts/run_txt2img_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2022 Huawei Technologies Co., Ltd 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ============================================================================ 16 | 17 | export GLOG_v=3 18 | export ASCEND_GLOBAL_LOG_LEVEL=3 19 | export ASCEND_SLOG_PRINT_TO_STDOUT=0 20 | 21 | export DEVICE_ID=7; \ 22 | python txt2img.py \ 23 | --prompt "一个带着红色蝴蝶结的小女孩" \ 24 | --config configs/v1-inference-chinese-lora.yaml \ 25 | --output_path ./output/ \ 26 | --enable_lora True \ 27 | --lora_ckpt_filepath ./output/txt2img_lora/ckpt/rank_0/wkhh_txt2img_lora-12_1224.ckpt \ 28 | --seed 42 \ 29 | --n_iter 4 \ 30 | --n_samples 4 \ 31 | --W 512 \ 32 | --H 512 \ 33 | --ddim_steps 30 > test_lora.log 2>&1 & --------------------------------------------------------------------------------
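A minimal usage sketch for the LearningRate schedule defined in ldm/modules/train/learningrate.py above — assuming only that MindSpore is installed and the wukong-huahua directory is on PYTHONPATH; the step values and hyperparameters are illustrative, not repository defaults:

    import mindspore as ms
    from ldm.modules.train.learningrate import LearningRate
    from ldm.modules.train.tools import set_random_seed

    set_random_seed(3407)  # seeds python, numpy and mindspore in one call

    # Linear warmup for 1000 steps, then cosine decay from 1e-4 down to 1e-6.
    lr = LearningRate(start_learning_rate=1e-4, end_learning_rate=1e-6,
                      warmup_steps=1000, decay_steps=100000)

    for step in (0, 500, 1000, 50000, 100000):
        print(step, lr(ms.Tensor(step, ms.int32)))  # LearningRateSchedule cells are callable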