├── LICENSE ├── README.md ├── asset ├── images │ ├── car │ │ ├── image_end.png │ │ └── image_start.png │ ├── cat │ │ ├── image_end.JPG │ │ └── image_start.JPG │ └── folwer │ │ ├── image_end.png │ │ └── image_start.png ├── logo.gif ├── samples.txt └── teaser.png ├── docs ├── index.html └── static │ ├── css │ ├── bulma-carousel.min.css │ ├── bulma-slider.min.css │ ├── bulma.css.map.txt │ ├── bulma.min.css │ ├── custom.css │ ├── fontawesome.all.min.css │ └── index.css │ ├── font_style │ └── jackbrush-alj9a.ttf │ ├── images │ ├── background.jpg │ ├── huggingface_logo.svg │ ├── i23d │ │ ├── 1-1.png │ │ ├── 1-2.png │ │ ├── 1-3.png │ │ ├── 1-4.png │ │ ├── 1-5.png │ │ ├── 1-6.png │ │ ├── 1.png │ │ ├── 2-1.png │ │ ├── 2-2.png │ │ ├── 2-3.png │ │ ├── 2-4.png │ │ ├── 2-5.png │ │ ├── 2-6.png │ │ ├── 2.png │ │ ├── 3-1.png │ │ ├── 3-2.png │ │ ├── 3-3.png │ │ ├── 3-4.png │ │ ├── 3-5.png │ │ ├── 3-6.png │ │ ├── 3.png │ │ ├── 4-1.png │ │ ├── 4-2.png │ │ ├── 4-3.png │ │ ├── 4-4.png │ │ ├── 4-5.png │ │ ├── 4-6.png │ │ └── 4.png │ ├── i2v │ │ ├── .DS_Store │ │ ├── boat │ │ │ ├── .DS_Store │ │ │ ├── boat.gif │ │ │ └── boat.png │ │ ├── cake │ │ │ ├── .DS_Store │ │ │ ├── cake.gif │ │ │ └── cake.png │ │ ├── cloud │ │ │ ├── cloud.gif │ │ │ └── cloud.png │ │ ├── fire │ │ │ ├── .DS_Store │ │ │ ├── fire.gif │ │ │ └── fire.png │ │ ├── meteor │ │ │ ├── .DS_Store │ │ │ ├── meteor.gif │ │ │ └── meteor.png │ │ └── waterfall │ │ │ ├── .DS_Store │ │ │ ├── waterfall.gif │ │ │ └── waterfall.png │ ├── icon.png │ ├── interpolation │ │ ├── car │ │ │ ├── end.png │ │ │ ├── interpolation.gif │ │ │ ├── interpolation.mp4 │ │ │ └── start.png │ │ └── cat_tiger │ │ │ ├── end.png │ │ │ ├── interpolation.gif │ │ │ ├── interpolation.mp4 │ │ │ └── start.png │ ├── logo.gif │ ├── logo.png │ ├── method │ │ └── method.png │ └── t2i │ │ ├── A baby rabbit wearing a tiny knitted hat, ultra-detailed, photorealistic.jpg │ │ ├── A bear with fur made of chocolate shavings, standing in a clearing filled with marshmallow mushrooms.jpeg │ │ ├── A breathtaking view of the Swiss Alps during sunrise, with snow-capped peaks and lush green valleys, ultra-realistic, high detail.jpg │ │ ├── A close-up of a sunlit butterfly resting on a flower in a garden.jpeg │ │ ├── A close-up of a vibrant, fully bloomed red rose with dew drops on its petals.jpeg │ │ ├── A close-up photograph of a lion with its mane blowing in the wind against the savanna backdrop.jpeg │ │ ├── A curious dolphin leaping out of the water, creating splashes in the sunlight.jpeg │ │ ├── A cyborg superhero with a robotic arm and high-tech gadgets, standing atop a skyscraper.jpg │ │ ├── A detailed close-up of a rusted vintage car abandoned in an overgrown field.jpg │ │ ├── A dragon made of molten chocolate, with scales that glisten like gold leaf and eyes of crystalline sugar.jpeg │ │ ├── A dramatic mountain range during a thunderstorm, with dark clouds, lightning strikes, and rugged terrain.jpg │ │ ├── A gorilla wearing an advanced robotic suit with pulsating energy cores, standing on the edge of a futuristic skyline.jpeg │ │ ├── A group of astronauts standing on the surface of Mars, with Earth visible in the distant sky.png │ │ ├── A hippopotamus with a body of jelly-like translucent gelatin, lounging in a pool of liquid sherbet.jpeg │ │ ├── A lion made entirely of layered caramel and chocolate, with a mane composed of spun sugar flames.jpeg │ │ ├── A lion with a mane made of holographic flames, standing on a crystal platform in a neon-lit jungle.jpeg │ │ ├── A magical forest with glowing plants, 
where a young anime girl with long hair discovers a hidden portal.jpg │ │ ├── A majestic bald eagle soaring over a snowy mountain range.jpeg │ │ ├── A majestic mountain range under a starry sky, with swirling clouds and glowing moonlight, inspired by Van Gogh.jpeg │ │ ├── A peaceful forest in autumn, with golden leaves falling and a stream running through it, illuminated by soft sunlight.png │ │ ├── A peaceful mountain lake reflecting the surrounding pine trees and snowy peaks, photorealistic, tranquil.jpg │ │ ├── A phoenix-like bird with wings made of fiery red fruit leather and a beak of candied citrus peel.jpeg │ │ ├── A realistic photograph of a wolf howling at the moon in a snowy forest.jpeg │ │ ├── A rustic bedroom showcasing a round bed, earth-toned decor, and a cluttered, yet charming ambiance.jpg │ │ ├── A stealthy ninja superhero in a dark alley, showcasing agility and advanced technology.jpg │ │ ├── A wolf constructed from layers of dark chocolate and nougat, with glowing eyes made of candied cherries.jpeg │ │ ├── An owl constructed from layers of caramel popcorn and hazelnut chocolate, perched on a pretzel branch.jpeg │ │ ├── Bentley Bacalar driving on black tarmac road with trees in background, Sumi-e drawing, white background 8k.jpg │ │ ├── Documentary-style photography of a bustling marketplace in Marrakech, with spices and textiles.png │ │ ├── Kraken is listening to music with headphones.png │ │ ├── Post-Apocalyptic Wanderer, character design, style by kim jung gi, zabrocki, karlkka, jayison devadas, 8k.png │ │ ├── The picture shows a cute little tiger, wearing a blue hoodie and hat, sitting on a small cardboard boat on calm water.png │ │ ├── Two baby ducks swimming in a pond at sunset, highly detailed, hyper-realistic.jpg │ │ ├── Two female rabbit adventurers dressed in a fancy velvet coats next to a Christmas tree, Christmas theme, on an antique opulent background , jean - baptiste monge , smooth, anthropomorphic photorealistic, photography, lifelike, high resolution, smooth.jpg │ │ ├── beautiful lady,freckles, big smile,blue eyes, short ginger hair, dark makeup, wearing a floral blue vest top, soft light,dark grey background.jpeg │ │ ├── cloud.jpg │ │ └── golden sunset shines on the top of snow-capped mountains, with small villages at its foot and surrounding buildings.png │ └── js │ ├── bulma-carousel.js │ ├── bulma-carousel.min.js │ ├── bulma-slider.js │ ├── bulma-slider.min.js │ ├── choose_image.js │ ├── fontawesome.all.min.js │ ├── image2gif.js │ ├── index.js │ ├── scroll.js │ └── video_comparison.js ├── gradio_demos ├── lumos_I2I.py └── lumos_T2I.py ├── lumos_diffusion ├── __init__.py ├── dpm_solver.py ├── dpm_solver_inter.py └── model │ ├── __init__.py │ ├── builder.py │ ├── diffusion_utils.py │ ├── dino │ └── vision_transformer.py │ ├── dpm_solver.py │ ├── dpm_solver_inter.py │ ├── gaussian_diffusion.py │ ├── gaussian_diffusion_inter.py │ ├── lumos │ ├── LumosI2I.py │ ├── LumosT2I.py │ ├── LumosT2IMS.py │ ├── Lumos_blocks.py │ └── __init__.py │ ├── t5.py │ ├── timestep_sampler.py │ └── utils.py ├── requirements.txt └── utils ├── __init__.py ├── download.py └── resolution.py /LICENSE: -------------------------------------------------------------------------------- 1 | ------------------------------ LICENSE for Lumos ------------------------------ 2 | 3 | Copyright (c) 2024 Ant Group. 
4 | 5 | MIT License 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy 8 | of this software and associated documentation files (the "Software"), to deal 9 | in the Software without restriction, including without limitation the rights 10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | copies of the Software, and to permit persons to whom the Software is 12 | furnished to do so, subject to the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be included in all 15 | copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | 3 |

4 | 5 | ###
Learning Visual Generative Priors without Text
6 |
7 |
8 | Shuailei Ma*1, 9 | Kecheng Zheng*2, 10 | Ying Wei✉️1, Wei Wu2, Fan Lu2, 11 | Yifei Zhang3, Chen-Wei Xie4, 12 | Biao Gong2, 13 | Jiapeng Zhu5, 14 | Yujun Shen✉️2
15 | 1Northeastern University, China 2Ant Group 3SJTU 4Alibaba Group 5HKUST
16 | *equal contribution ✉️ corresponding author 17 |
18 |
19 |
20 |   21 |   22 |   23 |
24 |
25 | 26 | ## 📝 Content 27 | * [Update Log](#-update-log) 28 | * [Abstract](#-abstract) 29 | * [Setup](#️-setup) 30 | * [Citation](#-citation) 31 | * [License](#license) 32 | * [Acknowledgement](#acknowledgement) 33 | 34 | 35 | ## 📣 Update Log 36 | - [2024.11.21] 🎉 Here comes Lumos! We release the code and Gradio demos of Lumos-I2I and Lumos-T2I. 37 | 38 | ## 🪄✨ Abstract 39 | TL;DR: Lumos is a pure vision-based generative framework that confirms the feasibility and scalability of learning visual generative priors. It can be efficiently adapted to visual generative tasks such as text-to-image, image-to-3D, and image-to-video generation. 40 |
CLICK for the full abstract 41 | Although text-to-image (T2I) models have recently thrived as visual generative priors, their reliance on high-quality text-image pairs makes scaling up expensive. 42 | We argue that grasping the cross-modality alignment is not a necessity for a sound visual generative prior, whose focus should be on texture modeling. 43 | Such a philosophy inspires us to study image-to-image (I2I) generation, where models can learn from in-the-wild images in a self-supervised manner. 44 | We first develop a pure vision-based training framework, Lumos, and confirm the feasibility and the scalability of learning I2I models. 45 | We then find that, as an upstream task of T2I, our I2I model serves as a more foundational visual prior and achieves on-par or better performance than existing T2I models using only 1/10 text-image pairs for fine-tuning. 46 | We further demonstrate the superiority of I2I priors over T2I priors on some text-irrelevant visual generative tasks, like image-to-3D and image-to-video. 47 |
48 | 49 | ![Visualization of various downstream tasks of Lumos](asset/teaser.png) 50 | 51 | 52 | ## ⚙️ Setup 53 | Follow the guide below to set up the environment. 54 | - Python >= 3.9 (we recommend [Anaconda](https://www.anaconda.com/download/#linux) or [Miniconda](https://docs.conda.io/en/latest/miniconda.html)) 55 | - [PyTorch >= 2.2.1 (CUDA 11.8)](https://pytorch.org/) 56 | - A virtual environment is recommended 57 | 58 | Install the required dependencies by following the commands below. 59 | 60 | 1. Clone the repository. 61 | ``` 62 | git clone https://github.com/xiaomabufei/lumos.git 63 | cd lumos 64 | ``` 65 | 2. Download the model checkpoints. 66 | ``` 67 | mkdir ./checkpoints && cd ./checkpoints 68 | git lfs install 69 | git clone https://huggingface.co/Xiaomabufei/lumos 70 | ``` 71 | 72 | 3. Create the environment. 73 | ``` 74 | conda create -n lumos python=3.9 -y 75 | conda activate lumos 76 | ``` 77 | 78 | 4. Install PyTorch with GPU support. 79 | ``` 80 | pip install torch==2.2.1+cu118 torchvision==0.17.1+cu118 -f https://download.pytorch.org/whl/torch_stable.html 81 | ``` 82 | 83 | 5. Install the xformers build that matches your torch and CUDA versions. 84 | ``` 85 | pip install -U xformers==0.0.25 86 | ``` 87 | 88 | 6. Install the remaining dependencies. 89 | ``` 90 | pip install -r requirements.txt 91 | ``` 92 | 93 | 7. Run the Lumos image interpolation demo. 94 | ``` 95 | python gradio_demos/lumos_I2I.py 96 | ``` 97 | 98 | 8. Run the Lumos text-to-image generation demo. 99 | ``` 100 | python gradio_demos/lumos_T2I.py 101 | ``` 102 | If you are a user in mainland China, you may try `export HF_ENDPOINT=https://hf-mirror.com` to use a Hugging Face mirror, which facilitates downloading the checkpoints required to run our system. A scripted way to verify the installation and download the checkpoints is sketched at the end of this README. 103 | 104 | ## 📖 Citation 105 | Don't forget to cite this work if it proves useful in your research! 106 | ```bibtex 107 | @article{Lumos2024, 108 | title={Learning Visual Generative Priors without Text}, 109 | author={Ma, Shuailei and Zheng, Kecheng and Wei, Ying and Wu, Wei and Lu, Fan and Zhang, Yifei and Xie, Chen-Wei and Gong, Biao and Zhu, Jiapeng and Shen, Yujun}, 110 | year={2024}, 111 | eprint={arxiv}, 112 | archivePrefix={arXiv}, 113 | primaryClass={cs.CV}} 114 | ``` 115 | 116 | ## License 117 | This repository is released under the MIT license as found in the [LICENSE](LICENSE) file. 118 | 119 | ## Acknowledgement 120 | Our implementation is based on [DiT](https://github.com/facebookresearch/DiT), [Pixart-α](https://github.com/PixArt-alpha/PixArt-alpha) and [Dino](https://github.com/facebookresearch/dino). Thanks for their remarkable contributions and released code!
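As referenced in the Setup section, below is a minimal Python sketch (not shipped with this repository) that checks the GPU build of PyTorch from step 4 and downloads the released checkpoints with `huggingface_hub.snapshot_download` as an alternative to the `git lfs` clone in step 2. The script name and the `./checkpoints/lumos` target directory are illustrative assumptions; adjust them to your layout.

```python
# verify_setup.py -- hypothetical helper script, not part of this repository.
import os

# Optional (mainland China users): route Hugging Face traffic through the
# mirror mentioned above. HF_ENDPOINT must be set before `huggingface_hub`
# is imported, otherwise it is ignored.
# os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"

import torch
from huggingface_hub import snapshot_download

# Confirm that the CUDA-enabled PyTorch build installed in step 4 is active.
print(f"torch {torch.__version__}, CUDA {torch.version.cuda}, "
      f"GPU available: {torch.cuda.is_available()}")

# Fetch the released checkpoints (the same repo cloned in step 2).
# The local_dir mirrors the ./checkpoints/lumos layout created there.
ckpt_dir = snapshot_download(
    repo_id="Xiaomabufei/lumos",
    local_dir="./checkpoints/lumos",
)
print(f"Checkpoints available at: {ckpt_dir}")
```

Once the checkpoints are in place, steps 7 and 8 launch the Gradio demos, which serve a local web UI (Gradio's usual default address is http://127.0.0.1:7860).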
121 | -------------------------------------------------------------------------------- /asset/images/car/image_end.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/asset/images/car/image_end.png -------------------------------------------------------------------------------- /asset/images/car/image_start.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/asset/images/car/image_start.png -------------------------------------------------------------------------------- /asset/images/cat/image_end.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/asset/images/cat/image_end.JPG -------------------------------------------------------------------------------- /asset/images/cat/image_start.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/asset/images/cat/image_start.JPG -------------------------------------------------------------------------------- /asset/images/folwer/image_end.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/asset/images/folwer/image_end.png -------------------------------------------------------------------------------- /asset/images/folwer/image_start.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/asset/images/folwer/image_start.png -------------------------------------------------------------------------------- /asset/logo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/asset/logo.gif -------------------------------------------------------------------------------- /asset/samples.txt: -------------------------------------------------------------------------------- 1 | A close-up of a vibrant, fully bloomed red rose with dew drops on its petals -------------------------------------------------------------------------------- /asset/teaser.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/asset/teaser.png -------------------------------------------------------------------------------- /docs/static/css/bulma-carousel.min.css: -------------------------------------------------------------------------------- 1 | @-webkit-keyframes spinAround{from{-webkit-transform:rotate(0);transform:rotate(0)}to{-webkit-transform:rotate(359deg);transform:rotate(359deg)}}@keyframes spinAround{from{-webkit-transform:rotate(0);transform:rotate(0)}to{-webkit-transform:rotate(359deg);transform:rotate(359deg)}}.slider{position:relative;width:100%}.slider-container{display:flex;flex-wrap:nowrap;flex-direction:row;overflow:hidden;-webkit-transform:translate3d(0,0,0);transform:translate3d(0,0,0);min-height:100%}.slider-container.is-vertical{flex-direction:column}.slider-container 
.slider-item{flex:none}.slider-container .slider-item .image.is-covered img{-o-object-fit:cover;object-fit:cover;-o-object-position:center center;object-position:center center;height:100%;width:100%}.slider-container .slider-item .video-container{height:0;padding-bottom:0;padding-top:56.25%;margin:0;position:relative}.slider-container .slider-item .video-container.is-1by1,.slider-container .slider-item .video-container.is-square{padding-top:100%}.slider-container .slider-item .video-container.is-4by3{padding-top:75%}.slider-container .slider-item .video-container.is-21by9{padding-top:42.857143%}.slider-container .slider-item .video-container embed,.slider-container .slider-item .video-container iframe,.slider-container .slider-item .video-container object{position:absolute;top:0;left:0;width:100%!important;height:100%!important}.slider-navigation-next,.slider-navigation-previous{display:flex;justify-content:center;align-items:center;position:absolute;width:42px;height:42px;background:#fff center center no-repeat;background-size:20px 20px;border:1px solid #fff;border-radius:25091983px;box-shadow:0 2px 5px #3232321a;top:50%;margin-top:-20px;left:0;cursor:pointer;transition:opacity .3s,-webkit-transform .3s;transition:transform .3s,opacity .3s;transition:transform .3s,opacity .3s,-webkit-transform .3s}.slider-navigation-next:hover,.slider-navigation-previous:hover{-webkit-transform:scale(1.2);transform:scale(1.2)}.slider-navigation-next.is-hidden,.slider-navigation-previous.is-hidden{display:none;opacity:0}.slider-navigation-next svg,.slider-navigation-previous svg{width:25%}.slider-navigation-next{left:auto;right:0;background:#fff center center no-repeat;background-size:20px 20px}.slider-pagination{display:none;justify-content:center;align-items:center;position:absolute;bottom:0;left:0;right:0;padding:.5rem 1rem;text-align:center}.slider-pagination .slider-page{background:#fff;width:10px;height:10px;border-radius:25091983px;display:inline-block;margin:0 3px;box-shadow:0 2px 5px #3232321a;transition:-webkit-transform .3s;transition:transform .3s;transition:transform .3s,-webkit-transform .3s;cursor:pointer}.slider-pagination .slider-page.is-active,.slider-pagination .slider-page:hover{-webkit-transform:scale(1.4);transform:scale(1.4)}@media screen and (min-width:800px){.slider-pagination{display:flex}}.hero.has-carousel{position:relative}.hero.has-carousel+.hero-body,.hero.has-carousel+.hero-footer,.hero.has-carousel+.hero-head{z-index:10;overflow:hidden}.hero.has-carousel .hero-carousel{position:absolute;top:0;left:0;bottom:0;right:0;height:auto;border:none;margin:auto;padding:0;z-index:0}.hero.has-carousel .hero-carousel .slider{width:100%;max-width:100%;overflow:hidden;height:100%!important;max-height:100%;z-index:0}.hero.has-carousel .hero-carousel .slider .has-background{max-height:100%}.hero.has-carousel .hero-carousel .slider .has-background .is-background{-o-object-fit:cover;object-fit:cover;-o-object-position:center center;object-position:center center;height:100%;width:100%}.hero.has-carousel .hero-body{margin:0 3rem;z-index:10} -------------------------------------------------------------------------------- /docs/static/css/bulma-slider.min.css: -------------------------------------------------------------------------------- 1 | @-webkit-keyframes spinAround{from{-webkit-transform:rotate(0);transform:rotate(0)}to{-webkit-transform:rotate(359deg);transform:rotate(359deg)}}@keyframes 
spinAround{from{-webkit-transform:rotate(0);transform:rotate(0)}to{-webkit-transform:rotate(359deg);transform:rotate(359deg)}}input[type=range].slider{-webkit-appearance:none;-moz-appearance:none;appearance:none;margin:1rem 0;background:0 0;touch-action:none}input[type=range].slider.is-fullwidth{display:block;width:100%}input[type=range].slider:focus{outline:0}input[type=range].slider:not([orient=vertical])::-webkit-slider-runnable-track{width:100%}input[type=range].slider:not([orient=vertical])::-moz-range-track{width:100%}input[type=range].slider:not([orient=vertical])::-ms-track{width:100%}input[type=range].slider:not([orient=vertical]).has-output+output,input[type=range].slider:not([orient=vertical]).has-output-tooltip+output{width:3rem;background:#4a4a4a;border-radius:4px;padding:.4rem .8rem;font-size:.75rem;line-height:.75rem;text-align:center;text-overflow:ellipsis;white-space:nowrap;color:#fff;overflow:hidden;pointer-events:none;z-index:200}input[type=range].slider:not([orient=vertical]).has-output-tooltip:disabled+output,input[type=range].slider:not([orient=vertical]).has-output:disabled+output{opacity:.5}input[type=range].slider:not([orient=vertical]).has-output{display:inline-block;vertical-align:middle;width:calc(100% - (4.2rem))}input[type=range].slider:not([orient=vertical]).has-output+output{display:inline-block;margin-left:.75rem;vertical-align:middle}input[type=range].slider:not([orient=vertical]).has-output-tooltip{display:block}input[type=range].slider:not([orient=vertical]).has-output-tooltip+output{position:absolute;left:0;top:-.1rem}input[type=range].slider[orient=vertical]{-webkit-appearance:slider-vertical;-moz-appearance:slider-vertical;appearance:slider-vertical;-webkit-writing-mode:bt-lr;-ms-writing-mode:bt-lr;writing-mode:bt-lr}input[type=range].slider[orient=vertical]::-webkit-slider-runnable-track{height:100%}input[type=range].slider[orient=vertical]::-moz-range-track{height:100%}input[type=range].slider[orient=vertical]::-ms-track{height:100%}input[type=range].slider::-webkit-slider-runnable-track{cursor:pointer;animate:.2s;box-shadow:0 0 0 #7a7a7a;background:#dbdbdb;border-radius:4px;border:0 solid #7a7a7a}input[type=range].slider::-moz-range-track{cursor:pointer;animate:.2s;box-shadow:0 0 0 #7a7a7a;background:#dbdbdb;border-radius:4px;border:0 solid #7a7a7a}input[type=range].slider::-ms-track{cursor:pointer;animate:.2s;box-shadow:0 0 0 #7a7a7a;background:#dbdbdb;border-radius:4px;border:0 solid #7a7a7a}input[type=range].slider::-ms-fill-lower{background:#dbdbdb;border-radius:4px}input[type=range].slider::-ms-fill-upper{background:#dbdbdb;border-radius:4px}input[type=range].slider::-webkit-slider-thumb{box-shadow:none;border:1px solid #b5b5b5;border-radius:4px;background:#fff;cursor:pointer}input[type=range].slider::-moz-range-thumb{box-shadow:none;border:1px solid #b5b5b5;border-radius:4px;background:#fff;cursor:pointer}input[type=range].slider::-ms-thumb{box-shadow:none;border:1px solid 
#b5b5b5;border-radius:4px;background:#fff;cursor:pointer}input[type=range].slider::-webkit-slider-thumb{-webkit-appearance:none;appearance:none}input[type=range].slider.is-circle::-webkit-slider-thumb{border-radius:290486px}input[type=range].slider.is-circle::-moz-range-thumb{border-radius:290486px}input[type=range].slider.is-circle::-ms-thumb{border-radius:290486px}input[type=range].slider:active::-webkit-slider-thumb{-webkit-transform:scale(1.25);transform:scale(1.25)}input[type=range].slider:active::-moz-range-thumb{transform:scale(1.25)}input[type=range].slider:active::-ms-thumb{transform:scale(1.25)}input[type=range].slider:disabled{opacity:.5;cursor:not-allowed}input[type=range].slider:disabled::-webkit-slider-thumb{cursor:not-allowed;-webkit-transform:scale(1);transform:scale(1)}input[type=range].slider:disabled::-moz-range-thumb{cursor:not-allowed;transform:scale(1)}input[type=range].slider:disabled::-ms-thumb{cursor:not-allowed;transform:scale(1)}input[type=range].slider:not([orient=vertical]){min-height:calc((1rem + 2px) * 1.25)}input[type=range].slider:not([orient=vertical])::-webkit-slider-runnable-track{height:.5rem}input[type=range].slider:not([orient=vertical])::-moz-range-track{height:.5rem}input[type=range].slider:not([orient=vertical])::-ms-track{height:.5rem}input[type=range].slider[orient=vertical]::-webkit-slider-runnable-track{width:.5rem}input[type=range].slider[orient=vertical]::-moz-range-track{width:.5rem}input[type=range].slider[orient=vertical]::-ms-track{width:.5rem}input[type=range].slider::-webkit-slider-thumb{height:1rem;width:1rem}input[type=range].slider::-moz-range-thumb{height:1rem;width:1rem}input[type=range].slider::-ms-thumb{height:1rem;width:1rem}input[type=range].slider::-ms-thumb{margin-top:0}input[type=range].slider::-webkit-slider-thumb{margin-top:-.25rem}input[type=range].slider[orient=vertical]::-webkit-slider-thumb{margin-top:auto;margin-left:-.25rem}input[type=range].slider.is-small:not([orient=vertical]){min-height:calc((.75rem + 2px) * 1.25)}input[type=range].slider.is-small:not([orient=vertical])::-webkit-slider-runnable-track{height:.375rem}input[type=range].slider.is-small:not([orient=vertical])::-moz-range-track{height:.375rem}input[type=range].slider.is-small:not([orient=vertical])::-ms-track{height:.375rem}input[type=range].slider.is-small[orient=vertical]::-webkit-slider-runnable-track{width:.375rem}input[type=range].slider.is-small[orient=vertical]::-moz-range-track{width:.375rem}input[type=range].slider.is-small[orient=vertical]::-ms-track{width:.375rem}input[type=range].slider.is-small::-webkit-slider-thumb{height:.75rem;width:.75rem}input[type=range].slider.is-small::-moz-range-thumb{height:.75rem;width:.75rem}input[type=range].slider.is-small::-ms-thumb{height:.75rem;width:.75rem}input[type=range].slider.is-small::-ms-thumb{margin-top:0}input[type=range].slider.is-small::-webkit-slider-thumb{margin-top:-.1875rem}input[type=range].slider.is-small[orient=vertical]::-webkit-slider-thumb{margin-top:auto;margin-left:-.1875rem}input[type=range].slider.is-medium:not([orient=vertical]){min-height:calc((1.25rem + 2px) * 
1.25)}input[type=range].slider.is-medium:not([orient=vertical])::-webkit-slider-runnable-track{height:.625rem}input[type=range].slider.is-medium:not([orient=vertical])::-moz-range-track{height:.625rem}input[type=range].slider.is-medium:not([orient=vertical])::-ms-track{height:.625rem}input[type=range].slider.is-medium[orient=vertical]::-webkit-slider-runnable-track{width:.625rem}input[type=range].slider.is-medium[orient=vertical]::-moz-range-track{width:.625rem}input[type=range].slider.is-medium[orient=vertical]::-ms-track{width:.625rem}input[type=range].slider.is-medium::-webkit-slider-thumb{height:1.25rem;width:1.25rem}input[type=range].slider.is-medium::-moz-range-thumb{height:1.25rem;width:1.25rem}input[type=range].slider.is-medium::-ms-thumb{height:1.25rem;width:1.25rem}input[type=range].slider.is-medium::-ms-thumb{margin-top:0}input[type=range].slider.is-medium::-webkit-slider-thumb{margin-top:-.3125rem}input[type=range].slider.is-medium[orient=vertical]::-webkit-slider-thumb{margin-top:auto;margin-left:-.3125rem}input[type=range].slider.is-large:not([orient=vertical]){min-height:calc((1.5rem + 2px) * 1.25)}input[type=range].slider.is-large:not([orient=vertical])::-webkit-slider-runnable-track{height:.75rem}input[type=range].slider.is-large:not([orient=vertical])::-moz-range-track{height:.75rem}input[type=range].slider.is-large:not([orient=vertical])::-ms-track{height:.75rem}input[type=range].slider.is-large[orient=vertical]::-webkit-slider-runnable-track{width:.75rem}input[type=range].slider.is-large[orient=vertical]::-moz-range-track{width:.75rem}input[type=range].slider.is-large[orient=vertical]::-ms-track{width:.75rem}input[type=range].slider.is-large::-webkit-slider-thumb{height:1.5rem;width:1.5rem}input[type=range].slider.is-large::-moz-range-thumb{height:1.5rem;width:1.5rem}input[type=range].slider.is-large::-ms-thumb{height:1.5rem;width:1.5rem}input[type=range].slider.is-large::-ms-thumb{margin-top:0}input[type=range].slider.is-large::-webkit-slider-thumb{margin-top:-.375rem}input[type=range].slider.is-large[orient=vertical]::-webkit-slider-thumb{margin-top:auto;margin-left:-.375rem}input[type=range].slider.is-white::-moz-range-track{background:#fff!important}input[type=range].slider.is-white::-webkit-slider-runnable-track{background:#fff!important}input[type=range].slider.is-white::-ms-track{background:#fff!important}input[type=range].slider.is-white::-ms-fill-lower{background:#fff}input[type=range].slider.is-white::-ms-fill-upper{background:#fff}input[type=range].slider.is-white .has-output-tooltip+output,input[type=range].slider.is-white.has-output+output{background-color:#fff;color:#0a0a0a}input[type=range].slider.is-black::-moz-range-track{background:#0a0a0a!important}input[type=range].slider.is-black::-webkit-slider-runnable-track{background:#0a0a0a!important}input[type=range].slider.is-black::-ms-track{background:#0a0a0a!important}input[type=range].slider.is-black::-ms-fill-lower{background:#0a0a0a}input[type=range].slider.is-black::-ms-fill-upper{background:#0a0a0a}input[type=range].slider.is-black 
.has-output-tooltip+output,input[type=range].slider.is-black.has-output+output{background-color:#0a0a0a;color:#fff}input[type=range].slider.is-light::-moz-range-track{background:#f5f5f5!important}input[type=range].slider.is-light::-webkit-slider-runnable-track{background:#f5f5f5!important}input[type=range].slider.is-light::-ms-track{background:#f5f5f5!important}input[type=range].slider.is-light::-ms-fill-lower{background:#f5f5f5}input[type=range].slider.is-light::-ms-fill-upper{background:#f5f5f5}input[type=range].slider.is-light .has-output-tooltip+output,input[type=range].slider.is-light.has-output+output{background-color:#f5f5f5;color:#363636}input[type=range].slider.is-dark::-moz-range-track{background:#363636!important}input[type=range].slider.is-dark::-webkit-slider-runnable-track{background:#363636!important}input[type=range].slider.is-dark::-ms-track{background:#363636!important}input[type=range].slider.is-dark::-ms-fill-lower{background:#363636}input[type=range].slider.is-dark::-ms-fill-upper{background:#363636}input[type=range].slider.is-dark .has-output-tooltip+output,input[type=range].slider.is-dark.has-output+output{background-color:#363636;color:#f5f5f5}input[type=range].slider.is-primary::-moz-range-track{background:#00d1b2!important}input[type=range].slider.is-primary::-webkit-slider-runnable-track{background:#00d1b2!important}input[type=range].slider.is-primary::-ms-track{background:#00d1b2!important}input[type=range].slider.is-primary::-ms-fill-lower{background:#00d1b2}input[type=range].slider.is-primary::-ms-fill-upper{background:#00d1b2}input[type=range].slider.is-primary .has-output-tooltip+output,input[type=range].slider.is-primary.has-output+output{background-color:#00d1b2;color:#fff}input[type=range].slider.is-link::-moz-range-track{background:#3273dc!important}input[type=range].slider.is-link::-webkit-slider-runnable-track{background:#3273dc!important}input[type=range].slider.is-link::-ms-track{background:#3273dc!important}input[type=range].slider.is-link::-ms-fill-lower{background:#3273dc}input[type=range].slider.is-link::-ms-fill-upper{background:#3273dc}input[type=range].slider.is-link .has-output-tooltip+output,input[type=range].slider.is-link.has-output+output{background-color:#3273dc;color:#fff}input[type=range].slider.is-info::-moz-range-track{background:#209cee!important}input[type=range].slider.is-info::-webkit-slider-runnable-track{background:#209cee!important}input[type=range].slider.is-info::-ms-track{background:#209cee!important}input[type=range].slider.is-info::-ms-fill-lower{background:#209cee}input[type=range].slider.is-info::-ms-fill-upper{background:#209cee}input[type=range].slider.is-info .has-output-tooltip+output,input[type=range].slider.is-info.has-output+output{background-color:#209cee;color:#fff}input[type=range].slider.is-success::-moz-range-track{background:#23d160!important}input[type=range].slider.is-success::-webkit-slider-runnable-track{background:#23d160!important}input[type=range].slider.is-success::-ms-track{background:#23d160!important}input[type=range].slider.is-success::-ms-fill-lower{background:#23d160}input[type=range].slider.is-success::-ms-fill-upper{background:#23d160}input[type=range].slider.is-success 
.has-output-tooltip+output,input[type=range].slider.is-success.has-output+output{background-color:#23d160;color:#fff}input[type=range].slider.is-warning::-moz-range-track{background:#ffdd57!important}input[type=range].slider.is-warning::-webkit-slider-runnable-track{background:#ffdd57!important}input[type=range].slider.is-warning::-ms-track{background:#ffdd57!important}input[type=range].slider.is-warning::-ms-fill-lower{background:#ffdd57}input[type=range].slider.is-warning::-ms-fill-upper{background:#ffdd57}input[type=range].slider.is-warning .has-output-tooltip+output,input[type=range].slider.is-warning.has-output+output{background-color:#ffdd57;color:rgba(0,0,0,.7)}input[type=range].slider.is-danger::-moz-range-track{background:#ff3860!important}input[type=range].slider.is-danger::-webkit-slider-runnable-track{background:#ff3860!important}input[type=range].slider.is-danger::-ms-track{background:#ff3860!important}input[type=range].slider.is-danger::-ms-fill-lower{background:#ff3860}input[type=range].slider.is-danger::-ms-fill-upper{background:#ff3860}input[type=range].slider.is-danger .has-output-tooltip+output,input[type=range].slider.is-danger.has-output+output{background-color:#ff3860;color:#fff} -------------------------------------------------------------------------------- /docs/static/css/custom.css: -------------------------------------------------------------------------------- 1 | 2 | .div-1 { 3 | background-color: rgb(255, 255, 255); 4 | } 5 | 6 | .div-2 { 7 | background-color: #eaeaea; 8 | } 9 | 10 | .div-3 { 11 | background-color: #FBD603; 12 | } 13 | 14 | .scroll-container { 15 | display: flex; 16 | overflow-x: auto; 17 | scroll-snap-type: x mandatory; 18 | gap: 12px; 19 | padding: 10px; 20 | scrollbar-width: 2px; 21 | height: 480px; 22 | } 23 | 24 | .scroll-item { 25 | flex: 0 0 auto; 26 | scroll-snap-type: center; 27 | align-items: center; 28 | justify-content: center; 29 | text-align: center; 30 | } 31 | 32 | 33 | .scroll-item img { 34 | display: inline-block; 35 | height: 350px; 36 | width: auto; 37 | border-radius: 6px; 38 | } 39 | 40 | .scroll-item .caption { 41 | display: block; 42 | max-width: 100%; 43 | margin-top: 8px; 44 | padding: 5px; 45 | border: 2px solid #ccc; 46 | border-radius: 5px; 47 | background-color: #f9f9f9; 48 | font-size: 18px; 49 | color: #333; 50 | word-wrap: break-word; 51 | word-break: break-word; 52 | white-space: normal; 53 | line-height:1.2; 54 | } 55 | 56 | .gallery-image { 57 | display: flex; 58 | flex-direction: column; 59 | justify-content: center; 60 | text-align: center; 61 | line-height: 200px; 62 | } 63 | 64 | 65 | .btn { 66 | border: 1px solid; 67 | background-color: #f8f9f9; 68 | font-size: 19px; 69 | padding: 2px 8px; 70 | border-radius:10px; 71 | } 72 | 73 | .one { 74 | color: #7E57C2; 75 | } 76 | 77 | .two { 78 | color: #FFAB40; 79 | } 80 | 81 | .three { 82 | color: #2980b9; 83 | } 84 | 85 | .btn:hover { 86 | color: white; 87 | border: 0; 88 | } 89 | 90 | .one:hover { 91 | background-color: #7E57C2; 92 | } 93 | 94 | .two:hover { 95 | background-color: #FFAB40; 96 | } 97 | 98 | .three:hover { 99 | background-color: #2980b9; 100 | } 101 | 102 | @keyframes change_text { 103 | 0% { 104 | opacity: 0; 105 | } 106 | 107 | 10% { 108 | opacity: 1; 109 | } 110 | } 111 | 112 | .choice_container { 113 | display: flex; 114 | align-items: center; 115 | gap: 10px; 116 | justify-content: center; 117 | } 118 | 119 | .image-list { 120 | margin-top: 35px; 121 | display: flex; 122 | flex-direction: column; 123 | gap: 10px; 124 | } 125 | 126 | 
.image-list img:hover { 127 | border-color: #007bff; 128 | } 129 | 130 | .image-list img { 131 | height: 80px; 132 | cursor: pointer; 133 | border: 2px solid #ccc; 134 | border-radius: 4px; 135 | } 136 | 137 | .selected-image img { 138 | height: 200px; 139 | border-radius: 4px; 140 | } 141 | 142 | .selected-image .caption { 143 | margin-top: 5px; 144 | padding: 5px; 145 | border-radius: 5px; 146 | font-size: 23px; 147 | font-weight: bold; 148 | } 149 | 150 | .new-view-container { 151 | width: 100%; 152 | height: 100%; 153 | border: 2px solid #ccc; 154 | border-radius: 4px; 155 | display: grid; 156 | grid-template-columns: repeat(3, 1fr); 157 | padding-left: 20px; 158 | padding-left: 5px; 159 | } 160 | 161 | .new-view-container img { 162 | width: 150px; 163 | height: 160px; 164 | border-radius: 4px; 165 | } 166 | 167 | 168 | .gif-container { 169 | margin-top: 45px; 170 | display: flex; 171 | flex-direction: row; 172 | gap: 80px; 173 | align-items: center; 174 | justify-content: center; 175 | } 176 | 177 | .gif-container img { 178 | width: 200px; 179 | height: auto; 180 | border-radius: 4px; 181 | } 182 | 183 | .gif-hover-image { 184 | display: block; 185 | border-radius: 4px; 186 | width: 100%; /* 图片宽度适配 */ 187 | height: auto; /* 保持宽高比 */ 188 | cursor: pointer; 189 | transition: all 0.3s ease; 190 | } 191 | 192 | .gif-hover-image:hover { 193 | transform: scale(1.05); 194 | } -------------------------------------------------------------------------------- /docs/static/css/index.css: -------------------------------------------------------------------------------- 1 | body { 2 | font-family: 'Noto Sans', sans-serif; 3 | } 4 | 5 | 6 | .footer .icon-link { 7 | font-size: 25px; 8 | color: #000; 9 | } 10 | 11 | .link-block a { 12 | margin-top: 5px; 13 | margin-bottom: 5px; 14 | } 15 | 16 | .dnerf { 17 | font-variant: small-caps; 18 | } 19 | 20 | 21 | .teaser .hero-body { 22 | padding-top: 0; 23 | padding-bottom: 3rem; 24 | } 25 | 26 | .teaser { 27 | font-family: 'Google Sans', sans-serif; 28 | } 29 | 30 | 31 | .publication-title { 32 | } 33 | 34 | .publication-banner { 35 | max-height: parent; 36 | 37 | } 38 | 39 | .publication-banner video { 40 | position: relative; 41 | left: auto; 42 | top: auto; 43 | transform: none; 44 | object-fit: fit; 45 | } 46 | 47 | .publication-header .hero-body { 48 | } 49 | 50 | /* @font-face { 51 | font-family: 'jackbrush'; 52 | src: url('static/font_style/jackbrush-alj9a.ttf'); 53 | } */ 54 | .model-name { 55 | font-family: 'Google Sans', sans-serif; 56 | font-weight: bold; 57 | color: white; 58 | font: xx-large; 59 | } 60 | 61 | 62 | .publication-title { 63 | font-family: 'Google Sans', sans-serif; 64 | font-weight: bold; 65 | color: white; 66 | font: xx-large; 67 | } 68 | 69 | .publication-authors { 70 | font-family: 'Google Sans', sans-serif; 71 | } 72 | 73 | .publication-venue { 74 | color: #555; 75 | width: fit-content; 76 | font-weight: bold; 77 | } 78 | 79 | .publication-awards { 80 | color: #ff3860; 81 | width: fit-content; 82 | font-weight: bolder; 83 | } 84 | 85 | .publication-authors { 86 | } 87 | 88 | .publication-authors a { 89 | color: hsl(204, 86%, 78%) !important; 90 | } 91 | 92 | .publication-authors a:hover { 93 | text-decoration: underline; 94 | } 95 | 96 | .author-block { 97 | display: inline-block; 98 | font-size: larger; 99 | color: hsl(0, 0%, 88%) 100 | } 101 | 102 | .publication-banner img { 103 | } 104 | 105 | .publication-authors { 106 | /*color: #4286f4;*/ 107 | } 108 | 109 | .publication-video { 110 | position: relative; 111 | width: 
100%; 112 | height: auto; 113 | 114 | overflow: hidden; 115 | border-radius: 10px !important; 116 | } 117 | 118 | .publication-video iframe { 119 | position: absolute; 120 | top: 0; 121 | left: 0; 122 | width: 100%; 123 | height: 100%; 124 | } 125 | 126 | .publication-body img { 127 | } 128 | 129 | .results-carousel { 130 | overflow: hidden; 131 | } 132 | 133 | .results-carousel .item { 134 | margin: 5px; 135 | overflow: hidden; 136 | border: 1px solid #bbb; 137 | border-radius: 10px; 138 | padding: 0; 139 | font-size: 0; 140 | } 141 | 142 | .results-carousel video { 143 | margin: 0; 144 | } 145 | 146 | 147 | .interpolation-panel { 148 | background: #f5f5f5; 149 | border-radius: 10px; 150 | } 151 | 152 | .interpolation-panel .interpolation-image { 153 | width: 100%; 154 | border-radius: 5px; 155 | } 156 | 157 | .interpolation-video-column { 158 | } 159 | 160 | .interpolation-panel .slider { 161 | margin: 0 !important; 162 | } 163 | 164 | .interpolation-panel .slider { 165 | margin: 0 !important; 166 | } 167 | 168 | #interpolation-image-wrapper { 169 | width: 100%; 170 | } 171 | #interpolation-image-wrapper img { 172 | border-radius: 5px; 173 | } 174 | 175 | .video-compare-container { 176 | width: 63%; 177 | margin: 0 auto; 178 | position: relative; 179 | display: block; 180 | line-height: 0; 181 | } 182 | 183 | .video { 184 | width: 100%; 185 | height: auto; 186 | position: relative; 187 | top: 0; 188 | left: 0; 189 | } 190 | 191 | .videoMerge { 192 | position: relative; 193 | top: 0; 194 | left: 0; 195 | z-index: 10; 196 | width: 100%; 197 | display: block; 198 | margin: 0 auto; 199 | background-size: cover; 200 | } 201 | -------------------------------------------------------------------------------- /docs/static/font_style/jackbrush-alj9a.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/font_style/jackbrush-alj9a.ttf -------------------------------------------------------------------------------- /docs/static/images/background.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/background.jpg -------------------------------------------------------------------------------- /docs/static/images/huggingface_logo.svg: -------------------------------------------------------------------------------- 1 | 2 | 6 | 7 | 11 | 15 | 19 | 25 | 29 | 33 | 37 | 41 | 42 | -------------------------------------------------------------------------------- /docs/static/images/i23d/1-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/i23d/1-1.png -------------------------------------------------------------------------------- /docs/static/images/i23d/1-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/i23d/1-2.png -------------------------------------------------------------------------------- /docs/static/images/i23d/1-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/i23d/1-3.png 
-------------------------------------------------------------------------------- /docs/static/images/i23d/1-4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/i23d/1-4.png -------------------------------------------------------------------------------- /docs/static/images/i23d/1-5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/i23d/1-5.png -------------------------------------------------------------------------------- /docs/static/images/i23d/1-6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/i23d/1-6.png -------------------------------------------------------------------------------- /docs/static/images/i23d/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/i23d/1.png -------------------------------------------------------------------------------- /docs/static/images/i23d/2-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/i23d/2-1.png -------------------------------------------------------------------------------- /docs/static/images/i23d/2-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/i23d/2-2.png -------------------------------------------------------------------------------- /docs/static/images/i23d/2-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/i23d/2-3.png -------------------------------------------------------------------------------- /docs/static/images/i23d/2-4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/i23d/2-4.png -------------------------------------------------------------------------------- /docs/static/images/i23d/2-5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/i23d/2-5.png -------------------------------------------------------------------------------- /docs/static/images/i23d/2-6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/i23d/2-6.png -------------------------------------------------------------------------------- /docs/static/images/i23d/2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/i23d/2.png 
-------------------------------------------------------------------------------- /docs/static/images/i23d/3-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/i23d/3-1.png -------------------------------------------------------------------------------- /docs/static/images/i23d/3-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/i23d/3-2.png -------------------------------------------------------------------------------- /docs/static/images/i23d/3-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/i23d/3-3.png -------------------------------------------------------------------------------- /docs/static/images/i23d/3-4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/i23d/3-4.png -------------------------------------------------------------------------------- /docs/static/images/i23d/3-5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/i23d/3-5.png -------------------------------------------------------------------------------- /docs/static/images/i23d/3-6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/i23d/3-6.png -------------------------------------------------------------------------------- /docs/static/images/i23d/3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/i23d/3.png -------------------------------------------------------------------------------- /docs/static/images/i23d/4-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/i23d/4-1.png -------------------------------------------------------------------------------- /docs/static/images/i23d/4-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/i23d/4-2.png -------------------------------------------------------------------------------- /docs/static/images/i23d/4-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/i23d/4-3.png -------------------------------------------------------------------------------- /docs/static/images/i23d/4-4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/i23d/4-4.png 
-------------------------------------------------------------------------------- /docs/static/images/i23d/4-5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/i23d/4-5.png -------------------------------------------------------------------------------- /docs/static/images/i23d/4-6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/i23d/4-6.png -------------------------------------------------------------------------------- /docs/static/images/i23d/4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/i23d/4.png -------------------------------------------------------------------------------- /docs/static/images/i2v/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/i2v/.DS_Store -------------------------------------------------------------------------------- /docs/static/images/i2v/boat/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/i2v/boat/.DS_Store -------------------------------------------------------------------------------- /docs/static/images/i2v/boat/boat.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/i2v/boat/boat.gif -------------------------------------------------------------------------------- /docs/static/images/i2v/boat/boat.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/i2v/boat/boat.png -------------------------------------------------------------------------------- /docs/static/images/i2v/cake/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/i2v/cake/.DS_Store -------------------------------------------------------------------------------- /docs/static/images/i2v/cake/cake.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/i2v/cake/cake.gif -------------------------------------------------------------------------------- /docs/static/images/i2v/cake/cake.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/i2v/cake/cake.png -------------------------------------------------------------------------------- /docs/static/images/i2v/cloud/cloud.gif: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/i2v/cloud/cloud.gif -------------------------------------------------------------------------------- /docs/static/images/i2v/cloud/cloud.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/i2v/cloud/cloud.png -------------------------------------------------------------------------------- /docs/static/images/i2v/fire/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/i2v/fire/.DS_Store -------------------------------------------------------------------------------- /docs/static/images/i2v/fire/fire.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/i2v/fire/fire.gif -------------------------------------------------------------------------------- /docs/static/images/i2v/fire/fire.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/i2v/fire/fire.png -------------------------------------------------------------------------------- /docs/static/images/i2v/meteor/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/i2v/meteor/.DS_Store -------------------------------------------------------------------------------- /docs/static/images/i2v/meteor/meteor.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/i2v/meteor/meteor.gif -------------------------------------------------------------------------------- /docs/static/images/i2v/meteor/meteor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/i2v/meteor/meteor.png -------------------------------------------------------------------------------- /docs/static/images/i2v/waterfall/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/i2v/waterfall/.DS_Store -------------------------------------------------------------------------------- /docs/static/images/i2v/waterfall/waterfall.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/i2v/waterfall/waterfall.gif -------------------------------------------------------------------------------- /docs/static/images/i2v/waterfall/waterfall.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/i2v/waterfall/waterfall.png 
-------------------------------------------------------------------------------- /docs/static/images/icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/icon.png -------------------------------------------------------------------------------- /docs/static/images/interpolation/car/end.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/interpolation/car/end.png -------------------------------------------------------------------------------- /docs/static/images/interpolation/car/interpolation.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/interpolation/car/interpolation.gif -------------------------------------------------------------------------------- /docs/static/images/interpolation/car/interpolation.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/interpolation/car/interpolation.mp4 -------------------------------------------------------------------------------- /docs/static/images/interpolation/car/start.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/interpolation/car/start.png -------------------------------------------------------------------------------- /docs/static/images/interpolation/cat_tiger/end.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/interpolation/cat_tiger/end.png -------------------------------------------------------------------------------- /docs/static/images/interpolation/cat_tiger/interpolation.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/interpolation/cat_tiger/interpolation.gif -------------------------------------------------------------------------------- /docs/static/images/interpolation/cat_tiger/interpolation.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/interpolation/cat_tiger/interpolation.mp4 -------------------------------------------------------------------------------- /docs/static/images/interpolation/cat_tiger/start.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/interpolation/cat_tiger/start.png -------------------------------------------------------------------------------- /docs/static/images/logo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/logo.gif 
-------------------------------------------------------------------------------- /docs/static/images/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/logo.png -------------------------------------------------------------------------------- /docs/static/images/method/method.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/method/method.png -------------------------------------------------------------------------------- /docs/static/images/t2i/A baby rabbit wearing a tiny knitted hat, ultra-detailed, photorealistic.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/t2i/A baby rabbit wearing a tiny knitted hat, ultra-detailed, photorealistic.jpg -------------------------------------------------------------------------------- /docs/static/images/t2i/A bear with fur made of chocolate shavings, standing in a clearing filled with marshmallow mushrooms.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/t2i/A bear with fur made of chocolate shavings, standing in a clearing filled with marshmallow mushrooms.jpeg -------------------------------------------------------------------------------- /docs/static/images/t2i/A breathtaking view of the Swiss Alps during sunrise, with snow-capped peaks and lush green valleys, ultra-realistic, high detail.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/t2i/A breathtaking view of the Swiss Alps during sunrise, with snow-capped peaks and lush green valleys, ultra-realistic, high detail.jpg -------------------------------------------------------------------------------- /docs/static/images/t2i/A close-up of a sunlit butterfly resting on a flower in a garden.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/t2i/A close-up of a sunlit butterfly resting on a flower in a garden.jpeg -------------------------------------------------------------------------------- /docs/static/images/t2i/A close-up of a vibrant, fully bloomed red rose with dew drops on its petals.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/t2i/A close-up of a vibrant, fully bloomed red rose with dew drops on its petals.jpeg -------------------------------------------------------------------------------- /docs/static/images/t2i/A close-up photograph of a lion with its mane blowing in the wind against the savanna backdrop.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/t2i/A close-up photograph of a lion with its mane 
blowing in the wind against the savanna backdrop.jpeg -------------------------------------------------------------------------------- /docs/static/images/t2i/A curious dolphin leaping out of the water, creating splashes in the sunlight.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/t2i/A curious dolphin leaping out of the water, creating splashes in the sunlight.jpeg -------------------------------------------------------------------------------- /docs/static/images/t2i/A cyborg superhero with a robotic arm and high-tech gadgets, standing atop a skyscraper.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/t2i/A cyborg superhero with a robotic arm and high-tech gadgets, standing atop a skyscraper.jpg -------------------------------------------------------------------------------- /docs/static/images/t2i/A detailed close-up of a rusted vintage car abandoned in an overgrown field.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/t2i/A detailed close-up of a rusted vintage car abandoned in an overgrown field.jpg -------------------------------------------------------------------------------- /docs/static/images/t2i/A dragon made of molten chocolate, with scales that glisten like gold leaf and eyes of crystalline sugar.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/t2i/A dragon made of molten chocolate, with scales that glisten like gold leaf and eyes of crystalline sugar.jpeg -------------------------------------------------------------------------------- /docs/static/images/t2i/A dramatic mountain range during a thunderstorm, with dark clouds, lightning strikes, and rugged terrain.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/t2i/A dramatic mountain range during a thunderstorm, with dark clouds, lightning strikes, and rugged terrain.jpg -------------------------------------------------------------------------------- /docs/static/images/t2i/A gorilla wearing an advanced robotic suit with pulsating energy cores, standing on the edge of a futuristic skyline.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/t2i/A gorilla wearing an advanced robotic suit with pulsating energy cores, standing on the edge of a futuristic skyline.jpeg -------------------------------------------------------------------------------- /docs/static/images/t2i/A group of astronauts standing on the surface of Mars, with Earth visible in the distant sky.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/t2i/A group of astronauts standing on the surface of Mars, with Earth visible in the 
distant sky.png -------------------------------------------------------------------------------- /docs/static/images/t2i/A hippopotamus with a body of jelly-like translucent gelatin, lounging in a pool of liquid sherbet.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/t2i/A hippopotamus with a body of jelly-like translucent gelatin, lounging in a pool of liquid sherbet.jpeg -------------------------------------------------------------------------------- /docs/static/images/t2i/A lion made entirely of layered caramel and chocolate, with a mane composed of spun sugar flames.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/t2i/A lion made entirely of layered caramel and chocolate, with a mane composed of spun sugar flames.jpeg -------------------------------------------------------------------------------- /docs/static/images/t2i/A lion with a mane made of holographic flames, standing on a crystal platform in a neon-lit jungle.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/t2i/A lion with a mane made of holographic flames, standing on a crystal platform in a neon-lit jungle.jpeg -------------------------------------------------------------------------------- /docs/static/images/t2i/A magical forest with glowing plants, where a young anime girl with long hair discovers a hidden portal.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/t2i/A magical forest with glowing plants, where a young anime girl with long hair discovers a hidden portal.jpg -------------------------------------------------------------------------------- /docs/static/images/t2i/A majestic bald eagle soaring over a snowy mountain range.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/t2i/A majestic bald eagle soaring over a snowy mountain range.jpeg -------------------------------------------------------------------------------- /docs/static/images/t2i/A majestic mountain range under a starry sky, with swirling clouds and glowing moonlight, inspired by Van Gogh.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/t2i/A majestic mountain range under a starry sky, with swirling clouds and glowing moonlight, inspired by Van Gogh.jpeg -------------------------------------------------------------------------------- /docs/static/images/t2i/A peaceful forest in autumn, with golden leaves falling and a stream running through it, illuminated by soft sunlight.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/t2i/A peaceful forest in autumn, with golden leaves falling and a stream running through it, 
illuminated by soft sunlight.png -------------------------------------------------------------------------------- /docs/static/images/t2i/A peaceful mountain lake reflecting the surrounding pine trees and snowy peaks, photorealistic, tranquil.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/t2i/A peaceful mountain lake reflecting the surrounding pine trees and snowy peaks, photorealistic, tranquil.jpg -------------------------------------------------------------------------------- /docs/static/images/t2i/A phoenix-like bird with wings made of fiery red fruit leather and a beak of candied citrus peel.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/t2i/A phoenix-like bird with wings made of fiery red fruit leather and a beak of candied citrus peel.jpeg -------------------------------------------------------------------------------- /docs/static/images/t2i/A realistic photograph of a wolf howling at the moon in a snowy forest.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/t2i/A realistic photograph of a wolf howling at the moon in a snowy forest.jpeg -------------------------------------------------------------------------------- /docs/static/images/t2i/A rustic bedroom showcasing a round bed, earth-toned decor, and a cluttered, yet charming ambiance.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/t2i/A rustic bedroom showcasing a round bed, earth-toned decor, and a cluttered, yet charming ambiance.jpg -------------------------------------------------------------------------------- /docs/static/images/t2i/A stealthy ninja superhero in a dark alley, showcasing agility and advanced technology.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/t2i/A stealthy ninja superhero in a dark alley, showcasing agility and advanced technology.jpg -------------------------------------------------------------------------------- /docs/static/images/t2i/A wolf constructed from layers of dark chocolate and nougat, with glowing eyes made of candied cherries.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/t2i/A wolf constructed from layers of dark chocolate and nougat, with glowing eyes made of candied cherries.jpeg -------------------------------------------------------------------------------- /docs/static/images/t2i/An owl constructed from layers of caramel popcorn and hazelnut chocolate, perched on a pretzel branch.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/t2i/An owl constructed from layers of caramel popcorn and hazelnut chocolate, perched on a pretzel branch.jpeg 
-------------------------------------------------------------------------------- /docs/static/images/t2i/Bentley Bacalar driving on black tarmac road with trees in background, Sumi-e drawing, white background 8k.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/t2i/Bentley Bacalar driving on black tarmac road with trees in background, Sumi-e drawing, white background 8k.jpg -------------------------------------------------------------------------------- /docs/static/images/t2i/Documentary-style photography of a bustling marketplace in Marrakech, with spices and textiles.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/t2i/Documentary-style photography of a bustling marketplace in Marrakech, with spices and textiles.png -------------------------------------------------------------------------------- /docs/static/images/t2i/Kraken is listening to music with headphones.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/t2i/Kraken is listening to music with headphones.png -------------------------------------------------------------------------------- /docs/static/images/t2i/Post-Apocalyptic Wanderer, character design, style by kim jung gi, zabrocki, karlkka, jayison devadas, 8k.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/t2i/Post-Apocalyptic Wanderer, character design, style by kim jung gi, zabrocki, karlkka, jayison devadas, 8k.png -------------------------------------------------------------------------------- /docs/static/images/t2i/The picture shows a cute little tiger, wearing a blue hoodie and hat, sitting on a small cardboard boat on calm water.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/t2i/The picture shows a cute little tiger, wearing a blue hoodie and hat, sitting on a small cardboard boat on calm water.png -------------------------------------------------------------------------------- /docs/static/images/t2i/Two baby ducks swimming in a pond at sunset, highly detailed, hyper-realistic.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/t2i/Two baby ducks swimming in a pond at sunset, highly detailed, hyper-realistic.jpg -------------------------------------------------------------------------------- /docs/static/images/t2i/Two female rabbit adventurers dressed in a fancy velvet coats next to a Christmas tree, Christmas theme, on an antique opulent background , jean - baptiste monge , smooth, anthropomorphic photorealistic, photography, lifelike, high resolution, smooth.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/t2i/Two female rabbit 
adventurers dressed in a fancy velvet coats next to a Christmas tree, Christmas theme, on an antique opulent background , jean - baptiste monge , smooth, anthropomorphic photorealistic, photography, lifelike, high resolution, smooth.jpg -------------------------------------------------------------------------------- /docs/static/images/t2i/beautiful lady,freckles, big smile,blue eyes, short ginger hair, dark makeup, wearing a floral blue vest top, soft light,dark grey background.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/t2i/beautiful lady,freckles, big smile,blue eyes, short ginger hair, dark makeup, wearing a floral blue vest top, soft light,dark grey background.jpeg -------------------------------------------------------------------------------- /docs/static/images/t2i/cloud.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/t2i/cloud.jpg -------------------------------------------------------------------------------- /docs/static/images/t2i/golden sunset shines on the top of snow-capped mountains, with small villages at its foot and surrounding buildings.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ant-research/lumos/3076dfee3128613c631b57cc20b52e6bbb61dc9f/docs/static/images/t2i/golden sunset shines on the top of snow-capped mountains, with small villages at its foot and surrounding buildings.png -------------------------------------------------------------------------------- /docs/static/js/bulma-slider.js: -------------------------------------------------------------------------------- 1 | (function webpackUniversalModuleDefinition(root, factory) { 2 | if(typeof exports === 'object' && typeof module === 'object') 3 | module.exports = factory(); 4 | else if(typeof define === 'function' && define.amd) 5 | define([], factory); 6 | else if(typeof exports === 'object') 7 | exports["bulmaSlider"] = factory(); 8 | else 9 | root["bulmaSlider"] = factory(); 10 | })(typeof self !== 'undefined' ? 
self : this, function() { 11 | return /******/ (function(modules) { // webpackBootstrap 12 | /******/ // The module cache 13 | /******/ var installedModules = {}; 14 | /******/ 15 | /******/ // The require function 16 | /******/ function __webpack_require__(moduleId) { 17 | /******/ 18 | /******/ // Check if module is in cache 19 | /******/ if(installedModules[moduleId]) { 20 | /******/ return installedModules[moduleId].exports; 21 | /******/ } 22 | /******/ // Create a new module (and put it into the cache) 23 | /******/ var module = installedModules[moduleId] = { 24 | /******/ i: moduleId, 25 | /******/ l: false, 26 | /******/ exports: {} 27 | /******/ }; 28 | /******/ 29 | /******/ // Execute the module function 30 | /******/ modules[moduleId].call(module.exports, module, module.exports, __webpack_require__); 31 | /******/ 32 | /******/ // Flag the module as loaded 33 | /******/ module.l = true; 34 | /******/ 35 | /******/ // Return the exports of the module 36 | /******/ return module.exports; 37 | /******/ } 38 | /******/ 39 | /******/ 40 | /******/ // expose the modules object (__webpack_modules__) 41 | /******/ __webpack_require__.m = modules; 42 | /******/ 43 | /******/ // expose the module cache 44 | /******/ __webpack_require__.c = installedModules; 45 | /******/ 46 | /******/ // define getter function for harmony exports 47 | /******/ __webpack_require__.d = function(exports, name, getter) { 48 | /******/ if(!__webpack_require__.o(exports, name)) { 49 | /******/ Object.defineProperty(exports, name, { 50 | /******/ configurable: false, 51 | /******/ enumerable: true, 52 | /******/ get: getter 53 | /******/ }); 54 | /******/ } 55 | /******/ }; 56 | /******/ 57 | /******/ // getDefaultExport function for compatibility with non-harmony modules 58 | /******/ __webpack_require__.n = function(module) { 59 | /******/ var getter = module && module.__esModule ? 
60 | /******/ function getDefault() { return module['default']; } : 61 | /******/ function getModuleExports() { return module; }; 62 | /******/ __webpack_require__.d(getter, 'a', getter); 63 | /******/ return getter; 64 | /******/ }; 65 | /******/ 66 | /******/ // Object.prototype.hasOwnProperty.call 67 | /******/ __webpack_require__.o = function(object, property) { return Object.prototype.hasOwnProperty.call(object, property); }; 68 | /******/ 69 | /******/ // __webpack_public_path__ 70 | /******/ __webpack_require__.p = ""; 71 | /******/ 72 | /******/ // Load entry module and return exports 73 | /******/ return __webpack_require__(__webpack_require__.s = 0); 74 | /******/ }) 75 | /************************************************************************/ 76 | /******/ ([ 77 | /* 0 */ 78 | /***/ (function(module, __webpack_exports__, __webpack_require__) { 79 | 80 | "use strict"; 81 | Object.defineProperty(__webpack_exports__, "__esModule", { value: true }); 82 | /* harmony export (binding) */ __webpack_require__.d(__webpack_exports__, "isString", function() { return isString; }); 83 | /* harmony import */ var __WEBPACK_IMPORTED_MODULE_0__events__ = __webpack_require__(1); 84 | var _extends = Object.assign || function (target) { for (var i = 1; i < arguments.length; i++) { var source = arguments[i]; for (var key in source) { if (Object.prototype.hasOwnProperty.call(source, key)) { target[key] = source[key]; } } } return target; }; 85 | 86 | var _createClass = function () { function defineProperties(target, props) { for (var i = 0; i < props.length; i++) { var descriptor = props[i]; descriptor.enumerable = descriptor.enumerable || false; descriptor.configurable = true; if ("value" in descriptor) descriptor.writable = true; Object.defineProperty(target, descriptor.key, descriptor); } } return function (Constructor, protoProps, staticProps) { if (protoProps) defineProperties(Constructor.prototype, protoProps); if (staticProps) defineProperties(Constructor, staticProps); return Constructor; }; }(); 87 | 88 | var _typeof = typeof Symbol === "function" && typeof Symbol.iterator === "symbol" ? function (obj) { return typeof obj; } : function (obj) { return obj && typeof Symbol === "function" && obj.constructor === Symbol && obj !== Symbol.prototype ? "symbol" : typeof obj; }; 89 | 90 | function _classCallCheck(instance, Constructor) { if (!(instance instanceof Constructor)) { throw new TypeError("Cannot call a class as a function"); } } 91 | 92 | function _possibleConstructorReturn(self, call) { if (!self) { throw new ReferenceError("this hasn't been initialised - super() hasn't been called"); } return call && (typeof call === "object" || typeof call === "function") ? call : self; } 93 | 94 | function _inherits(subClass, superClass) { if (typeof superClass !== "function" && superClass !== null) { throw new TypeError("Super expression must either be null or a function, not " + typeof superClass); } subClass.prototype = Object.create(superClass && superClass.prototype, { constructor: { value: subClass, enumerable: false, writable: true, configurable: true } }); if (superClass) Object.setPrototypeOf ? Object.setPrototypeOf(subClass, superClass) : subClass.__proto__ = superClass; } 95 | 96 | 97 | 98 | var isString = function isString(unknown) { 99 | return typeof unknown === 'string' || !!unknown && (typeof unknown === 'undefined' ? 
'undefined' : _typeof(unknown)) === 'object' && Object.prototype.toString.call(unknown) === '[object String]'; 100 | }; 101 | 102 | var bulmaSlider = function (_EventEmitter) { 103 | _inherits(bulmaSlider, _EventEmitter); 104 | 105 | function bulmaSlider(selector) { 106 | var options = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : {}; 107 | 108 | _classCallCheck(this, bulmaSlider); 109 | 110 | var _this = _possibleConstructorReturn(this, (bulmaSlider.__proto__ || Object.getPrototypeOf(bulmaSlider)).call(this)); 111 | 112 | _this.element = typeof selector === 'string' ? document.querySelector(selector) : selector; 113 | // An invalid selector or non-DOM node has been provided. 114 | if (!_this.element) { 115 | throw new Error('An invalid selector or non-DOM node has been provided.'); 116 | } 117 | 118 | _this._clickEvents = ['click']; 119 | /// Set default options and merge with instance defined 120 | _this.options = _extends({}, options); 121 | 122 | _this.onSliderInput = _this.onSliderInput.bind(_this); 123 | 124 | _this.init(); 125 | return _this; 126 | } 127 | 128 | /** 129 | * Initiate all DOM element containing selector 130 | * @method 131 | * @return {Array} Array of all slider instances 132 | */ 133 | 134 | 135 | _createClass(bulmaSlider, [{ 136 | key: 'init', 137 | 138 | 139 | /** 140 | * Initiate plugin 141 | * @method init 142 | * @return {void} 143 | */ 144 | value: function init() { 145 | this._id = 'bulmaSlider' + new Date().getTime() + Math.floor(Math.random() * Math.floor(9999)); 146 | this.output = this._findOutputForSlider(); 147 | 148 | this._bindEvents(); 149 | 150 | if (this.output) { 151 | if (this.element.classList.contains('has-output-tooltip')) { 152 | // Get new output position 153 | var newPosition = this._getSliderOutputPosition(); 154 | 155 | // Set output position 156 | this.output.style['left'] = newPosition.position; 157 | } 158 | } 159 | 160 | this.emit('bulmaslider:ready', this.element.value); 161 | } 162 | }, { 163 | key: '_findOutputForSlider', 164 | value: function _findOutputForSlider() { 165 | var _this2 = this; 166 | 167 | var result = null; 168 | var outputs = document.getElementsByTagName('output') || []; 169 | 170 | Array.from(outputs).forEach(function (output) { 171 | if (output.htmlFor == _this2.element.getAttribute('id')) { 172 | result = output; 173 | return true; 174 | } 175 | }); 176 | return result; 177 | } 178 | }, { 179 | key: '_getSliderOutputPosition', 180 | value: function _getSliderOutputPosition() { 181 | // Update output position 182 | var newPlace, minValue; 183 | 184 | var style = window.getComputedStyle(this.element, null); 185 | // Measure width of range input 186 | var sliderWidth = parseInt(style.getPropertyValue('width'), 10); 187 | 188 | // Figure out placement percentage between left and right of input 189 | if (!this.element.getAttribute('min')) { 190 | minValue = 0; 191 | } else { 192 | minValue = this.element.getAttribute('min'); 193 | } 194 | var newPoint = (this.element.value - minValue) / (this.element.getAttribute('max') - minValue); 195 | 196 | // Prevent bubble from going beyond left or right (unsupported browsers) 197 | if (newPoint < 0) { 198 | newPlace = 0; 199 | } else if (newPoint > 1) { 200 | newPlace = sliderWidth; 201 | } else { 202 | newPlace = sliderWidth * newPoint; 203 | } 204 | 205 | return { 206 | 'position': newPlace + 'px' 207 | }; 208 | } 209 | 210 | /** 211 | * Bind all events 212 | * @method _bindEvents 213 | * @return {void} 214 | */ 215 | 216 | }, { 217 | key: 
'_bindEvents', 218 | value: function _bindEvents() { 219 | if (this.output) { 220 | // Add event listener to update output when slider value change 221 | this.element.addEventListener('input', this.onSliderInput, false); 222 | } 223 | } 224 | }, { 225 | key: 'onSliderInput', 226 | value: function onSliderInput(e) { 227 | e.preventDefault(); 228 | 229 | if (this.element.classList.contains('has-output-tooltip')) { 230 | // Get new output position 231 | var newPosition = this._getSliderOutputPosition(); 232 | 233 | // Set output position 234 | this.output.style['left'] = newPosition.position; 235 | } 236 | 237 | // Check for prefix and postfix 238 | var prefix = this.output.hasAttribute('data-prefix') ? this.output.getAttribute('data-prefix') : ''; 239 | var postfix = this.output.hasAttribute('data-postfix') ? this.output.getAttribute('data-postfix') : ''; 240 | 241 | // Update output with slider value 242 | this.output.value = prefix + this.element.value + postfix; 243 | 244 | this.emit('bulmaslider:ready', this.element.value); 245 | } 246 | }], [{ 247 | key: 'attach', 248 | value: function attach() { 249 | var _this3 = this; 250 | 251 | var selector = arguments.length > 0 && arguments[0] !== undefined ? arguments[0] : 'input[type="range"].slider'; 252 | var options = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : {}; 253 | 254 | var instances = new Array(); 255 | 256 | var elements = isString(selector) ? document.querySelectorAll(selector) : Array.isArray(selector) ? selector : [selector]; 257 | elements.forEach(function (element) { 258 | if (typeof element[_this3.constructor.name] === 'undefined') { 259 | var instance = new bulmaSlider(element, options); 260 | element[_this3.constructor.name] = instance; 261 | instances.push(instance); 262 | } else { 263 | instances.push(element[_this3.constructor.name]); 264 | } 265 | }); 266 | 267 | return instances; 268 | } 269 | }]); 270 | 271 | return bulmaSlider; 272 | }(__WEBPACK_IMPORTED_MODULE_0__events__["a" /* default */]); 273 | 274 | /* harmony default export */ __webpack_exports__["default"] = (bulmaSlider); 275 | 276 | /***/ }), 277 | /* 1 */ 278 | /***/ (function(module, __webpack_exports__, __webpack_require__) { 279 | 280 | "use strict"; 281 | var _createClass = function () { function defineProperties(target, props) { for (var i = 0; i < props.length; i++) { var descriptor = props[i]; descriptor.enumerable = descriptor.enumerable || false; descriptor.configurable = true; if ("value" in descriptor) descriptor.writable = true; Object.defineProperty(target, descriptor.key, descriptor); } } return function (Constructor, protoProps, staticProps) { if (protoProps) defineProperties(Constructor.prototype, protoProps); if (staticProps) defineProperties(Constructor, staticProps); return Constructor; }; }(); 282 | 283 | function _classCallCheck(instance, Constructor) { if (!(instance instanceof Constructor)) { throw new TypeError("Cannot call a class as a function"); } } 284 | 285 | var EventEmitter = function () { 286 | function EventEmitter() { 287 | var listeners = arguments.length > 0 && arguments[0] !== undefined ? 
arguments[0] : []; 288 | 289 | _classCallCheck(this, EventEmitter); 290 | 291 | this._listeners = new Map(listeners); 292 | this._middlewares = new Map(); 293 | } 294 | 295 | _createClass(EventEmitter, [{ 296 | key: "listenerCount", 297 | value: function listenerCount(eventName) { 298 | if (!this._listeners.has(eventName)) { 299 | return 0; 300 | } 301 | 302 | var eventListeners = this._listeners.get(eventName); 303 | return eventListeners.length; 304 | } 305 | }, { 306 | key: "removeListeners", 307 | value: function removeListeners() { 308 | var _this = this; 309 | 310 | var eventName = arguments.length > 0 && arguments[0] !== undefined ? arguments[0] : null; 311 | var middleware = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : false; 312 | 313 | if (eventName !== null) { 314 | if (Array.isArray(eventName)) { 315 | name.forEach(function (e) { 316 | return _this.removeListeners(e, middleware); 317 | }); 318 | } else { 319 | this._listeners.delete(eventName); 320 | 321 | if (middleware) { 322 | this.removeMiddleware(eventName); 323 | } 324 | } 325 | } else { 326 | this._listeners = new Map(); 327 | } 328 | } 329 | }, { 330 | key: "middleware", 331 | value: function middleware(eventName, fn) { 332 | var _this2 = this; 333 | 334 | if (Array.isArray(eventName)) { 335 | name.forEach(function (e) { 336 | return _this2.middleware(e, fn); 337 | }); 338 | } else { 339 | if (!Array.isArray(this._middlewares.get(eventName))) { 340 | this._middlewares.set(eventName, []); 341 | } 342 | 343 | this._middlewares.get(eventName).push(fn); 344 | } 345 | } 346 | }, { 347 | key: "removeMiddleware", 348 | value: function removeMiddleware() { 349 | var _this3 = this; 350 | 351 | var eventName = arguments.length > 0 && arguments[0] !== undefined ? arguments[0] : null; 352 | 353 | if (eventName !== null) { 354 | if (Array.isArray(eventName)) { 355 | name.forEach(function (e) { 356 | return _this3.removeMiddleware(e); 357 | }); 358 | } else { 359 | this._middlewares.delete(eventName); 360 | } 361 | } else { 362 | this._middlewares = new Map(); 363 | } 364 | } 365 | }, { 366 | key: "on", 367 | value: function on(name, callback) { 368 | var _this4 = this; 369 | 370 | var once = arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : false; 371 | 372 | if (Array.isArray(name)) { 373 | name.forEach(function (e) { 374 | return _this4.on(e, callback); 375 | }); 376 | } else { 377 | name = name.toString(); 378 | var split = name.split(/,|, | /); 379 | 380 | if (split.length > 1) { 381 | split.forEach(function (e) { 382 | return _this4.on(e, callback); 383 | }); 384 | } else { 385 | if (!Array.isArray(this._listeners.get(name))) { 386 | this._listeners.set(name, []); 387 | } 388 | 389 | this._listeners.get(name).push({ once: once, callback: callback }); 390 | } 391 | } 392 | } 393 | }, { 394 | key: "once", 395 | value: function once(name, callback) { 396 | this.on(name, callback, true); 397 | } 398 | }, { 399 | key: "emit", 400 | value: function emit(name, data) { 401 | var _this5 = this; 402 | 403 | var silent = arguments.length > 2 && arguments[2] !== undefined ? 
arguments[2] : false; 404 | 405 | name = name.toString(); 406 | var listeners = this._listeners.get(name); 407 | var middlewares = null; 408 | var doneCount = 0; 409 | var execute = silent; 410 | 411 | if (Array.isArray(listeners)) { 412 | listeners.forEach(function (listener, index) { 413 | // Start Middleware checks unless we're doing a silent emit 414 | if (!silent) { 415 | middlewares = _this5._middlewares.get(name); 416 | // Check and execute Middleware 417 | if (Array.isArray(middlewares)) { 418 | middlewares.forEach(function (middleware) { 419 | middleware(data, function () { 420 | var newData = arguments.length > 0 && arguments[0] !== undefined ? arguments[0] : null; 421 | 422 | if (newData !== null) { 423 | data = newData; 424 | } 425 | doneCount++; 426 | }, name); 427 | }); 428 | 429 | if (doneCount >= middlewares.length) { 430 | execute = true; 431 | } 432 | } else { 433 | execute = true; 434 | } 435 | } 436 | 437 | // If Middleware checks have been passed, execute 438 | if (execute) { 439 | if (listener.once) { 440 | listeners[index] = null; 441 | } 442 | listener.callback(data); 443 | } 444 | }); 445 | 446 | // Dirty way of removing used Events 447 | while (listeners.indexOf(null) !== -1) { 448 | listeners.splice(listeners.indexOf(null), 1); 449 | } 450 | } 451 | } 452 | }]); 453 | 454 | return EventEmitter; 455 | }(); 456 | 457 | /* harmony default export */ __webpack_exports__["a"] = (EventEmitter); 458 | 459 | /***/ }) 460 | /******/ ])["default"]; 461 | }); -------------------------------------------------------------------------------- /docs/static/js/bulma-slider.min.js: -------------------------------------------------------------------------------- 1 | !function(t,e){"object"==typeof exports&&"object"==typeof module?module.exports=e():"function"==typeof define&&define.amd?define([],e):"object"==typeof exports?exports.bulmaSlider=e():t.bulmaSlider=e()}("undefined"!=typeof self?self:this,function(){return function(n){var r={};function i(t){if(r[t])return r[t].exports;var e=r[t]={i:t,l:!1,exports:{}};return n[t].call(e.exports,e,e.exports,i),e.l=!0,e.exports}return i.m=n,i.c=r,i.d=function(t,e,n){i.o(t,e)||Object.defineProperty(t,e,{configurable:!1,enumerable:!0,get:n})},i.n=function(t){var e=t&&t.__esModule?function(){return t.default}:function(){return t};return i.d(e,"a",e),e},i.o=function(t,e){return Object.prototype.hasOwnProperty.call(t,e)},i.p="",i(i.s=0)}([function(t,e,n){"use strict";Object.defineProperty(e,"__esModule",{value:!0}),n.d(e,"isString",function(){return l});var r=n(1),i=Object.assign||function(t){for(var e=1;e=l.length&&(s=!0)):s=!0),s&&(t.once&&(u[e]=null),t.callback(r))});-1!==u.indexOf(null);)u.splice(u.indexOf(null),1)}}]),e}();e.a=i}]).default}); -------------------------------------------------------------------------------- /docs/static/js/choose_image.js: -------------------------------------------------------------------------------- 1 | 2 | const thumbnails = document.querySelectorAll('.image-list img'); 3 | const selectedImage = document.querySelector('.selected-image img'); 4 | const NewViewImages = document.querySelectorAll('.new-view-container img'); 5 | 6 | const NewViewSets = [ 7 | [ 8 | 'static/images/i23d/1-1.png', 'static/images/i23d/1-2.png', 'static/images/i23d/1-3.png', 9 | 'static/images/i23d/1-4.png', 'static/images/i23d/1-5.png', 'static/images/i23d/1-6.png', 10 | ], 11 | [ 12 | 'static/images/i23d/2-1.png', 'static/images/i23d/2-2.png', 'static/images/i23d/2-3.png', 13 | 'static/images/i23d/2-4.png', 
'static/images/i23d/2-5.png', 'static/images/i23d/2-6.png', 14 | ], 15 | [ 16 | 'static/images/i23d/3-1.png', 'static/images/i23d/3-2.png', 'static/images/i23d/3-3.png', 17 | 'static/images/i23d/3-4.png', 'static/images/i23d/3-5.png', 'static/images/i23d/3-6.png', 18 | ], 19 | [ 20 | 'static/images/i23d/4-1.png', 'static/images/i23d/4-2.png', 'static/images/i23d/4-3.png', 21 | 'static/images/i23d/4-4.png', 'static/images/i23d/4-5.png', 'static/images/i23d/4-6.png', 22 | ] 23 | ]; 24 | 25 | 26 | thumbnails.forEach(thumbnail => { 27 | thumbnail.addEventListener('click', () => { 28 | const largeSrc = thumbnail.getAttribute('data-large'); 29 | 30 | selectedImage.src = largeSrc; 31 | 32 | thumbnails.forEach(img => img.classList.remove('selected')); 33 | 34 | thumbnail.classList.add('selected'); 35 | 36 | if (largeSrc=="static/images/i23d/1.png"){ 37 | NewViewImages.forEach((img, index)=>{ 38 | img.src = NewViewSets[0][index] 39 | }); 40 | } else if (largeSrc=="static/images/i23d/2.png"){ 41 | NewViewImages.forEach((img, index)=>{ 42 | img.src = NewViewSets[1][index] 43 | }); 44 | } else if (largeSrc=="static/images/i23d/3.png"){ 45 | const new_view_images = NewViewSets[2]; 46 | NewViewImages.forEach((img, index)=>{ 47 | img.src = new_view_images[index] 48 | }); 49 | } else if (largeSrc=="static/images/i23d/4.png"){ 50 | const new_view_images = NewViewSets[3]; 51 | NewViewImages.forEach((img, index)=>{ 52 | img.src = new_view_images[index] 53 | }); 54 | } 55 | 56 | }); 57 | }); 58 | -------------------------------------------------------------------------------- /docs/static/js/image2gif.js: -------------------------------------------------------------------------------- 1 | document.querySelectorAll('.gif-hover-image').forEach(img => { 2 | const originalSrc = img.src; // save the original static image path 3 | const gifSrc = img.dataset.gif; // get the path from the data-gif attribute 4 | 5 | // mouse hover event 6 | img.addEventListener('mouseover', () => { 7 | img.src = gifSrc; // switch to the corresponding GIF 8 | }); 9 | 10 | // mouse out event 11 | img.addEventListener('mouseout', () => { 12 | img.src = originalSrc; // restore the static image 13 | }); 14 | }); 15 | -------------------------------------------------------------------------------- /docs/static/js/index.js: -------------------------------------------------------------------------------- 1 | window.HELP_IMPROVE_VIDEOJS = false; 2 | 3 | var INTERP_BASE = "./static/interpolation/stacked"; 4 | var NUM_INTERP_FRAMES = 240; 5 | 6 | var interp_images = []; 7 | function preloadInterpolationImages() { 8 | for (var i = 0; i < NUM_INTERP_FRAMES; i++) { 9 | var path = INTERP_BASE + '/' + String(i).padStart(6, '0') + '.jpg'; 10 | interp_images[i] = new Image(); 11 | interp_images[i].src = path; 12 | } 13 | } 14 | 15 | function setInterpolationImage(i) { 16 | var image = interp_images[i]; 17 | image.ondragstart = function() { return false; }; 18 | image.oncontextmenu = function() { return false; }; 19 | $('#interpolation-image-wrapper').empty().append(image); 20 | } 21 | 22 | 23 | $(document).ready(function() { 24 | // Check for click events on the navbar burger icon 25 | $(".navbar-burger").click(function() { 26 | // Toggle the "is-active" class on both the "navbar-burger" and the "navbar-menu" 27 | $(".navbar-burger").toggleClass("is-active"); 28 | $(".navbar-menu").toggleClass("is-active"); 29 | 30 | }); 31 | 32 | var options = { 33 | slidesToScroll: 1, 34 | slidesToShow: 3, 35 | loop: true, 36 | infinite: true, 37 | autoplay: false, 38 | autoplaySpeed: 3000, 39 | } 40 | 41 | // Initialize all div with carousel class 42 | var carousels = 
bulmaCarousel.attach('.carousel', options); 43 | 44 | // Loop on each carousel initialized 45 | for(var i = 0; i < carousels.length; i++) { 46 | // Add listener to event 47 | carousels[i].on('before:show', state => { 48 | console.log(state); 49 | }); 50 | } 51 | 52 | // Access to bulmaCarousel instance of an element 53 | var element = document.querySelector('#my-element'); 54 | if (element && element.bulmaCarousel) { 55 | // bulmaCarousel instance is available as element.bulmaCarousel 56 | element.bulmaCarousel.on('before-show', function(state) { 57 | console.log(state); 58 | }); 59 | } 60 | 61 | /*var player = document.getElementById('interpolation-video'); 62 | player.addEventListener('loadedmetadata', function() { 63 | $('#interpolation-slider').on('input', function(event) { 64 | console.log(this.value, player.duration); 65 | player.currentTime = player.duration / 100 * this.value; 66 | }) 67 | }, false);*/ 68 | preloadInterpolationImages(); 69 | 70 | $('#interpolation-slider').on('input', function(event) { 71 | setInterpolationImage(this.value); 72 | }); 73 | setInterpolationImage(0); 74 | $('#interpolation-slider').prop('max', NUM_INTERP_FRAMES - 1); 75 | 76 | bulmaSlider.attach(); 77 | 78 | }) 79 | -------------------------------------------------------------------------------- /docs/static/js/scroll.js: -------------------------------------------------------------------------------- 1 | const scrollContainer = document.querySelector('.scroll-container'); 2 | let isUserInteracting = false; 3 | let autoScrollInterval; 4 | let scrollSpeed = 0.8; 5 | 6 | function autoScroll() { 7 | if (!isUserInteracting) { 8 | scrollContainer.scrollLeft += scrollSpeed; 9 | if (scrollContainer.scrollLeft >= scrollContainer.scrollWidth - scrollContainer.offsetWidth) { 10 | scrollContainer.scrollLeft = 0; 11 | } 12 | } 13 | } 14 | 15 | scrollContainer.addEventListener('mousedown', () => { 16 | isUserInteracting = true; 17 | }); 18 | 19 | scrollContainer.addEventListener('mouseup', () => { 20 | isUserInteracting = false; 21 | }); 22 | 23 | function startAutoScroll() { 24 | autoScrollInterval = setInterval(autoScroll, 20); 25 | } 26 | 27 | 28 | function stopAutoScroll() { 29 | clearInterval(autoScrollInterval); 30 | } 31 | 32 | 33 | scrollContainer.addEventListener('mouseover', stopAutoScroll); 34 | 35 | scrollContainer.addEventListener('mouseout', startAutoScroll); 36 | 37 | 38 | // initialize 39 | startAutoScroll(); -------------------------------------------------------------------------------- /docs/static/js/video_comparison.js: -------------------------------------------------------------------------------- 1 | // This is based on: http://thenewcode.com/364/Interactive-Before-and-After-Video-Comparison-in-HTML5-Canvas 2 | // With additional modifications based on: https://jsfiddle.net/7sk5k4gp/13/ 3 | 4 | function playVids(videoId) { 5 | var videoMerge = document.getElementById(videoId + "Merge"); 6 | var vid = document.getElementById(videoId); 7 | 8 | var position = 0.5; 9 | var vidWidth = vid.videoWidth/2; 10 | var vidHeight = vid.videoHeight; 11 | 12 | var mergeContext = videoMerge.getContext("2d"); 13 | 14 | 15 | if (vid.readyState > 3) { 16 | vid.play(); 17 | 18 | function trackLocation(e) { 19 | // Normalize to [0, 1] 20 | bcr = videoMerge.getBoundingClientRect(); 21 | position = ((e.pageX - bcr.x) / bcr.width); 22 | } 23 | function trackLocationTouch(e) { 24 | // Normalize to [0, 1] 25 | bcr = videoMerge.getBoundingClientRect(); 26 | position = ((e.touches[0].pageX - bcr.x) / bcr.width); 27 | } 28 | 
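// How the before/after comparison works: each frame of the source video holds two renderings side by side,
// so vidWidth above is half the raw frame width. trackLocation / trackLocationTouch map the pointer's
// x-coordinate to `position` in [0, 1]; drawLoop (below) first paints the left half of the frame onto the
// canvas, then overdraws every column to the right of vidWidth * position with the matching columns taken
// from the right half of the frame, and finally renders the divider line and the drag handle at the pointer.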
29 | videoMerge.addEventListener("mousemove", trackLocation, false); 30 | videoMerge.addEventListener("touchstart", trackLocationTouch, false); 31 | videoMerge.addEventListener("touchmove", trackLocationTouch, false); 32 | 33 | 34 | function drawLoop() { 35 | mergeContext.drawImage(vid, 0, 0, vidWidth, vidHeight, 0, 0, vidWidth, vidHeight); 36 | var colStart = (vidWidth * position).clamp(0.0, vidWidth); 37 | var colWidth = (vidWidth - (vidWidth * position)).clamp(0.0, vidWidth); 38 | mergeContext.drawImage(vid, colStart+vidWidth, 0, colWidth, vidHeight, colStart, 0, colWidth, vidHeight); 39 | requestAnimationFrame(drawLoop); 40 | 41 | 42 | var arrowLength = 0.07 * vidHeight; 43 | var arrowheadWidth = 0.020 * vidHeight; 44 | var arrowheadLength = 0.04 * vidHeight; 45 | var arrowPosY = vidHeight / 10; 46 | var arrowWidth = 0.007 * vidHeight; 47 | var currX = vidWidth * position; 48 | 49 | // Draw circle 50 | mergeContext.arc(currX, arrowPosY, arrowLength*0.7, 0, Math.PI * 2, false); 51 | mergeContext.fillStyle = "#FFD79340"; 52 | mergeContext.fill() 53 | //mergeContext.strokeStyle = "#444444"; 54 | //mergeContext.stroke() 55 | 56 | // Draw border 57 | mergeContext.beginPath(); 58 | mergeContext.moveTo(vidWidth*position, 0); 59 | mergeContext.lineTo(vidWidth*position, vidHeight); 60 | mergeContext.closePath() 61 | mergeContext.strokeStyle = "#444444"; 62 | mergeContext.lineWidth = 3; 63 | mergeContext.stroke(); 64 | 65 | // Draw arrow 66 | mergeContext.beginPath(); 67 | mergeContext.moveTo(currX, arrowPosY - arrowWidth/2); 68 | 69 | // Move right until meeting arrow head 70 | mergeContext.lineTo(currX + arrowLength/2 - arrowheadLength/2, arrowPosY - arrowWidth/2); 71 | 72 | // Draw right arrow head 73 | mergeContext.lineTo(currX + arrowLength/2 - arrowheadLength/2, arrowPosY - arrowheadWidth/2); 74 | mergeContext.lineTo(currX + arrowLength/2, arrowPosY); 75 | mergeContext.lineTo(currX + arrowLength/2 - arrowheadLength/2, arrowPosY + arrowheadWidth/2); 76 | mergeContext.lineTo(currX + arrowLength/2 - arrowheadLength/2, arrowPosY + arrowWidth/2); 77 | 78 | // Go back to the left until meeting left arrow head 79 | mergeContext.lineTo(currX - arrowLength/2 + arrowheadLength/2, arrowPosY + arrowWidth/2); 80 | 81 | // Draw left arrow head 82 | mergeContext.lineTo(currX - arrowLength/2 + arrowheadLength/2, arrowPosY + arrowheadWidth/2); 83 | mergeContext.lineTo(currX - arrowLength/2, arrowPosY); 84 | mergeContext.lineTo(currX - arrowLength/2 + arrowheadLength/2, arrowPosY - arrowheadWidth/2); 85 | mergeContext.lineTo(currX - arrowLength/2 + arrowheadLength/2, arrowPosY); 86 | 87 | mergeContext.lineTo(currX - arrowLength/2 + arrowheadLength/2, arrowPosY - arrowWidth/2); 88 | mergeContext.lineTo(currX, arrowPosY - arrowWidth/2); 89 | 90 | mergeContext.closePath(); 91 | 92 | mergeContext.fillStyle = "#444444"; 93 | mergeContext.fill(); 94 | 95 | 96 | 97 | } 98 | requestAnimationFrame(drawLoop); 99 | } 100 | } 101 | 102 | Number.prototype.clamp = function(min, max) { 103 | return Math.min(Math.max(this, min), max); 104 | }; 105 | 106 | 107 | function resizeAndPlay(element) 108 | { 109 | var cv = document.getElementById(element.id + "Merge"); 110 | cv.width = element.videoWidth/2; 111 | cv.height = element.videoHeight; 112 | element.play(); 113 | element.style.height = "0px"; // Hide video without stopping it 114 | 115 | playVids(element.id); 116 | } 117 | -------------------------------------------------------------------------------- /gradio_demos/lumos_I2I.py: 
-------------------------------------------------------------------------------- 1 | import gradio as gr 2 | import simple_parsing 3 | import numpy as np 4 | import torch 5 | from torchvision.utils import make_grid 6 | import random 7 | import torch 8 | from diffusers.models import AutoencoderKL 9 | from lumos_diffusion import DPMS_INTER 10 | from utils.download import find_model 11 | import lumos_diffusion.model.dino.vision_transformer as vits 12 | import torchvision.transforms as T 13 | from lumos_diffusion.model.lumos import LumosI2I_XL_2 14 | from utils import find_model 15 | 16 | _TITLE = 'Lumos-I2I: Image Interpolation Generation' 17 | MAX_SEED = 2147483647 18 | def randomize_seed_fn(seed: int, randomize_seed: bool) -> int: 19 | if randomize_seed: 20 | seed = random.randint(0, MAX_SEED) 21 | return seed 22 | 23 | def dividable(n): 24 | for i in range(int(np.sqrt(n)), 0, -1): 25 | if n % i == 0: 26 | break 27 | return i, n // i 28 | 29 | 30 | def stop_run(): 31 | return ( 32 | gr.update(value="Run", variant="primary", visible=True), 33 | gr.update(visible=False), 34 | ) 35 | 36 | def generate( 37 | prompt_img1, 38 | prompt_img2, 39 | bsz, 40 | guidance_scale=4.5, 41 | num_inference_steps=20, 42 | seed=10, 43 | randomize_seed=True 44 | ): 45 | seed = int(randomize_seed_fn(seed, randomize_seed)) 46 | np.random.seed(seed) 47 | torch.random.manual_seed(seed) 48 | vae, dino, transform, model = models["vae"], models["vision_encoder"], models["transform"], models["diffusion"] 49 | prompt_img1 = transform(prompt_img1).unsqueeze(0) 50 | prompt_img2 = transform(prompt_img2).unsqueeze(0) 51 | prompt_imgs = torch.cat([prompt_img1, prompt_img2], dim=0) 52 | with torch.no_grad(): 53 | caption_embs = dino(prompt_imgs.to(device)) 54 | caption_embs = torch.nn.functional.normalize(caption_embs, dim=-1).unsqueeze(1).unsqueeze(1) 55 | caption_emb1 = caption_embs[0] 56 | caption_emb2 = caption_embs[-1] 57 | weights = np.arange(0, 1, 1/bsz).tolist() 58 | caption_embs = [caption_emb2 * wei + caption_emb1 * (1-wei) for wei in weights] 59 | caption_embs = torch.stack(caption_embs).to(device) 60 | bsz = caption_embs.shape[0] 61 | null_y = model.y_embedder.y_embedding[None].repeat(bsz, 1, 1)[:, None] 62 | z = torch.randn(1, 4, 32, 32, device=device).repeat(bsz, 1, 1, 1) 63 | model_kwargs = dict(mask=None) 64 | dpm_solver = DPMS_INTER(model.forward_with_dpmsolver, 65 | condition=caption_embs, 66 | uncondition=null_y, 67 | cfg_scale=guidance_scale, 68 | model_kwargs=model_kwargs) 69 | output = dpm_solver.sample( 70 | z, 71 | steps=num_inference_steps, 72 | order=2, 73 | skip_type="time_uniform", 74 | method="multistep") 75 | output = vae.decode(output / 0.18215).sample 76 | output = torch.clamp(output * 0.5 + 0.5, min=0, max=1).cpu() 77 | output = ( 78 | make_grid(output, nrow=output.shape[0] // 3, padding=3, pad_value=1).permute(1, 2, 0).numpy() * 255 79 | ).astype(np.uint8) 80 | step = num_inference_steps 81 | yield output, seed, gr.update( 82 | value="Run", 83 | variant="primary", 84 | visible=(step == num_inference_steps), 85 | ), gr.update( 86 | value="Stop", variant="stop", visible=(step != num_inference_steps) 87 | ) 88 | 89 | 90 | def demo(args): 91 | css = """ 92 | #col-container { 93 | margin: 0 auto; 94 | max-width: 640px; 95 | } 96 | """ 97 | demo = gr.Blocks(css=css) 98 | with demo: 99 | with gr.Column(elem_id="col-container"): 100 | gr.Markdown('# ' + _TITLE) 101 | gr.Markdown("You can get various visual effects by adjusting the hyper-parameters in Advanced settings.") 102 | pid = gr.State() 103 | 
with gr.Row(equal_height=True): 104 | prompt_image1 = gr.Image(type="pil", label="Input Image 1") 105 | prompt_image2 = gr.Image(type="pil", label="Input Image 2") 106 | with gr.Row(equal_height=True): 107 | num_generation = gr.Slider( 108 | value=12, 109 | minimum=1, 110 | maximum=100, 111 | step=2, 112 | label="Generation Num", 113 | ) 114 | run_btn = gr.Button(value="Run", variant="primary", scale=1) 115 | stop_btn = gr.Button(value="Stop", variant="stop", visible=False) 116 | with gr.Row(equal_height=False): 117 | output_image = gr.Image(value=None, label="Output image") 118 | with gr.Accordion( 119 | "Advanced settings", open=False, elem_id="config-accordion" 120 | ): 121 | with gr.Row(equal_height=False): 122 | num_inference_steps = gr.Slider( 123 | value=20, 124 | minimum=1, 125 | maximum=2000, 126 | step=1, 127 | label="# of steps", 128 | ) 129 | guidance_scale = gr.Slider( 130 | value=4.5, 131 | minimum=0.0, 132 | maximum=50, 133 | step=0.1, 134 | label="Guidance scale", 135 | ) 136 | randomize_seed = gr.Checkbox(label="Randomize seed", value=False) 137 | seed = gr.Slider( 138 | value=137, 139 | minimum=0, 140 | maximum=MAX_SEED, 141 | step=1, 142 | label="Random seed", 143 | ) 144 | 145 | run_event = run_btn.click( 146 | fn=generate, 147 | inputs=[ 148 | prompt_image1, 149 | prompt_image2, 150 | num_generation, 151 | guidance_scale, 152 | num_inference_steps, 153 | seed, 154 | randomize_seed 155 | ], 156 | outputs=[ 157 | output_image, 158 | seed, 159 | run_btn, 160 | stop_btn, 161 | ], 162 | ) 163 | 164 | stop_btn.click( 165 | fn=stop_run, 166 | outputs=[run_btn, stop_btn], 167 | cancels=[run_event], 168 | queue=False, 169 | ) 170 | with gr.Row(equal_height=False): 171 | example_images_1 = ["asset/images/car/image_start.png", "asset/images/cat/image_start.JPG", "asset/images/folwer/image_start.png"] 172 | example_images_2 = ["asset/images/car/image_end.png", "asset/images/cat/image_end.JPG", "asset/images/folwer/image_end.png"] 173 | example = gr.Examples( 174 | examples=[[t[0].strip(), t[-1].strip()] for t in zip(example_images_1, example_images_2)], 175 | inputs=[prompt_image1, prompt_image2], 176 | ) 177 | 178 | launch_args = {"server_port": int(args.port), "server_name": "0.0.0.0"} 179 | demo.queue(default_concurrency_limit=1).launch(**launch_args) 180 | 181 | 182 | if __name__ == "__main__": 183 | parser = simple_parsing.ArgumentParser(description="Lumos Image Interpolation Generation Demo") 184 | parser.add_argument("--vae-pretrained", type=str, default="stabilityai/sd-vae-ft-mse") 185 | parser.add_argument("--dino-type", type=str, default="vit_base") 186 | parser.add_argument("--dino-pretrained", type=str, default="./checkpoints/dino_vitbase16_pretrain.pth") 187 | parser.add_argument("--lumos-i2i-ckpt", type=str, default="./checkpoints/Lumos_I2I.pth") 188 | parser.add_argument("--port", type=int, default=19231) 189 | args = parser.parse_known_args()[0] 190 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 191 | # setting models 192 | models = dict() 193 | ## autoencoder 194 | weight_dtype = torch.float32 195 | vae = AutoencoderKL.from_pretrained(args.vae_pretrained).cuda() 196 | vae.eval() 197 | vae.to(weight_dtype) 198 | models["vae"] = vae 199 | ## vision encoder 200 | dino = vits.__dict__[args.dino_type](patch_size=16, num_classes=0).cuda() 201 | state_dict = torch.load(args.dino_pretrained, map_location="cpu") 202 | # remove `module.` prefix 203 | state_dict = {k.replace("module.", ""): v for k, v in state_dict.items()} 204 | # remove `backbone.` 
prefix induced by multicrop wrapper 205 | state_dict = {k.replace("backbone.", ""): v for k, v in state_dict.items()} 206 | msg = dino.load_state_dict(state_dict, strict=False) 207 | del state_dict 208 | dino.eval() 209 | models["vision_encoder"] = dino 210 | ## transform for vision encoder 211 | transform = [ 212 | T.Lambda(lambda img: img.convert('RGB')), 213 | T.Resize(224), # Image.BICUBIC 214 | T.CenterCrop(224), 215 | T.ToTensor(), 216 | T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]), 217 | ] 218 | 219 | transform = T.Compose(transform) 220 | models["transform"] = transform 221 | ## diffusion model 222 | model_kwargs={"window_block_indexes": [], "window_size": 0, 223 | "use_rel_pos": False, "lewei_scale": 1.0, 224 | "caption_channels": dino.embed_dim, 'model_max_length': 1} 225 | # build models 226 | image_size = 256 227 | latent_size = int(image_size) // 8 228 | model = LumosI2I_XL_2(input_size=latent_size, **model_kwargs).to(device) 229 | state_dict = find_model(args.lumos_i2i_ckpt) 230 | missing, unexpected = model.load_state_dict(state_dict, strict=False) 231 | model.eval() 232 | model.to(weight_dtype) 233 | models["diffusion"] = model 234 | 235 | demo(args) 236 | -------------------------------------------------------------------------------- /gradio_demos/lumos_T2I.py: -------------------------------------------------------------------------------- 1 | import gradio as gr 2 | import simple_parsing 3 | import numpy as np 4 | import torch 5 | from torchvision.utils import make_grid 6 | import random 7 | import torch 8 | from diffusers.models import AutoencoderKL 9 | from lumos_diffusion import DPMS 10 | from utils.download import find_model 11 | from lumos_diffusion.model.t5 import T5Embedder 12 | from lumos_diffusion.model.lumos import LumosT2IMS_XL_2 13 | from utils import find_model, get_closest_ratio, ASPECT_RATIO_1024_TEST 14 | 15 | _TITLE = 'Lumos-T2I: Zero-shot Text to Image' 16 | MAX_SEED = 2147483647 17 | 18 | def randomize_seed_fn(seed: int, randomize_seed: bool) -> int: 19 | if randomize_seed: 20 | seed = random.randint(0, MAX_SEED) 21 | return seed 22 | 23 | def dividable(n): 24 | for i in range(int(np.sqrt(n)), 0, -1): 25 | if n % i == 0: 26 | break 27 | return i, n // i 28 | 29 | 30 | def stop_run(): 31 | return ( 32 | gr.update(value="Run", variant="primary", visible=True), 33 | gr.update(visible=False), 34 | ) 35 | 36 | def generate( 37 | height=1024, 38 | width=1024, 39 | prompt="a chair", 40 | guidance_scale=4.5, 41 | num_inference_steps=250, 42 | seed=10, 43 | randomize_seed=True 44 | ): 45 | seed = int(randomize_seed_fn(seed, randomize_seed)) 46 | np.random.seed(seed) 47 | torch.random.manual_seed(seed) 48 | bsz = 1 49 | vae, t5, model = models["vae"], models["language_encoder"], models["diffusion"] 50 | prompt = prompt.strip() if prompt.endswith('.') else prompt 51 | close_hw, close_ratio = get_closest_ratio(height, width, ratios=ASPECT_RATIO_1024_TEST) 52 | output_comment = f"Convert Height: {height}, Width: {width} to [{close_hw[0]}, {close_hw[1]}]." 
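# Illustrative sketch of the bucket lookup above (assumptions: ASPECT_RATIO_1024_TEST
# maps ratio keys to [H, W] pairs and the keys encode H/W; the real helper is
# utils.get_closest_ratio and its exact return types may differ):
#   def closest_ratio_sketch(h, w, ratios):
#       key = min(ratios, key=lambda r: abs(h / w - float(r)))
#       return ratios[key], key   # -> close_hw == [H, W], close_ratio == key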
53 | hw, ar = torch.tensor([close_hw], dtype=torch.float, device=device), torch.tensor([[float(close_ratio)]], device=device) 54 | latent_size_h, latent_size_w = int(hw[0, 0] // 8), int(hw[0, 1] // 8) 55 | prompts = [prompt] * bsz 56 | with torch.no_grad(): 57 | caption_embs, emb_masks = t5.get_text_embeddings(prompts) 58 | caption_embs = caption_embs.float()[:, None] 59 | null_y = model.y_embedder.y_embedding[None].repeat(bsz, 1, 1)[:, None] 60 | z = torch.randn(bsz, 4, latent_size_h, latent_size_w, device=device) 61 | model_kwargs = dict(data_info={'img_hw': hw, 'aspect_ratio': ar}, mask=emb_masks) 62 | dpm_solver = DPMS(model.forward_with_dpmsolver, 63 | condition=caption_embs, 64 | uncondition=null_y, 65 | cfg_scale=guidance_scale, 66 | model_kwargs=model_kwargs) 67 | output = dpm_solver.sample( 68 | z, 69 | steps=num_inference_steps, 70 | order=2, 71 | skip_type="time_uniform", 72 | method="multistep") 73 | output = vae.decode(output / 0.18215).sample 74 | output = torch.clamp(output * 0.5 + 0.5, min=0, max=1).cpu() 75 | output = ( 76 | make_grid(output, nrow=dividable(bsz)[0]).permute(1, 2, 0).numpy() * 255 77 | ).astype(np.uint8) 78 | step = num_inference_steps 79 | yield output, seed, close_hw[0], close_hw[1], gr.update( 80 | value="Run", 81 | variant="primary", 82 | visible=(step == num_inference_steps), 83 | ), gr.update( 84 | value="Stop", variant="stop", visible=(step != num_inference_steps) 85 | ) 86 | 87 | 88 | def demo(args): 89 | css = """ 90 | #col-container { 91 | margin: 0 auto; 92 | max-width: 640px; 93 | } 94 | """ 95 | example_texts = open("asset/samples.txt").readlines() 96 | demo = gr.Blocks(css=css) 97 | with demo: 98 | with gr.Column(elem_id="col-container"): 99 | gr.Markdown('# ' + _TITLE) 100 | pid = gr.State() 101 | with gr.Row(equal_height=True): 102 | prompt_input = gr.Text( 103 | label="Prompt", 104 | show_label=False, 105 | max_lines=1, 106 | placeholder="Enter your prompt", 107 | container=False, 108 | scale=5 109 | ) 110 | run_btn = gr.Button(value="Run", variant="primary", scale=1) 111 | stop_btn = gr.Button(value="Stop", variant="stop", visible=False) 112 | with gr.Row(equal_height=False): 113 | output_image = gr.Image(value=None, label="Output image") 114 | with gr.Accordion( 115 | "Advanced settings", open=False, elem_id="config-accordion" 116 | ): 117 | with gr.Row(equal_height=False): 118 | num_inference_steps = gr.Slider( 119 | value=20, 120 | minimum=1, 121 | maximum=2000, 122 | step=1, 123 | label="# of steps", 124 | ) 125 | guidance_scale = gr.Slider( 126 | value=4.5, 127 | minimum=0.0, 128 | maximum=50, 129 | step=0.1, 130 | label="Guidance scale", 131 | ) 132 | with gr.Row(equal_height=False): 133 | height = gr.Slider( 134 | value=1024, 135 | minimum=512, 136 | maximum=2048, 137 | step=32, 138 | label="Height", 139 | ) 140 | width = gr.Slider( 141 | value=1024, 142 | minimum=512, 143 | maximum=2048, 144 | step=32, 145 | label="Width", 146 | ) 147 | randomize_seed = gr.Checkbox(label="Randomize seed", value=True) 148 | seed = gr.Slider( 149 | value=10, 150 | minimum=0, 151 | maximum=MAX_SEED, 152 | step=1, 153 | label="Random seed", 154 | ) 155 | 156 | run_event = run_btn.click( 157 | fn=generate, 158 | inputs=[ 159 | height, 160 | width, 161 | prompt_input, 162 | guidance_scale, 163 | num_inference_steps, 164 | seed, 165 | randomize_seed 166 | ], 167 | outputs=[ 168 | output_image, 169 | seed, 170 | height, 171 | width, 172 | run_btn, 173 | stop_btn, 174 | ], 175 | ) 176 | 177 | stop_btn.click( 178 | fn=stop_run, 179 | outputs=[run_btn, 
stop_btn], 180 | cancels=[run_event], 181 | queue=False, 182 | ) 183 | 184 | example0 = gr.Examples( 185 | examples=[[t.strip()] for t in example_texts], 186 | inputs=[prompt_input], 187 | ) 188 | 189 | launch_args = {"server_port": int(args.port), "server_name": "0.0.0.0"} 190 | demo.queue(default_concurrency_limit=1).launch(**launch_args) 191 | 192 | 193 | if __name__ == "__main__": 194 | parser = simple_parsing.ArgumentParser(description="Lumos Text to Image Generation Demo") 195 | parser.add_argument("--vae-pretrained", type=str, default="stabilityai/sd-vae-ft-mse") 196 | parser.add_argument("--t5-path", type=str, default="./checkpoints/") 197 | parser.add_argument("--lumos-t2i-ckpt", type=str, default="./checkpoints/Lumos_T2I.pth") 198 | parser.add_argument("--port", type=int, default=19231) 199 | args = parser.parse_known_args()[0] 200 | if torch.cuda.is_available(): 201 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 202 | # setting models 203 | models = dict() 204 | ## autoencoder 205 | weight_dtype = torch.float16 206 | vae = AutoencoderKL.from_pretrained(args.vae_pretrained).cuda() 207 | vae.eval() 208 | models["vae"] = vae 209 | ## language encoder 210 | t5 = T5Embedder(device="cuda", local_cache=True, cache_dir=args.t5_path, torch_dtype=torch.float) 211 | models["language_encoder"] = t5 212 | ## diffusion model 213 | model_kwargs={"window_block_indexes": [], "window_size": 0, 214 | "use_rel_pos": False, "lewei_scale": 2.0} 215 | # build models 216 | image_size = 1024 217 | latent_size = int(image_size) // 8 218 | model = LumosT2IMS_XL_2(input_size=latent_size, **model_kwargs).to(device) 219 | state_dict = find_model(args.lumos_t2i_ckpt) 220 | missing, unexpected = model.load_state_dict(state_dict, strict=False) 221 | model.eval() 222 | model.to(weight_dtype) 223 | models["diffusion"] = model 224 | else: 225 | raise ValueError("This Demo need gpu") 226 | 227 | demo(args) 228 | -------------------------------------------------------------------------------- /lumos_diffusion/__init__.py: -------------------------------------------------------------------------------- 1 | from .dpm_solver import DPMS 2 | from .dpm_solver_inter import DPMS as DPMS_INTER -------------------------------------------------------------------------------- /lumos_diffusion/dpm_solver.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from .model import gaussian_diffusion as gd 3 | from .model.dpm_solver import model_wrapper, DPM_Solver, NoiseScheduleVP 4 | 5 | 6 | def DPMS(model, condition, uncondition, cfg_scale, model_type='noise', noise_schedule="linear", guidance_type='classifier-free', model_kwargs=None, diffusion_steps=1000): 7 | if model_kwargs is None: 8 | model_kwargs = {} 9 | betas = torch.tensor(gd.get_named_beta_schedule(noise_schedule, diffusion_steps)) 10 | 11 | ## 1. Define the noise schedule. 12 | noise_schedule = NoiseScheduleVP(schedule='discrete', betas=betas) 13 | 14 | ## 2. Convert your discrete-time `model` to the continuous-time 15 | ## noise prediction model. Here is an example for a diffusion model 16 | ## `model` with the noise prediction type ("noise") . 17 | model_fn = model_wrapper( 18 | model, 19 | noise_schedule, 20 | model_type=model_type, 21 | model_kwargs=model_kwargs, 22 | guidance_type=guidance_type, 23 | condition=condition, 24 | unconditional_condition=uncondition, 25 | guidance_scale=cfg_scale, 26 | ) 27 | ## 3. Define dpm-solver and sample by multistep DPM-Solver. 
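# Usage sketch, mirroring gradio_demos/lumos_T2I.py (the numeric values are the
# demo defaults, not requirements of this wrapper):
#   dpm_solver = DPMS(model.forward_with_dpmsolver,
#                     condition=caption_embs, uncondition=null_y,
#                     cfg_scale=4.5, model_kwargs=model_kwargs)
#   samples = dpm_solver.sample(z, steps=20, order=2,
#                               skip_type="time_uniform", method="multistep")
#   images = vae.decode(samples / 0.18215).sample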
28 | return DPM_Solver(model_fn, noise_schedule, algorithm_type="dpmsolver++") -------------------------------------------------------------------------------- /lumos_diffusion/dpm_solver_inter.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from .model import gaussian_diffusion_inter as gd 3 | from .model.dpm_solver_inter import model_wrapper, DPM_Solver, NoiseScheduleVP 4 | 5 | 6 | def DPMS(model, condition, uncondition, cfg_scale, model_type='noise', noise_schedule="linear", guidance_type='classifier-free', model_kwargs=None, diffusion_steps=1000): 7 | if model_kwargs is None: 8 | model_kwargs = {} 9 | betas = torch.tensor(gd.get_named_beta_schedule(noise_schedule, diffusion_steps)) 10 | 11 | ## 1. Define the noise schedule. 12 | noise_schedule = NoiseScheduleVP(schedule='discrete', betas=betas) 13 | 14 | ## 2. Convert your discrete-time `model` to the continuous-time 15 | ## noise prediction model. Here is an example for a diffusion model 16 | ## `model` with the noise prediction type ("noise") . 17 | model_fn = model_wrapper( 18 | model, 19 | noise_schedule, 20 | model_type=model_type, 21 | model_kwargs=model_kwargs, 22 | guidance_type=guidance_type, 23 | condition=condition, 24 | unconditional_condition=uncondition, 25 | guidance_scale=cfg_scale, 26 | ) 27 | ## 3. Define dpm-solver and sample by multistep DPM-Solver. 28 | return DPM_Solver(model_fn, noise_schedule, algorithm_type="dpmsolver++") -------------------------------------------------------------------------------- /lumos_diffusion/model/__init__.py: -------------------------------------------------------------------------------- 1 | from .lumos import * -------------------------------------------------------------------------------- /lumos_diffusion/model/builder.py: -------------------------------------------------------------------------------- 1 | from mmcv import Registry 2 | 3 | from lumos_diffusion.model.utils import set_grad_checkpoint 4 | 5 | MODELS = Registry('models') 6 | 7 | 8 | def build_model(cfg, use_grad_checkpoint=False, use_fp32_attention=False, gc_step=1, **kwargs): 9 | if isinstance(cfg, str): 10 | cfg = dict(type=cfg) 11 | model = MODELS.build(cfg, default_args=kwargs) 12 | if use_grad_checkpoint: 13 | set_grad_checkpoint(model, use_fp32_attention=use_fp32_attention, gc_step=gc_step) 14 | return model 15 | -------------------------------------------------------------------------------- /lumos_diffusion/model/diffusion_utils.py: -------------------------------------------------------------------------------- 1 | # Modified from OpenAI's diffusion repos 2 | # GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py 3 | # ADM: https://github.com/openai/guided-diffusion/blob/main/guided_diffusion 4 | # IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py 5 | 6 | import numpy as np 7 | import torch as th 8 | 9 | 10 | def normal_kl(mean1, logvar1, mean2, logvar2): 11 | """ 12 | Compute the KL divergence between two gaussians. 13 | Shapes are automatically broadcasted, so batches can be compared to 14 | scalars, among other use cases. 15 | """ 16 | tensor = next( 17 | ( 18 | obj 19 | for obj in (mean1, logvar1, mean2, logvar2) 20 | if isinstance(obj, th.Tensor) 21 | ), 22 | None, 23 | ) 24 | assert tensor is not None, "at least one argument must be a Tensor" 25 | 26 | # Force variances to be Tensors. 
Broadcasting helps convert scalars to 27 | # Tensors, but it does not work for th.exp(). 28 | logvar1, logvar2 = [ 29 | x if isinstance(x, th.Tensor) else th.tensor(x, device=tensor.device) 30 | for x in (logvar1, logvar2) 31 | ] 32 | 33 | return 0.5 * ( 34 | -1.0 35 | + logvar2 36 | - logvar1 37 | + th.exp(logvar1 - logvar2) 38 | + ((mean1 - mean2) ** 2) * th.exp(-logvar2) 39 | ) 40 | 41 | 42 | def approx_standard_normal_cdf(x): 43 | """ 44 | A fast approximation of the cumulative distribution function of the 45 | standard normal. 46 | """ 47 | return 0.5 * (1.0 + th.tanh(np.sqrt(2.0 / np.pi) * (x + 0.044715 * th.pow(x, 3)))) 48 | 49 | 50 | def continuous_gaussian_log_likelihood(x, *, means, log_scales): 51 | """ 52 | Compute the log-likelihood of a continuous Gaussian distribution. 53 | :param x: the targets 54 | :param means: the Gaussian mean Tensor. 55 | :param log_scales: the Gaussian log stddev Tensor. 56 | :return: a tensor like x of log probabilities (in nats). 57 | """ 58 | centered_x = x - means 59 | inv_stdv = th.exp(-log_scales) 60 | normalized_x = centered_x * inv_stdv 61 | return th.distributions.Normal(th.zeros_like(x), th.ones_like(x)).log_prob( 62 | normalized_x 63 | ) 64 | 65 | 66 | def discretized_gaussian_log_likelihood(x, *, means, log_scales): 67 | """ 68 | Compute the log-likelihood of a Gaussian distribution discretizing to a 69 | given image. 70 | :param x: the target images. It is assumed that this was uint8 values, 71 | rescaled to the range [-1, 1]. 72 | :param means: the Gaussian mean Tensor. 73 | :param log_scales: the Gaussian log stddev Tensor. 74 | :return: a tensor like x of log probabilities (in nats). 75 | """ 76 | assert x.shape == means.shape == log_scales.shape 77 | centered_x = x - means 78 | inv_stdv = th.exp(-log_scales) 79 | plus_in = inv_stdv * (centered_x + 1.0 / 255.0) 80 | cdf_plus = approx_standard_normal_cdf(plus_in) 81 | min_in = inv_stdv * (centered_x - 1.0 / 255.0) 82 | cdf_min = approx_standard_normal_cdf(min_in) 83 | log_cdf_plus = th.log(cdf_plus.clamp(min=1e-12)) 84 | log_one_minus_cdf_min = th.log((1.0 - cdf_min).clamp(min=1e-12)) 85 | cdf_delta = cdf_plus - cdf_min 86 | log_probs = th.where( 87 | x < -0.999, 88 | log_cdf_plus, 89 | th.where(x > 0.999, log_one_minus_cdf_min, th.log(cdf_delta.clamp(min=1e-12))), 90 | ) 91 | assert log_probs.shape == x.shape 92 | return log_probs 93 | -------------------------------------------------------------------------------- /lumos_diffusion/model/dino/vision_transformer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """ 15 | Mostly copy-paste from timm library. 
16 | https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py 17 | """ 18 | import math 19 | from functools import partial 20 | import torch 21 | import torch.nn as nn 22 | 23 | def _no_grad_trunc_normal_(tensor, mean, std, a, b): 24 | # Cut & paste from PyTorch official master until it's in a few official releases - RW 25 | # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf 26 | def norm_cdf(x): 27 | # Computes standard normal cumulative distribution function 28 | return (1. + math.erf(x / math.sqrt(2.))) / 2. 29 | 30 | if (mean < a - 2 * std) or (mean > b + 2 * std): 31 | print("mean is more than 2 std from [a, b] in nn.init.trunc_normal_. " 32 | "The distribution of values may be incorrect.", 33 | stacklevel=2) 34 | 35 | with torch.no_grad(): 36 | # Values are generated by using a truncated uniform distribution and 37 | # then using the inverse CDF for the normal distribution. 38 | # Get upper and lower cdf values 39 | l = norm_cdf((a - mean) / std) 40 | u = norm_cdf((b - mean) / std) 41 | 42 | # Uniformly fill tensor with values from [l, u], then translate to 43 | # [2l-1, 2u-1]. 44 | tensor.uniform_(2 * l - 1, 2 * u - 1) 45 | 46 | # Use inverse cdf transform for normal distribution to get truncated 47 | # standard normal 48 | tensor.erfinv_() 49 | 50 | # Transform to proper mean, std 51 | tensor.mul_(std * math.sqrt(2.)) 52 | tensor.add_(mean) 53 | 54 | # Clamp to ensure it's in the proper range 55 | tensor.clamp_(min=a, max=b) 56 | return tensor 57 | 58 | def trunc_normal_(tensor, mean=0., std=1., a=-2., b=2.): 59 | return _no_grad_trunc_normal_(tensor, mean, std, a, b) 60 | 61 | 62 | def drop_path(x, drop_prob: float = 0., training: bool = False): 63 | if drop_prob == 0. or not training: 64 | return x 65 | keep_prob = 1 - drop_prob 66 | shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets 67 | random_tensor = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device) 68 | random_tensor.floor_() # binarize 69 | output = x.div(keep_prob) * random_tensor 70 | return output 71 | 72 | 73 | class DropPath(nn.Module): 74 | """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). 
75 | """ 76 | def __init__(self, drop_prob=None): 77 | super(DropPath, self).__init__() 78 | self.drop_prob = drop_prob 79 | 80 | def forward(self, x): 81 | return drop_path(x, self.drop_prob, self.training) 82 | 83 | 84 | class Mlp(nn.Module): 85 | def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.): 86 | super().__init__() 87 | out_features = out_features or in_features 88 | hidden_features = hidden_features or in_features 89 | self.fc1 = nn.Linear(in_features, hidden_features) 90 | self.act = act_layer() 91 | self.fc2 = nn.Linear(hidden_features, out_features) 92 | self.drop = nn.Dropout(drop) 93 | 94 | def forward(self, x): 95 | x = self.fc1(x) 96 | x = self.act(x) 97 | x = self.drop(x) 98 | x = self.fc2(x) 99 | x = self.drop(x) 100 | return x 101 | 102 | 103 | class Attention(nn.Module): 104 | def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.): 105 | super().__init__() 106 | self.num_heads = num_heads 107 | head_dim = dim // num_heads 108 | self.scale = qk_scale or head_dim ** -0.5 109 | 110 | self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) 111 | self.attn_drop = nn.Dropout(attn_drop) 112 | self.proj = nn.Linear(dim, dim) 113 | self.proj_drop = nn.Dropout(proj_drop) 114 | 115 | def forward(self, x): 116 | B, N, C = x.shape 117 | qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) 118 | q, k, v = qkv[0], qkv[1], qkv[2] 119 | 120 | attn = (q @ k.transpose(-2, -1)) * self.scale 121 | attn = attn.softmax(dim=-1) 122 | attn = self.attn_drop(attn) 123 | 124 | x = (attn @ v).transpose(1, 2).reshape(B, N, C) 125 | x = self.proj(x) 126 | x = self.proj_drop(x) 127 | return x, attn 128 | 129 | 130 | class Block(nn.Module): 131 | def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0., 132 | drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm): 133 | super().__init__() 134 | self.norm1 = norm_layer(dim) 135 | self.attn = Attention( 136 | dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop) 137 | self.drop_path = DropPath(drop_path) if drop_path > 0. 
else nn.Identity() 138 | self.norm2 = norm_layer(dim) 139 | mlp_hidden_dim = int(dim * mlp_ratio) 140 | self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) 141 | 142 | def forward(self, x, return_attention=False): 143 | y, attn = self.attn(self.norm1(x)) 144 | if return_attention: 145 | return attn 146 | x = x + self.drop_path(y) 147 | x = x + self.drop_path(self.mlp(self.norm2(x))) 148 | return x 149 | 150 | 151 | class PatchEmbed(nn.Module): 152 | """ Image to Patch Embedding 153 | """ 154 | def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768): 155 | super().__init__() 156 | num_patches = (img_size // patch_size) * (img_size // patch_size) 157 | self.img_size = img_size 158 | self.patch_size = patch_size 159 | self.num_patches = num_patches 160 | 161 | self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) 162 | 163 | def forward(self, x): 164 | B, C, H, W = x.shape 165 | x = self.proj(x).flatten(2).transpose(1, 2) 166 | return x 167 | 168 | 169 | class VisionTransformer(nn.Module): 170 | """ Vision Transformer """ 171 | def __init__(self, img_size=[224], patch_size=16, in_chans=3, num_classes=0, embed_dim=768, depth=12, 172 | num_heads=12, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop_rate=0., attn_drop_rate=0., 173 | drop_path_rate=0., norm_layer=nn.LayerNorm, **kwargs): 174 | super().__init__() 175 | self.num_features = self.embed_dim = embed_dim 176 | 177 | self.patch_embed = PatchEmbed( 178 | img_size=img_size[0], patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim) 179 | num_patches = self.patch_embed.num_patches 180 | 181 | self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) 182 | self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim)) 183 | self.pos_drop = nn.Dropout(p=drop_rate) 184 | 185 | dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule 186 | self.blocks = nn.ModuleList([ 187 | Block( 188 | dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, 189 | drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer) 190 | for i in range(depth)]) 191 | self.norm = norm_layer(embed_dim) 192 | 193 | # Classifier head 194 | self.head = nn.Linear(embed_dim, num_classes) if num_classes > 0 else nn.Identity() 195 | 196 | trunc_normal_(self.pos_embed, std=.02) 197 | trunc_normal_(self.cls_token, std=.02) 198 | self.apply(self._init_weights) 199 | 200 | def _init_weights(self, m): 201 | if isinstance(m, nn.Linear): 202 | trunc_normal_(m.weight, std=.02) 203 | if isinstance(m, nn.Linear) and m.bias is not None: 204 | nn.init.constant_(m.bias, 0) 205 | elif isinstance(m, nn.LayerNorm): 206 | nn.init.constant_(m.bias, 0) 207 | nn.init.constant_(m.weight, 1.0) 208 | 209 | def interpolate_pos_encoding(self, x, w, h): 210 | npatch = x.shape[1] - 1 211 | N = self.pos_embed.shape[1] - 1 212 | if npatch == N and w == h: 213 | return self.pos_embed 214 | class_pos_embed = self.pos_embed[:, 0] 215 | patch_pos_embed = self.pos_embed[:, 1:] 216 | dim = x.shape[-1] 217 | w0 = w // self.patch_embed.patch_size 218 | h0 = h // self.patch_embed.patch_size 219 | # we add a small number to avoid floating point error in the interpolation 220 | # see discussion at https://github.com/facebookresearch/dino/issues/8 221 | w0, h0 = w0 + 0.1, h0 + 0.1 222 | patch_pos_embed = nn.functional.interpolate( 223 | patch_pos_embed.reshape(1, int(math.sqrt(N)), int(math.sqrt(N)), 
dim).permute(0, 3, 1, 2), 224 | scale_factor=(w0 / math.sqrt(N), h0 / math.sqrt(N)), 225 | mode='bicubic', 226 | ) 227 | assert int(w0) == patch_pos_embed.shape[-2] and int(h0) == patch_pos_embed.shape[-1] 228 | patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) 229 | return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1) 230 | 231 | def prepare_tokens(self, x): 232 | B, nc, w, h = x.shape 233 | x = self.patch_embed(x) # patch linear embedding 234 | 235 | # add the [CLS] token to the embed patch tokens 236 | cls_tokens = self.cls_token.expand(B, -1, -1) 237 | x = torch.cat((cls_tokens, x), dim=1) 238 | 239 | # add positional encoding to each token 240 | x = x + self.interpolate_pos_encoding(x, w, h) 241 | 242 | return self.pos_drop(x) 243 | 244 | def forward(self, x): 245 | x = self.prepare_tokens(x) 246 | for blk in self.blocks: 247 | x = blk(x) 248 | x = self.norm(x) 249 | return x[:, 0] 250 | 251 | def get_last_selfattention(self, x): 252 | x = self.prepare_tokens(x) 253 | for i, blk in enumerate(self.blocks): 254 | if i < len(self.blocks) - 1: 255 | x = blk(x) 256 | else: 257 | # return attention of the last block 258 | return blk(x, return_attention=True) 259 | 260 | def get_intermediate_layers(self, x, n=1): 261 | x = self.prepare_tokens(x) 262 | # we return the output tokens from the `n` last blocks 263 | output = [] 264 | for i, blk in enumerate(self.blocks): 265 | x = blk(x) 266 | if len(self.blocks) - i <= n: 267 | output.append(self.norm(x)) 268 | return output 269 | 270 | 271 | def vit_tiny(patch_size=16, **kwargs): 272 | model = VisionTransformer( 273 | patch_size=patch_size, embed_dim=192, depth=12, num_heads=3, mlp_ratio=4, 274 | qkv_bias=True, norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs) 275 | return model 276 | 277 | 278 | def vit_small(patch_size=16, **kwargs): 279 | model = VisionTransformer( 280 | patch_size=patch_size, embed_dim=384, depth=12, num_heads=6, mlp_ratio=4, 281 | qkv_bias=True, norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs) 282 | return model 283 | 284 | 285 | def vit_base(patch_size=16, **kwargs): 286 | model = VisionTransformer( 287 | patch_size=patch_size, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4, 288 | qkv_bias=True, norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs) 289 | return model 290 | 291 | 292 | class DINOHead(nn.Module): 293 | def __init__(self, in_dim, out_dim, use_bn=False, norm_last_layer=True, nlayers=3, hidden_dim=2048, bottleneck_dim=256): 294 | super().__init__() 295 | nlayers = max(nlayers, 1) 296 | if nlayers == 1: 297 | self.mlp = nn.Linear(in_dim, bottleneck_dim) 298 | else: 299 | layers = [nn.Linear(in_dim, hidden_dim)] 300 | if use_bn: 301 | layers.append(nn.BatchNorm1d(hidden_dim)) 302 | layers.append(nn.GELU()) 303 | for _ in range(nlayers - 2): 304 | layers.append(nn.Linear(hidden_dim, hidden_dim)) 305 | if use_bn: 306 | layers.append(nn.BatchNorm1d(hidden_dim)) 307 | layers.append(nn.GELU()) 308 | layers.append(nn.Linear(hidden_dim, bottleneck_dim)) 309 | self.mlp = nn.Sequential(*layers) 310 | self.apply(self._init_weights) 311 | self.last_layer = nn.utils.weight_norm(nn.Linear(bottleneck_dim, out_dim, bias=False)) 312 | self.last_layer.weight_g.data.fill_(1) 313 | if norm_last_layer: 314 | self.last_layer.weight_g.requires_grad = False 315 | 316 | def _init_weights(self, m): 317 | if isinstance(m, nn.Linear): 318 | trunc_normal_(m.weight, std=.02) 319 | if isinstance(m, nn.Linear) and m.bias is not None: 320 | nn.init.constant_(m.bias, 0) 321 | 322 | def 
forward(self, x): 323 | x = self.mlp(x) 324 | x = nn.functional.normalize(x, dim=-1, p=2) 325 | x = self.last_layer(x) 326 | return x 327 | -------------------------------------------------------------------------------- /lumos_diffusion/model/lumos/LumosI2I.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.nn as nn 4 | import os 5 | import numpy as np 6 | from timm.models.layers import DropPath 7 | from timm.models.vision_transformer import PatchEmbed, Mlp 8 | 9 | from lumos_diffusion.model.builder import MODELS 10 | from lumos_diffusion.model.utils import auto_grad_checkpoint, to_2tuple 11 | from lumos_diffusion.model.lumos.Lumos_blocks import modulate, CaptionEmbedder, WindowAttention, MultiHeadCrossAttention, T2IFinalLayer, TimestepEmbedder 12 | 13 | class LumosI2IBlock(nn.Module): 14 | """ 15 | A LumosI2I block with adaptive layer norm (adaLN-zero) conditioning. 16 | """ 17 | 18 | def __init__(self, hidden_size, num_heads, mlp_ratio=4.0, drop_path=0., window_size=0, input_size=None, use_rel_pos=False, **block_kwargs): 19 | super().__init__() 20 | self.hidden_size = hidden_size 21 | self.norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) 22 | self.attn = WindowAttention(hidden_size, num_heads=num_heads, qkv_bias=True, 23 | input_size=input_size if window_size == 0 else (window_size, window_size), 24 | use_rel_pos=use_rel_pos, **block_kwargs) 25 | self.cross_attn = MultiHeadCrossAttention(hidden_size, num_heads, **block_kwargs) 26 | self.norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) 27 | # to be compatible with lower version pytorch 28 | approx_gelu = lambda: nn.GELU(approximate="tanh") 29 | self.mlp = Mlp(in_features=hidden_size, hidden_features=int(hidden_size * mlp_ratio), act_layer=approx_gelu, drop=0) 30 | self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() 31 | self.window_size = window_size 32 | self.adaLN_modulation = nn.Sequential( 33 | nn.SiLU(), 34 | nn.Linear(hidden_size, 6 * hidden_size, bias=True) 35 | ) 36 | def forward(self, x, y, t, mask=None, **kwargs): 37 | B, N, C = x.shape 38 | shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.adaLN_modulation(t).chunk(6, dim=1) 39 | x = x + self.drop_path(gate_msa.unsqueeze(1) * self.attn(modulate(self.norm1(x), shift_msa, scale_msa)).reshape(B, N, C)) 40 | x = x + self.cross_attn(x, y, mask) 41 | x = x + self.drop_path(gate_mlp.unsqueeze(1) * self.mlp(modulate(self.norm2(x), shift_mlp, scale_mlp))) 42 | 43 | return x 44 | 45 | ########################################################################### 46 | # Core Lumos Image-to-Image Model # 47 | ########################################################################### 48 | @MODELS.register_module() 49 | class LumosI2I(nn.Module): 50 | """ 51 | Diffusion model with a Transformer backbone. 
52 | """ 53 | def __init__(self, input_size=32, patch_size=2, in_channels=4, hidden_size=1152, depth=28, num_heads=16, mlp_ratio=4.0, class_dropout_prob=0.1, pred_sigma=True, drop_path: float = 0., window_size=0, window_block_indexes=None, use_rel_pos=False, caption_channels=4096, lewei_scale=1.0, config=None, model_max_length=1, **kwargs): 54 | if window_block_indexes is None: 55 | window_block_indexes = [] 56 | super().__init__() 57 | self.pred_sigma = pred_sigma 58 | self.in_channels = in_channels 59 | self.out_channels = in_channels * 2 if pred_sigma else in_channels 60 | self.patch_size = patch_size 61 | self.num_heads = num_heads 62 | self.lewei_scale = lewei_scale, 63 | 64 | self.x_embedder = PatchEmbed(input_size, patch_size, in_channels, hidden_size, bias=True) 65 | self.t_embedder = TimestepEmbedder(hidden_size) 66 | num_patches = self.x_embedder.num_patches 67 | self.base_size = input_size // self.patch_size 68 | # Will use fixed sin-cos embedding: 69 | self.register_buffer("pos_embed", torch.zeros(1, num_patches, hidden_size)) 70 | approx_gelu = lambda: nn.GELU(approximate="tanh") 71 | self.y_embedder = CaptionEmbedder(in_channels=caption_channels, hidden_size=hidden_size, uncond_prob=class_dropout_prob, act_layer=approx_gelu, token_num=model_max_length) 72 | drop_path = [x.item() for x in torch.linspace(0, drop_path, depth)] # stochastic depth decay rule 73 | self.blocks = nn.ModuleList([ 74 | LumosI2IBlock(hidden_size, num_heads, mlp_ratio=mlp_ratio, drop_path=drop_path[i], 75 | input_size=(input_size // patch_size, input_size // patch_size), 76 | window_size=window_size if i in window_block_indexes else 0, 77 | use_rel_pos=use_rel_pos if i in window_block_indexes else False) 78 | for i in range(depth) 79 | ]) 80 | self.final_layer = T2IFinalLayer(hidden_size, patch_size, self.out_channels) 81 | 82 | self.initialize_weights() 83 | 84 | print(f'Warning: lewei scale: {self.lewei_scale}, base size: {self.base_size}') 85 | 86 | def forward(self, x, timestep, y, mask=None, data_info=None, **kwargs): 87 | """ 88 | Forward pass of Lumos-I2I. 
89 | x: (N, C, H, W) tensor of spatial inputs (images or latent representations of images) 90 | t: (N,) tensor of diffusion timesteps 91 | y: (N, 1, 120, C) tensor of cond rep 92 | """ 93 | x = x.to(self.dtype) 94 | timestep = timestep.to(self.dtype) 95 | y = y.to(self.dtype) 96 | pos_embed = self.pos_embed.to(self.dtype) 97 | self.h, self.w = x.shape[-2]//self.patch_size, x.shape[-1]//self.patch_size 98 | x = self.x_embedder(x) + pos_embed # (N, T, D), where T = H * W / patch_size ** 2 99 | t = self.t_embedder(timestep.to(x.dtype)) # (N, D) 100 | y = self.y_embedder(y, self.training) # (N, 1, L, D) 101 | if mask is not None: 102 | if mask.shape[0] != y.shape[0]: 103 | mask = mask.repeat(y.shape[0] // mask.shape[0], 1) 104 | mask = mask.squeeze(1).squeeze(1) 105 | y = y.squeeze(1).masked_select(mask.unsqueeze(-1) != 0).view(1, -1, x.shape[-1]) 106 | y_lens = mask.sum(dim=1).tolist() 107 | else: 108 | y_lens = [y.shape[2]] * y.shape[0] 109 | y = y.squeeze(1).view(1, -1, x.shape[-1]) 110 | for block in self.blocks: 111 | x = auto_grad_checkpoint(block, x, y, t, y_lens) # (N, T, D) #support grad checkpoint 112 | x = self.final_layer(x, t) # (N, T, patch_size ** 2 * out_channels) 113 | x = self.unpatchify(x) # (N, out_channels, H, W) 114 | return x 115 | 116 | def forward_with_dpmsolver(self, x, timestep, y, mask=None, **kwargs): 117 | """ 118 | dpm solver donnot need variance prediction 119 | """ 120 | # https://github.com/openai/glide-text2im/blob/main/notebooks/text2im.ipynb 121 | model_out = self.forward(x, timestep, y, mask) 122 | return model_out.chunk(2, dim=1)[0] 123 | 124 | def unpatchify(self, x): 125 | """ 126 | x: (N, T, patch_size**2 * C) 127 | imgs: (N, H, W, C) 128 | """ 129 | c = self.out_channels 130 | p = self.x_embedder.patch_size[0] 131 | h = w = int(x.shape[1] ** 0.5) 132 | assert h * w == x.shape[1] 133 | 134 | x = x.reshape(shape=(x.shape[0], h, w, p, p, c)) 135 | x = torch.einsum('nhwpqc->nchpwq', x) 136 | return x.reshape(shape=(x.shape[0], c, h * p, h * p)) 137 | 138 | def initialize_weights(self): 139 | # Initialize transformer layers: 140 | def _basic_init(module): 141 | if isinstance(module, nn.Linear): 142 | torch.nn.init.xavier_uniform_(module.weight) 143 | if module.bias is not None: 144 | nn.init.constant_(module.bias, 0) 145 | 146 | self.apply(_basic_init) 147 | 148 | # Initialize (and freeze) pos_embed by sin-cos embedding: 149 | pos_embed = get_2d_sincos_pos_embed(self.pos_embed.shape[-1], int(self.x_embedder.num_patches ** 0.5), lewei_scale=self.lewei_scale, base_size=self.base_size) 150 | self.pos_embed.data.copy_(torch.from_numpy(pos_embed).float().unsqueeze(0)) 151 | 152 | # Initialize patch_embed like nn.Linear (instead of nn.Conv2d): 153 | w = self.x_embedder.proj.weight.data 154 | nn.init.xavier_uniform_(w.view([w.shape[0], -1])) 155 | 156 | # Initialize timestep embedding MLP: 157 | nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02) 158 | nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02) 159 | for block in self.blocks: 160 | nn.init.constant_(block.adaLN_modulation[-1].weight, 0) 161 | nn.init.constant_(block.adaLN_modulation[-1].bias, 0) 162 | 163 | # Initialize caption embedding MLP: 164 | nn.init.normal_(self.y_embedder.y_proj.fc1.weight, std=0.02) 165 | nn.init.normal_(self.y_embedder.y_proj.fc2.weight, std=0.02) 166 | 167 | # Zero-out adaLN modulation layers in LumosI2I blocks: 168 | for block in self.blocks: 169 | nn.init.constant_(block.cross_attn.proj.weight, 0) 170 | nn.init.constant_(block.cross_attn.proj.bias, 0) 171 | 172 | # 
Zero-out output layers: 173 | nn.init.constant_(self.final_layer.linear.weight, 0) 174 | nn.init.constant_(self.final_layer.linear.bias, 0) 175 | 176 | @property 177 | def dtype(self): 178 | return next(self.parameters()).dtype 179 | 180 | 181 | def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False, extra_tokens=0, lewei_scale=1.0, base_size=16): 182 | """ 183 | grid_size: int of the grid height and width 184 | return: 185 | pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token) 186 | """ 187 | if isinstance(grid_size, int): 188 | grid_size = to_2tuple(grid_size) 189 | grid_h = np.arange(grid_size[0], dtype=np.float32) / (grid_size[0]/base_size) / lewei_scale 190 | grid_w = np.arange(grid_size[1], dtype=np.float32) / (grid_size[1]/base_size) / lewei_scale 191 | grid = np.meshgrid(grid_w, grid_h) # here w goes first 192 | grid = np.stack(grid, axis=0) 193 | grid = grid.reshape([2, 1, grid_size[1], grid_size[0]]) 194 | 195 | pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid) 196 | if cls_token and extra_tokens > 0: 197 | pos_embed = np.concatenate([np.zeros([extra_tokens, embed_dim]), pos_embed], axis=0) 198 | return pos_embed 199 | 200 | 201 | def get_2d_sincos_pos_embed_from_grid(embed_dim, grid): 202 | assert embed_dim % 2 == 0 203 | 204 | # use half of dimensions to encode grid_h 205 | emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2) 206 | emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2) 207 | 208 | return np.concatenate([emb_h, emb_w], axis=1) 209 | 210 | 211 | def get_1d_sincos_pos_embed_from_grid(embed_dim, pos): 212 | """ 213 | embed_dim: output dimension for each position 214 | pos: a list of positions to be encoded: size (M,) 215 | out: (M, D) 216 | """ 217 | assert embed_dim % 2 == 0 218 | omega = np.arange(embed_dim // 2, dtype=np.float64) 219 | omega /= embed_dim / 2. 220 | omega = 1. / 10000 ** omega # (D/2,) 221 | 222 | pos = pos.reshape(-1) # (M,) 223 | out = np.einsum('m,d->md', pos, omega) # (M, D/2), outer product 224 | 225 | emb_sin = np.sin(out) # (M, D/2) 226 | emb_cos = np.cos(out) # (M, D/2) 227 | 228 | return np.concatenate([emb_sin, emb_cos], axis=1) 229 | 230 | 231 | ################################################################################# 232 | # Lumos Image-to-Image Configs # 233 | ################################################################################# 234 | @MODELS.register_module() 235 | def LumosI2I_XL_2(**kwargs): 236 | return LumosI2I(depth=28, hidden_size=1152, patch_size=2, num_heads=16, **kwargs) 237 | -------------------------------------------------------------------------------- /lumos_diffusion/model/lumos/LumosT2I.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.nn as nn 4 | import os 5 | import numpy as np 6 | from timm.models.layers import DropPath 7 | from timm.models.vision_transformer import PatchEmbed, Mlp 8 | 9 | from lumos_diffusion.model.builder import MODELS 10 | from lumos_diffusion.model.utils import auto_grad_checkpoint, to_2tuple 11 | from lumos_diffusion.model.lumos.Lumos_blocks import modulate, CaptionEmbedder, WindowAttention, MultiHeadCrossAttention, T2IFinalLayer, TimestepEmbedder 12 | 13 | class LumosT2IBlock(nn.Module): 14 | """ 15 | A LumosT2I block with adaptive layer norm (adaLN-zero) conditioning. 
16 | """ 17 | 18 | def __init__(self, hidden_size, num_heads, mlp_ratio=4.0, drop_path=0., window_size=0, input_size=None, use_rel_pos=False, **block_kwargs): 19 | super().__init__() 20 | self.hidden_size = hidden_size 21 | self.norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) 22 | self.attn = WindowAttention(hidden_size, num_heads=num_heads, qkv_bias=True, 23 | input_size=input_size if window_size == 0 else (window_size, window_size), 24 | use_rel_pos=use_rel_pos, **block_kwargs) 25 | self.cross_attn = MultiHeadCrossAttention(hidden_size, num_heads, **block_kwargs) 26 | self.norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) 27 | # to be compatible with lower version pytorch 28 | approx_gelu = lambda: nn.GELU(approximate="tanh") 29 | self.mlp = Mlp(in_features=hidden_size, hidden_features=int(hidden_size * mlp_ratio), act_layer=approx_gelu, drop=0) 30 | self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() 31 | self.window_size = window_size 32 | self.adaLN_modulation = nn.Sequential( 33 | nn.SiLU(), 34 | nn.Linear(hidden_size, 6 * hidden_size, bias=True) 35 | ) 36 | def forward(self, x, y, t, mask=None, **kwargs): 37 | B, N, C = x.shape 38 | shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.adaLN_modulation(t).chunk(6, dim=1) 39 | x = x + self.drop_path(gate_msa.unsqueeze(1) * self.attn(modulate(self.norm1(x), shift_msa, scale_msa)).reshape(B, N, C)) 40 | x = x + self.cross_attn(x, y, mask) 41 | x = x + self.drop_path(gate_mlp.unsqueeze(1) * self.mlp(modulate(self.norm2(x), shift_mlp, scale_mlp))) 42 | 43 | return x 44 | 45 | ############################################################################# 46 | # Core Lumos Text-to-Image Model # 47 | ############################################################################# 48 | @MODELS.register_module() 49 | class LumosT2I(nn.Module): 50 | """ 51 | Diffusion model with a Transformer backbone. 
52 | """ 53 | 54 | def __init__(self, input_size=32, patch_size=2, in_channels=4, hidden_size=1152, depth=28, num_heads=16, mlp_ratio=4.0, class_dropout_prob=0.1, pred_sigma=True, drop_path: float = 0., window_size=0, window_block_indexes=None, use_rel_pos=False, caption_channels=4096, lewei_scale=1.0, config=None, model_max_length=120, **kwargs): 55 | if window_block_indexes is None: 56 | window_block_indexes = [] 57 | super().__init__() 58 | self.pred_sigma = pred_sigma 59 | self.in_channels = in_channels 60 | self.out_channels = in_channels * 2 if pred_sigma else in_channels 61 | self.patch_size = patch_size 62 | self.num_heads = num_heads 63 | self.lewei_scale = lewei_scale, 64 | 65 | self.x_embedder = PatchEmbed(input_size, patch_size, in_channels, hidden_size, bias=True) 66 | self.t_embedder = TimestepEmbedder(hidden_size) 67 | num_patches = self.x_embedder.num_patches 68 | self.base_size = input_size // self.patch_size 69 | # Will use fixed sin-cos embedding: 70 | self.register_buffer("pos_embed", torch.zeros(1, num_patches, hidden_size)) 71 | 72 | approx_gelu = lambda: nn.GELU(approximate="tanh") 73 | self.y_embedder = CaptionEmbedder(in_channels=caption_channels, hidden_size=hidden_size, uncond_prob=class_dropout_prob, act_layer=approx_gelu, token_num=model_max_length) 74 | drop_path = [x.item() for x in torch.linspace(0, drop_path, depth)] # stochastic depth decay rule 75 | self.blocks = nn.ModuleList([ 76 | LumosT2IBlock(hidden_size, num_heads, mlp_ratio=mlp_ratio, drop_path=drop_path[i], 77 | input_size=(input_size // patch_size, input_size // patch_size), 78 | window_size=window_size if i in window_block_indexes else 0, 79 | use_rel_pos=use_rel_pos if i in window_block_indexes else False) 80 | for i in range(depth) 81 | ]) 82 | self.final_layer = T2IFinalLayer(hidden_size, patch_size, self.out_channels) 83 | 84 | self.initialize_weights() 85 | 86 | print(f'Warning: lewei scale: {self.lewei_scale}, base size: {self.base_size}') 87 | 88 | def forward(self, x, timestep, y, mask=None, data_info=None, **kwargs): 89 | """ 90 | Forward pass of Lumos-T2I. 
91 | x: (N, C, H, W) tensor of spatial inputs (images or latent representations of images) 92 | t: (N,) tensor of diffusion timesteps 93 | y: (N, 1, 120, C) tensor of caption embeddings 94 | """ 95 | x = x.to(self.dtype) 96 | timestep = timestep.to(self.dtype) 97 | y = y.to(self.dtype) 98 | pos_embed = self.pos_embed.to(self.dtype) 99 | self.h, self.w = x.shape[-2]//self.patch_size, x.shape[-1]//self.patch_size 100 | x = self.x_embedder(x) + pos_embed # (N, T, D), where T = H * W / patch_size ** 2 101 | t = self.t_embedder(timestep.to(x.dtype)) # (N, D) 102 | # t0 = self.t_block(t) 103 | y = self.y_embedder(y, self.training) # (N, 1, L, D) 104 | if mask is not None: 105 | if mask.shape[0] != y.shape[0]: 106 | mask = mask.repeat(y.shape[0] // mask.shape[0], 1) 107 | mask = mask.squeeze(1).squeeze(1) 108 | y = y.squeeze(1).masked_select(mask.unsqueeze(-1) != 0).view(1, -1, x.shape[-1]) 109 | y_lens = mask.sum(dim=1).tolist() 110 | else: 111 | y_lens = [y.shape[2]] * y.shape[0] 112 | y = y.squeeze(1).view(1, -1, x.shape[-1]) 113 | for block in self.blocks: 114 | x = auto_grad_checkpoint(block, x, y, t, y_lens) # (N, T, D) #support grad checkpoint 115 | x = self.final_layer(x, t) # (N, T, patch_size ** 2 * out_channels) 116 | x = self.unpatchify(x) # (N, out_channels, H, W) 117 | return x 118 | 119 | def forward_with_dpmsolver(self, x, timestep, y, mask=None, **kwargs): 120 | """ 121 | dpm solver donnot need variance prediction 122 | """ 123 | # https://github.com/openai/glide-text2im/blob/main/notebooks/text2im.ipynb 124 | model_out = self.forward(x, timestep, y, mask) 125 | return model_out.chunk(2, dim=1)[0] 126 | 127 | def unpatchify(self, x): 128 | """ 129 | x: (N, T, patch_size**2 * C) 130 | imgs: (N, H, W, C) 131 | """ 132 | c = self.out_channels 133 | p = self.x_embedder.patch_size[0] 134 | h = w = int(x.shape[1] ** 0.5) 135 | assert h * w == x.shape[1] 136 | 137 | x = x.reshape(shape=(x.shape[0], h, w, p, p, c)) 138 | x = torch.einsum('nhwpqc->nchpwq', x) 139 | return x.reshape(shape=(x.shape[0], c, h * p, h * p)) 140 | 141 | def initialize_weights(self): 142 | # Initialize transformer layers: 143 | def _basic_init(module): 144 | if isinstance(module, nn.Linear): 145 | torch.nn.init.xavier_uniform_(module.weight) 146 | if module.bias is not None: 147 | nn.init.constant_(module.bias, 0) 148 | 149 | self.apply(_basic_init) 150 | 151 | # Initialize (and freeze) pos_embed by sin-cos embedding: 152 | pos_embed = get_2d_sincos_pos_embed(self.pos_embed.shape[-1], int(self.x_embedder.num_patches ** 0.5), lewei_scale=self.lewei_scale, base_size=self.base_size) 153 | self.pos_embed.data.copy_(torch.from_numpy(pos_embed).float().unsqueeze(0)) 154 | 155 | # Initialize patch_embed like nn.Linear (instead of nn.Conv2d): 156 | w = self.x_embedder.proj.weight.data 157 | nn.init.xavier_uniform_(w.view([w.shape[0], -1])) 158 | 159 | # Initialize timestep embedding MLP: 160 | nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02) 161 | nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02) 162 | for block in self.blocks: 163 | nn.init.constant_(block.adaLN_modulation[-1].weight, 0) 164 | nn.init.constant_(block.adaLN_modulation[-1].bias, 0) 165 | 166 | # Initialize caption embedding MLP: 167 | nn.init.normal_(self.y_embedder.y_proj.fc1.weight, std=0.02) 168 | nn.init.normal_(self.y_embedder.y_proj.fc2.weight, std=0.02) 169 | 170 | # Zero-out adaLN modulation layers in LumosT2I blocks: 171 | for block in self.blocks: 172 | nn.init.constant_(block.cross_attn.proj.weight, 0) 173 | 
nn.init.constant_(block.cross_attn.proj.bias, 0) 174 | 175 | # Zero-out output layers: 176 | nn.init.constant_(self.final_layer.linear.weight, 0) 177 | nn.init.constant_(self.final_layer.linear.bias, 0) 178 | 179 | @property 180 | def dtype(self): 181 | return next(self.parameters()).dtype 182 | 183 | 184 | def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False, extra_tokens=0, lewei_scale=1.0, base_size=16): 185 | """ 186 | grid_size: int of the grid height and width 187 | return: 188 | pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token) 189 | """ 190 | if isinstance(grid_size, int): 191 | grid_size = to_2tuple(grid_size) 192 | grid_h = np.arange(grid_size[0], dtype=np.float32) / (grid_size[0]/base_size) / lewei_scale 193 | grid_w = np.arange(grid_size[1], dtype=np.float32) / (grid_size[1]/base_size) / lewei_scale 194 | grid = np.meshgrid(grid_w, grid_h) # here w goes first 195 | grid = np.stack(grid, axis=0) 196 | grid = grid.reshape([2, 1, grid_size[1], grid_size[0]]) 197 | 198 | pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid) 199 | if cls_token and extra_tokens > 0: 200 | pos_embed = np.concatenate([np.zeros([extra_tokens, embed_dim]), pos_embed], axis=0) 201 | return pos_embed 202 | 203 | 204 | def get_2d_sincos_pos_embed_from_grid(embed_dim, grid): 205 | assert embed_dim % 2 == 0 206 | 207 | # use half of dimensions to encode grid_h 208 | emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2) 209 | emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2) 210 | 211 | return np.concatenate([emb_h, emb_w], axis=1) 212 | 213 | 214 | def get_1d_sincos_pos_embed_from_grid(embed_dim, pos): 215 | """ 216 | embed_dim: output dimension for each position 217 | pos: a list of positions to be encoded: size (M,) 218 | out: (M, D) 219 | """ 220 | assert embed_dim % 2 == 0 221 | omega = np.arange(embed_dim // 2, dtype=np.float64) 222 | omega /= embed_dim / 2. 223 | omega = 1. / 10000 ** omega # (D/2,) 224 | 225 | pos = pos.reshape(-1) # (M,) 226 | out = np.einsum('m,d->md', pos, omega) # (M, D/2), outer product 227 | 228 | emb_sin = np.sin(out) # (M, D/2) 229 | emb_cos = np.cos(out) # (M, D/2) 230 | 231 | return np.concatenate([emb_sin, emb_cos], axis=1) 232 | 233 | 234 | ################################################################################# 235 | # LumosT2I Configs # 236 | ################################################################################# 237 | @MODELS.register_module() 238 | def LumosT2I_XL_2(**kwargs): 239 | return LumosT2I(depth=28, hidden_size=1152, patch_size=2, num_heads=16, **kwargs) 240 | -------------------------------------------------------------------------------- /lumos_diffusion/model/lumos/LumosT2IMS.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
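# Construction/conditioning sketch, mirroring gradio_demos/lumos_T2I.py (tensor
# shapes are illustrative; `hw` and `ar` come from get_closest_ratio):
#   model = LumosT2IMS_XL_2(input_size=1024 // 8, window_block_indexes=[],
#                           window_size=0, use_rel_pos=False, lewei_scale=2.0)
#   data_info = {'img_hw': hw, 'aspect_ratio': ar}   # hw: (N, 2), ar: (N, 1)
#   noise_pred = model.forward_with_dpmsolver(z, timestep, caption_embs,
#                                             data_info=data_info, mask=emb_masks)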
6 | # -------------------------------------------------------- 7 | # References: 8 | # GLIDE: https://github.com/openai/glide-text2im 9 | # MAE: https://github.com/facebookresearch/mae/blob/main/models_mae.py 10 | # -------------------------------------------------------- 11 | import torch 12 | import torch.nn as nn 13 | from timm.models.layers import DropPath 14 | from timm.models.vision_transformer import Mlp 15 | 16 | from lumos_diffusion.model.builder import MODELS 17 | from lumos_diffusion.model.utils import auto_grad_checkpoint, to_2tuple 18 | from lumos_diffusion.model.lumos.Lumos_blocks import modulate, CaptionEmbedder, WindowAttention, MultiHeadCrossAttention, T2IFinalLayer, TimestepEmbedder, SizeEmbedder 19 | from lumos_diffusion.model.lumos.LumosT2I import LumosT2I, get_2d_sincos_pos_embed 20 | 21 | 22 | class PatchEmbed(nn.Module): 23 | """ 2D Image to Patch Embedding 24 | """ 25 | def __init__( 26 | self, 27 | patch_size=16, 28 | in_chans=3, 29 | embed_dim=768, 30 | norm_layer=None, 31 | flatten=True, 32 | bias=True, 33 | ): 34 | super().__init__() 35 | patch_size = to_2tuple(patch_size) 36 | self.patch_size = patch_size 37 | self.flatten = flatten 38 | self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size, bias=bias) 39 | self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity() 40 | 41 | def forward(self, x): 42 | x = self.proj(x) 43 | if self.flatten: 44 | x = x.flatten(2).transpose(1, 2) # BCHW -> BNC 45 | x = self.norm(x) 46 | return x 47 | 48 | 49 | class LumosT2IMSBlock(nn.Module): 50 | """ 51 | A LumosT2IMS block with adaptive layer norm zero (adaLN-Zero) conditioning. 52 | """ 53 | 54 | def __init__(self, hidden_size, num_heads, mlp_ratio=4.0, drop_path=0., window_size=0, input_size=None, use_rel_pos=False, **block_kwargs): 55 | super().__init__() 56 | self.hidden_size = hidden_size 57 | self.norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) 58 | self.attn = WindowAttention(hidden_size, num_heads=num_heads, qkv_bias=True, 59 | input_size=input_size if window_size == 0 else (window_size, window_size), 60 | use_rel_pos=use_rel_pos, **block_kwargs) 61 | self.cross_attn = MultiHeadCrossAttention(hidden_size, num_heads, **block_kwargs) 62 | self.norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) 63 | # to be compatible with lower version pytorch 64 | approx_gelu = lambda: nn.GELU(approximate="tanh") 65 | self.mlp = Mlp(in_features=hidden_size, hidden_features=int(hidden_size * mlp_ratio), act_layer=approx_gelu, drop=0) 66 | self.drop_path = DropPath(drop_path) if drop_path > 0. 
else nn.Identity() 67 | self.window_size = window_size 68 | # self.scale_shift_table = nn.Parameter(torch.randn(6, hidden_size) / hidden_size ** 0.5) 69 | self.adaLN_modulation = nn.Sequential( 70 | nn.SiLU(), 71 | nn.Linear(hidden_size, 6 * hidden_size, bias=True) 72 | ) 73 | 74 | def forward(self, x, y, t, mask=None, **kwargs): 75 | B, N, C = x.shape 76 | shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.adaLN_modulation(t).chunk(6, dim=1) 77 | # shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (self.scale_shift_table[None] + t.reshape(B, 6, -1)).chunk(6, dim=1) 78 | x = x + self.drop_path(gate_msa.unsqueeze(1) * self.attn(modulate(self.norm1(x), shift_msa, scale_msa)).reshape(B, N, C)) 79 | # x = x + self.drop_path(gate_msa * self.attn(t2i_modulate(self.norm1(x), shift_msa, scale_msa))) 80 | x = x + self.cross_attn(x, y, mask) 81 | # x = x + self.drop_path(gate_mlp * self.mlp(t2i_modulate(self.norm2(x), shift_mlp, scale_mlp))) 82 | x = x + self.drop_path(gate_mlp.unsqueeze(1) * self.mlp(modulate(self.norm2(x), shift_mlp, scale_mlp))) 83 | return x 84 | 85 | 86 | ###################################################################################################### 87 | # Core Lumos Text-to-Image Mluti-Scale Model # 88 | ###################################################################################################### 89 | @MODELS.register_module() 90 | class LumosT2IMS(LumosT2I): 91 | """ 92 | Diffusion model with a Transformer backbone. 93 | """ 94 | 95 | def __init__(self, input_size=32, patch_size=2, in_channels=4, hidden_size=1152, depth=28, num_heads=16, mlp_ratio=4.0, class_dropout_prob=0.1, learn_sigma=True, pred_sigma=True, drop_path: float = 0., window_size=0, window_block_indexes=None, use_rel_pos=False, caption_channels=4096, lewei_scale=1., config=None, model_max_length=120, **kwargs): 96 | if window_block_indexes is None: 97 | window_block_indexes = [] 98 | super().__init__( 99 | input_size=input_size, 100 | patch_size=patch_size, 101 | in_channels=in_channels, 102 | hidden_size=hidden_size, 103 | depth=depth, 104 | num_heads=num_heads, 105 | mlp_ratio=mlp_ratio, 106 | class_dropout_prob=class_dropout_prob, 107 | learn_sigma=learn_sigma, 108 | pred_sigma=pred_sigma, 109 | drop_path=drop_path, 110 | window_size=window_size, 111 | window_block_indexes=window_block_indexes, 112 | use_rel_pos=use_rel_pos, 113 | lewei_scale=lewei_scale, 114 | config=config, 115 | model_max_length=model_max_length, 116 | **kwargs, 117 | ) 118 | self.h = self.w = 0 119 | approx_gelu = lambda: nn.GELU(approximate="tanh") 120 | self.x_embedder = PatchEmbed(patch_size, in_channels, hidden_size, bias=True) 121 | self.y_embedder = CaptionEmbedder(in_channels=caption_channels, hidden_size=hidden_size, uncond_prob=class_dropout_prob, act_layer=approx_gelu, token_num=model_max_length) 122 | self.csize_embedder = SizeEmbedder(hidden_size//3) # c_size embed 123 | self.ar_embedder = SizeEmbedder(hidden_size//3) # aspect ratio embed 124 | drop_path = [x.item() for x in torch.linspace(0, drop_path, depth)] # stochastic depth decay rule 125 | self.blocks = nn.ModuleList([ 126 | LumosT2IMSBlock(hidden_size, num_heads, mlp_ratio=mlp_ratio, drop_path=drop_path[i], 127 | input_size=(input_size // patch_size, input_size // patch_size), 128 | window_size=window_size if i in window_block_indexes else 0, 129 | use_rel_pos=use_rel_pos if i in window_block_indexes else False) 130 | for i in range(depth) 131 | ]) 132 | self.final_layer = T2IFinalLayer(hidden_size, patch_size, 
self.out_channels) 133 | 134 | self.initialize() 135 | 136 | def forward(self, x, timestep, y, mask=None, data_info=None, **kwargs): 137 | """ 138 | Forward pass of Lumos-T2I MS. 139 | x: (N, C, H, W) tensor of spatial inputs (images or latent representations of images) 140 | timestep: (N,) tensor of diffusion timesteps 141 | y: (N, 1, 120, C) tensor of caption embeddings 142 | """ 143 | bs = x.shape[0] 144 | x = x.to(self.dtype) 145 | timestep = timestep.to(self.dtype) 146 | y = y.to(self.dtype) 147 | c_size, ar = data_info['img_hw'].to(self.dtype), data_info['aspect_ratio'].to(self.dtype) 148 | self.h, self.w = x.shape[-2]//self.patch_size, x.shape[-1]//self.patch_size 149 | pos_embed = torch.from_numpy(get_2d_sincos_pos_embed(self.pos_embed.shape[-1], (self.h, self.w), lewei_scale=self.lewei_scale, base_size=self.base_size)).unsqueeze(0).to(x.device).to(self.dtype) 150 | x = self.x_embedder(x) + pos_embed # (N, T, D), where T = H * W / patch_size ** 2 151 | t = self.t_embedder(timestep) # (N, D) 152 | csize = self.csize_embedder(c_size, bs) # (N, D) 153 | ar = self.ar_embedder(ar, bs) # (N, D) 154 | t = t + torch.cat([csize, ar], dim=1) 155 | # t0 = self.t_block(t) 156 | y = self.y_embedder(y, self.training) # (N, 1, L, D) 157 | if mask is not None: 158 | if mask.shape[0] != y.shape[0]: 159 | mask = mask.repeat(y.shape[0] // mask.shape[0], 1) 160 | mask = mask.squeeze(1).squeeze(1) 161 | y = y.squeeze(1).masked_select(mask.unsqueeze(-1) != 0).view(1, -1, x.shape[-1]) 162 | y_lens = mask.sum(dim=1).tolist() 163 | else: 164 | y_lens = [y.shape[2]] * y.shape[0] 165 | y = y.squeeze(1).view(1, -1, x.shape[-1]) 166 | for block in self.blocks: 167 | x = auto_grad_checkpoint(block, x, y, t, y_lens, **kwargs) # (N, T, D) # support grad checkpoint 168 | x = self.final_layer(x, t) # (N, T, patch_size ** 2 * out_channels) 169 | x = self.unpatchify(x) # (N, out_channels, H, W) 170 | return x 171 | 172 | def forward_with_dpmsolver(self, x, timestep, y, data_info, **kwargs): 173 | """ 174 | dpm solver does not need variance prediction 175 | """ 176 | # https://github.com/openai/glide-text2im/blob/main/notebooks/text2im.ipynb 177 | model_out = self.forward(x, timestep, y, data_info=data_info, **kwargs) 178 | return model_out.chunk(2, dim=1)[0] 179 | 180 | def unpatchify(self, x): 181 | """ 182 | x: (N, T, patch_size**2 * C) 183 | imgs: (N, C, H, W) 184 | """ 185 | c = self.out_channels 186 | p = self.x_embedder.patch_size[0] 187 | assert self.h * self.w == x.shape[1] 188 | 189 | x = x.reshape(shape=(x.shape[0], self.h, self.w, p, p, c)) 190 | x = torch.einsum('nhwpqc->nchpwq', x) 191 | return x.reshape(shape=(x.shape[0], c, self.h * p, self.w * p)) 192 | 193 | def initialize(self): 194 | # Initialize transformer layers: 195 | def _basic_init(module): 196 | if isinstance(module, nn.Linear): 197 | torch.nn.init.xavier_uniform_(module.weight) 198 | if module.bias is not None: 199 | nn.init.constant_(module.bias, 0) 200 | 201 | self.apply(_basic_init) 202 | 203 | # Initialize patch_embed like nn.Linear (instead of nn.Conv2d): 204 | w = self.x_embedder.proj.weight.data 205 | nn.init.xavier_uniform_(w.view([w.shape[0], -1])) 206 | 207 | # Initialize timestep embedding MLP: 208 | nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02) 209 | nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02) 210 | nn.init.normal_(self.csize_embedder.mlp[0].weight, std=0.02) 211 | nn.init.normal_(self.csize_embedder.mlp[2].weight, std=0.02) 212 | nn.init.normal_(self.ar_embedder.mlp[0].weight, std=0.02) 213 | 
nn.init.normal_(self.ar_embedder.mlp[2].weight, std=0.02) 214 | 215 | for block in self.blocks: 216 | nn.init.constant_(block.adaLN_modulation[-1].weight, 0) 217 | nn.init.constant_(block.adaLN_modulation[-1].bias, 0) 218 | 219 | # Initialize caption embedding MLP: 220 | nn.init.normal_(self.y_embedder.y_proj.fc1.weight, std=0.02) 221 | nn.init.normal_(self.y_embedder.y_proj.fc2.weight, std=0.02) 222 | 223 | # Zero-out adaLN modulation layers in LumosT2IMS blocks: 224 | for block in self.blocks: 225 | nn.init.constant_(block.cross_attn.proj.weight, 0) 226 | nn.init.constant_(block.cross_attn.proj.bias, 0) 227 | 228 | # Zero-out output layers: 229 | nn.init.constant_(self.final_layer.linear.weight, 0) 230 | nn.init.constant_(self.final_layer.linear.bias, 0) 231 | 232 | 233 | ######################################################################################### 234 | # LumosT2IMS Configs # 235 | ######################################################################################### 236 | @MODELS.register_module() 237 | def LumosT2IMS_XL_2(**kwargs): 238 | return LumosT2IMS(depth=28, hidden_size=1152, patch_size=2, num_heads=16, **kwargs) 239 | -------------------------------------------------------------------------------- /lumos_diffusion/model/lumos/Lumos_blocks.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.nn as nn 4 | from timm.models.vision_transformer import Mlp, Attention as Attention_ 5 | from einops import rearrange, repeat 6 | import xformers.ops 7 | 8 | from lumos_diffusion.model.utils import add_decomposed_rel_pos 9 | 10 | 11 | def modulate(x, shift, scale): 12 | return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1) 13 | 14 | 15 | def t2i_modulate(x, shift, scale): 16 | return x * (1 + scale) + shift 17 | 18 | 19 | class MultiHeadCrossAttention(nn.Module): 20 | def __init__(self, d_model, num_heads, attn_drop=0., proj_drop=0., **block_kwargs): 21 | super(MultiHeadCrossAttention, self).__init__() 22 | assert d_model % num_heads == 0, "d_model must be divisible by num_heads" 23 | 24 | self.d_model = d_model 25 | self.num_heads = num_heads 26 | self.head_dim = d_model // num_heads 27 | 28 | self.q_linear = nn.Linear(d_model, d_model) 29 | self.kv_linear = nn.Linear(d_model, d_model*2) 30 | self.attn_drop = nn.Dropout(attn_drop) 31 | self.proj = nn.Linear(d_model, d_model) 32 | self.proj_drop = nn.Dropout(proj_drop) 33 | 34 | def forward(self, x, cond, mask=None): 35 | # query: img tokens; key/value: condition; mask: if padding tokens 36 | B, N, C = x.shape 37 | 38 | q = self.q_linear(x).view(1, -1, self.num_heads, self.head_dim) 39 | kv = self.kv_linear(cond).view(1, -1, 2, self.num_heads, self.head_dim) 40 | k, v = kv.unbind(2) 41 | attn_bias = None 42 | if mask is not None: 43 | attn_bias = xformers.ops.fmha.BlockDiagonalMask.from_seqlens([N] * B, mask) 44 | x = xformers.ops.memory_efficient_attention(q, k, v, p=self.attn_drop.p, attn_bias=attn_bias) 45 | x = x.view(B, -1, C) 46 | x = self.proj(x) 47 | x = self.proj_drop(x) 48 | 49 | return x 50 | 51 | 52 | class WindowAttention(Attention_): 53 | """Multi-head Attention block with relative position embeddings.""" 54 | 55 | def __init__( 56 | self, 57 | dim, 58 | num_heads=8, 59 | qkv_bias=True, 60 | use_rel_pos=False, 61 | rel_pos_zero_init=True, 62 | input_size=None, 63 | **block_kwargs, 64 | ): 65 | """ 66 | Args: 67 | dim (int): Number of input channels. 68 | num_heads (int): Number of attention heads. 
69 | qkv_bias (bool: If True, add a learnable bias to query, key, value. 70 | rel_pos (bool): If True, add relative positional embeddings to the attention map. 71 | rel_pos_zero_init (bool): If True, zero initialize relative positional parameters. 72 | input_size (int or None): Input resolution for calculating the relative positional 73 | parameter size. 74 | """ 75 | super().__init__(dim, num_heads=num_heads, qkv_bias=qkv_bias, **block_kwargs) 76 | 77 | self.use_rel_pos = use_rel_pos 78 | if self.use_rel_pos: 79 | # initialize relative positional embeddings 80 | self.rel_pos_h = nn.Parameter(torch.zeros(2 * input_size[0] - 1, self.head_dim)) 81 | self.rel_pos_w = nn.Parameter(torch.zeros(2 * input_size[1] - 1, self.head_dim)) 82 | 83 | if not rel_pos_zero_init: 84 | nn.init.trunc_normal_(self.rel_pos_h, std=0.02) 85 | nn.init.trunc_normal_(self.rel_pos_w, std=0.02) 86 | 87 | def forward(self, x, mask=None): 88 | B, N, C = x.shape 89 | qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads) 90 | q, k, v = qkv.unbind(2) 91 | if use_fp32_attention := getattr(self, 'fp32_attention', False): 92 | q, k, v = q.float(), k.float(), v.float() 93 | 94 | attn_bias = None 95 | if mask is not None: 96 | attn_bias = torch.zeros([B * self.num_heads, q.shape[1], k.shape[1]], dtype=q.dtype, device=q.device) 97 | attn_bias.masked_fill_(mask.squeeze(1).repeat(self.num_heads, 1, 1) == 0, float('-inf')) 98 | x = xformers.ops.memory_efficient_attention(q, k, v, p=self.attn_drop.p, attn_bias=attn_bias) 99 | 100 | x = x.view(B, N, C) 101 | x = self.proj(x) 102 | x = self.proj_drop(x) 103 | return x 104 | 105 | 106 | ################################################################################# 107 | # AMP attention with fp32 softmax to fix loss NaN problem during training # 108 | ################################################################################# 109 | class Attention(Attention_): 110 | def forward(self, x): 111 | B, N, C = x.shape 112 | qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) 113 | q, k, v = qkv.unbind(0) # make torchscript happy (cannot use tensor as tuple) 114 | use_fp32_attention = getattr(self, 'fp32_attention', False) 115 | if use_fp32_attention: 116 | q, k = q.float(), k.float() 117 | with torch.cuda.amp.autocast(enabled=not use_fp32_attention): 118 | attn = (q @ k.transpose(-2, -1)) * self.scale 119 | attn = attn.softmax(dim=-1) 120 | 121 | attn = self.attn_drop(attn) 122 | 123 | x = (attn @ v).transpose(1, 2).reshape(B, N, C) 124 | x = self.proj(x) 125 | x = self.proj_drop(x) 126 | return x 127 | 128 | 129 | class FinalLayer(nn.Module): 130 | """ 131 | The final layer of Lumos. 132 | """ 133 | 134 | def __init__(self, hidden_size, patch_size, out_channels): 135 | super().__init__() 136 | self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) 137 | self.linear = nn.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True) 138 | self.adaLN_modulation = nn.Sequential( 139 | nn.SiLU(), 140 | nn.Linear(hidden_size, 2 * hidden_size, bias=True) 141 | ) 142 | 143 | def forward(self, x, c): 144 | shift, scale = self.adaLN_modulation(c).chunk(2, dim=1) 145 | x = modulate(self.norm_final(x), shift, scale) 146 | x = self.linear(x) 147 | return x 148 | 149 | 150 | class T2IFinalLayer(nn.Module): 151 | """ 152 | The final layer of Lumos. 
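Unlike FinalLayer above, the shift and scale here come from a learned scale_shift_table added to the timestep embedding (see t2i_modulate) rather than from an adaLN_modulation MLP.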
153 | """ 154 | 155 | def __init__(self, hidden_size, patch_size, out_channels): 156 | super().__init__() 157 | self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) 158 | self.linear = nn.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True) 159 | self.scale_shift_table = nn.Parameter(torch.randn(2, hidden_size) / hidden_size ** 0.5) 160 | self.out_channels = out_channels 161 | 162 | def forward(self, x, t): 163 | shift, scale = (self.scale_shift_table[None] + t[:, None]).chunk(2, dim=1) 164 | x = t2i_modulate(self.norm_final(x), shift, scale) 165 | x = self.linear(x) 166 | return x 167 | 168 | 169 | class MaskFinalLayer(nn.Module): 170 | """ 171 | The final layer of Lumos. 172 | """ 173 | 174 | def __init__(self, final_hidden_size, c_emb_size, patch_size, out_channels): 175 | super().__init__() 176 | self.norm_final = nn.LayerNorm(final_hidden_size, elementwise_affine=False, eps=1e-6) 177 | self.linear = nn.Linear(final_hidden_size, patch_size * patch_size * out_channels, bias=True) 178 | self.adaLN_modulation = nn.Sequential( 179 | nn.SiLU(), 180 | nn.Linear(c_emb_size, 2 * final_hidden_size, bias=True) 181 | ) 182 | def forward(self, x, t): 183 | shift, scale = self.adaLN_modulation(t).chunk(2, dim=1) 184 | x = modulate(self.norm_final(x), shift, scale) 185 | x = self.linear(x) 186 | return x 187 | 188 | 189 | class DecoderLayer(nn.Module): 190 | """ 191 | The final layer of Lumos. 192 | """ 193 | 194 | def __init__(self, hidden_size, decoder_hidden_size): 195 | super().__init__() 196 | self.norm_decoder = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) 197 | self.linear = nn.Linear(hidden_size, decoder_hidden_size, bias=True) 198 | self.adaLN_modulation = nn.Sequential( 199 | nn.SiLU(), 200 | nn.Linear(hidden_size, 2 * hidden_size, bias=True) 201 | ) 202 | def forward(self, x, t): 203 | shift, scale = self.adaLN_modulation(t).chunk(2, dim=1) 204 | x = modulate(self.norm_decoder(x), shift, scale) 205 | x = self.linear(x) 206 | return x 207 | 208 | 209 | ################################################################################# 210 | # Embedding Layers for Timesteps and Class Labels # 211 | ################################################################################# 212 | class TimestepEmbedder(nn.Module): 213 | """ 214 | Embeds scalar timesteps into vector representations. 215 | """ 216 | 217 | def __init__(self, hidden_size, frequency_embedding_size=256): 218 | super().__init__() 219 | self.mlp = nn.Sequential( 220 | nn.Linear(frequency_embedding_size, hidden_size, bias=True), 221 | nn.SiLU(), 222 | nn.Linear(hidden_size, hidden_size, bias=True), 223 | ) 224 | self.frequency_embedding_size = frequency_embedding_size 225 | 226 | @staticmethod 227 | def timestep_embedding(t, dim, max_period=10000): 228 | """ 229 | Create sinusoidal timestep embeddings. 230 | :param t: a 1-D Tensor of N indices, one per batch element. 231 | These may be fractional. 232 | :param dim: the dimension of the output. 233 | :param max_period: controls the minimum frequency of the embeddings. 234 | :return: an (N, D) Tensor of positional embeddings. 
235 | """ 236 | # https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py 237 | half = dim // 2 238 | freqs = torch.exp( 239 | -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32, device=t.device) / half) 240 | args = t[:, None].float() * freqs[None] 241 | embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1) 242 | if dim % 2: 243 | embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1) 244 | return embedding 245 | 246 | def forward(self, t): 247 | t_freq = self.timestep_embedding(t, self.frequency_embedding_size).to(self.dtype) 248 | return self.mlp(t_freq) 249 | 250 | @property 251 | def dtype(self): 252 | return next(self.parameters()).dtype 253 | 254 | 255 | class SizeEmbedder(TimestepEmbedder): 256 | """ 257 | Embeds scalar timesteps into vector representations. 258 | """ 259 | 260 | def __init__(self, hidden_size, frequency_embedding_size=256): 261 | super().__init__(hidden_size=hidden_size, frequency_embedding_size=frequency_embedding_size) 262 | self.mlp = nn.Sequential( 263 | nn.Linear(frequency_embedding_size, hidden_size, bias=True), 264 | nn.SiLU(), 265 | nn.Linear(hidden_size, hidden_size, bias=True), 266 | ) 267 | self.frequency_embedding_size = frequency_embedding_size 268 | self.outdim = hidden_size 269 | 270 | def forward(self, s, bs): 271 | if s.ndim == 1: 272 | s = s[:, None] 273 | assert s.ndim == 2 274 | if s.shape[0] != bs: 275 | s = s.repeat(bs//s.shape[0], 1) 276 | assert s.shape[0] == bs 277 | b, dims = s.shape[0], s.shape[1] 278 | s = rearrange(s, "b d -> (b d)") 279 | s_freq = self.timestep_embedding(s, self.frequency_embedding_size).to(self.dtype) 280 | s_emb = self.mlp(s_freq) 281 | s_emb = rearrange(s_emb, "(b d) d2 -> b (d d2)", b=b, d=dims, d2=self.outdim) 282 | return s_emb 283 | 284 | @property 285 | def dtype(self): 286 | return next(self.parameters()).dtype 287 | 288 | 289 | class LabelEmbedder(nn.Module): 290 | """ 291 | Embeds class labels into vector representations. Also handles label dropout for classifier-free guidance. 292 | """ 293 | 294 | def __init__(self, num_classes, hidden_size, dropout_prob): 295 | super().__init__() 296 | use_cfg_embedding = dropout_prob > 0 297 | self.embedding_table = nn.Embedding(num_classes + use_cfg_embedding, hidden_size) 298 | self.num_classes = num_classes 299 | self.dropout_prob = dropout_prob 300 | 301 | def token_drop(self, labels, force_drop_ids=None): 302 | """ 303 | Drops labels to enable classifier-free guidance. 304 | """ 305 | if force_drop_ids is None: 306 | drop_ids = torch.rand(labels.shape[0]).cuda() < self.dropout_prob 307 | else: 308 | drop_ids = force_drop_ids == 1 309 | labels = torch.where(drop_ids, self.num_classes, labels) 310 | return labels 311 | 312 | def forward(self, labels, train, force_drop_ids=None): 313 | use_dropout = self.dropout_prob > 0 314 | if (train and use_dropout) or (force_drop_ids is not None): 315 | labels = self.token_drop(labels, force_drop_ids) 316 | return self.embedding_table(labels) 317 | 318 | 319 | class CaptionEmbedder(nn.Module): 320 | """ 321 | Embeds class labels into vector representations. Also handles label dropout for classifier-free guidance. 
322 | """ 323 | 324 | def __init__(self, in_channels, hidden_size, uncond_prob, act_layer=nn.GELU(approximate='tanh'), token_num=120): 325 | super().__init__() 326 | self.y_proj = Mlp(in_features=in_channels, hidden_features=hidden_size, out_features=hidden_size, act_layer=act_layer, drop=0) 327 | self.register_buffer("y_embedding", nn.Parameter(torch.randn(token_num, in_channels) / in_channels ** 0.5)) 328 | self.uncond_prob = uncond_prob 329 | 330 | def token_drop(self, caption, force_drop_ids=None): 331 | """ 332 | Drops labels to enable classifier-free guidance. 333 | """ 334 | if force_drop_ids is None: 335 | drop_ids = torch.rand(caption.shape[0]).cuda() < self.uncond_prob 336 | else: 337 | drop_ids = force_drop_ids == 1 338 | caption = torch.where(drop_ids[:, None, None, None], self.y_embedding, caption) 339 | return caption 340 | 341 | def forward(self, caption, train, force_drop_ids=None): 342 | if train: 343 | assert caption.shape[2:] == self.y_embedding.shape 344 | use_dropout = self.uncond_prob > 0 345 | if (train and use_dropout) or (force_drop_ids is not None): 346 | caption = self.token_drop(caption, force_drop_ids) 347 | caption = self.y_proj(caption) 348 | return caption 349 | 350 | 351 | class CaptionEmbedderDoubleBr(nn.Module): 352 | """ 353 | Embeds class labels into vector representations. Also handles label dropout for classifier-free guidance. 354 | """ 355 | 356 | def __init__(self, in_channels, hidden_size, uncond_prob, act_layer=nn.GELU(approximate='tanh'), token_num=120): 357 | super().__init__() 358 | self.proj = Mlp(in_features=in_channels, hidden_features=hidden_size, out_features=hidden_size, act_layer=act_layer, drop=0) 359 | self.embedding = nn.Parameter(torch.randn(1, in_channels) / 10 ** 0.5) 360 | self.y_embedding = nn.Parameter(torch.randn(token_num, in_channels) / 10 ** 0.5) 361 | self.uncond_prob = uncond_prob 362 | 363 | def token_drop(self, global_caption, caption, force_drop_ids=None): 364 | """ 365 | Drops labels to enable classifier-free guidance. 
366 | """ 367 | if force_drop_ids is None: 368 | drop_ids = torch.rand(global_caption.shape[0]).cuda() < self.uncond_prob 369 | else: 370 | drop_ids = force_drop_ids == 1 371 | global_caption = torch.where(drop_ids[:, None], self.embedding, global_caption) 372 | caption = torch.where(drop_ids[:, None, None, None], self.y_embedding, caption) 373 | return global_caption, caption 374 | 375 | def forward(self, caption, train, force_drop_ids=None): 376 | assert caption.shape[2: ] == self.y_embedding.shape 377 | global_caption = caption.mean(dim=2).squeeze() 378 | use_dropout = self.uncond_prob > 0 379 | if (train and use_dropout) or (force_drop_ids is not None): 380 | global_caption, caption = self.token_drop(global_caption, caption, force_drop_ids) 381 | y_embed = self.proj(global_caption) 382 | return y_embed, caption -------------------------------------------------------------------------------- /lumos_diffusion/model/lumos/__init__.py: -------------------------------------------------------------------------------- 1 | from .LumosI2I import LumosI2I, LumosI2I_XL_2, LumosI2IBlock 2 | from .LumosT2I import LumosT2I, LumosT2I_XL_2, LumosT2IBlock 3 | from .LumosT2IMS import LumosT2IMS, LumosT2IMS_XL_2, LumosT2IMSBlock -------------------------------------------------------------------------------- /lumos_diffusion/model/t5.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | import re 4 | import html 5 | import urllib.parse as ul 6 | 7 | import ftfy 8 | import torch 9 | from bs4 import BeautifulSoup 10 | from transformers import T5EncoderModel, AutoTokenizer 11 | from huggingface_hub import hf_hub_download 12 | 13 | class T5Embedder: 14 | 15 | available_models = ['t5-v1_1-xxl'] 16 | bad_punct_regex = re.compile(r'['+'#®•©™&@·º½¾¿¡§~'+'\)'+'\('+'\]'+'\['+'\}'+'\{'+'\|'+'\\'+'\/'+'\*' + r']{1,}') # noqa 17 | 18 | def __init__(self, device, dir_or_name='t5-v1_1-xxl', *, local_cache=False, cache_dir=None, hf_token=None, use_text_preprocessing=True, 19 | t5_model_kwargs=None, torch_dtype=None, use_offload_folder=None, model_max_length=120): 20 | self.device = torch.device(device) 21 | self.torch_dtype = torch_dtype or torch.bfloat16 22 | if t5_model_kwargs is None: 23 | t5_model_kwargs = {'low_cpu_mem_usage': True, 'torch_dtype': self.torch_dtype} 24 | if use_offload_folder is not None: 25 | t5_model_kwargs['offload_folder'] = use_offload_folder 26 | t5_model_kwargs['device_map'] = { 27 | 'shared': self.device, 28 | 'encoder.embed_tokens': self.device, 29 | 'encoder.block.0': self.device, 30 | 'encoder.block.1': self.device, 31 | 'encoder.block.2': self.device, 32 | 'encoder.block.3': self.device, 33 | 'encoder.block.4': self.device, 34 | 'encoder.block.5': self.device, 35 | 'encoder.block.6': self.device, 36 | 'encoder.block.7': self.device, 37 | 'encoder.block.8': self.device, 38 | 'encoder.block.9': self.device, 39 | 'encoder.block.10': self.device, 40 | 'encoder.block.11': self.device, 41 | 'encoder.block.12': 'disk', 42 | 'encoder.block.13': 'disk', 43 | 'encoder.block.14': 'disk', 44 | 'encoder.block.15': 'disk', 45 | 'encoder.block.16': 'disk', 46 | 'encoder.block.17': 'disk', 47 | 'encoder.block.18': 'disk', 48 | 'encoder.block.19': 'disk', 49 | 'encoder.block.20': 'disk', 50 | 'encoder.block.21': 'disk', 51 | 'encoder.block.22': 'disk', 52 | 'encoder.block.23': 'disk', 53 | 'encoder.final_layer_norm': 'disk', 54 | 'encoder.dropout': 'disk', 55 | } 56 | else: 57 | t5_model_kwargs['device_map'] = {'shared': 
self.device, 'encoder': self.device} 58 | 59 | self.use_text_preprocessing = use_text_preprocessing 60 | self.hf_token = hf_token 61 | self.cache_dir = cache_dir or os.path.expanduser('~/.cache/IF_') 62 | self.dir_or_name = dir_or_name 63 | tokenizer_path, path = dir_or_name, dir_or_name 64 | if local_cache: 65 | cache_dir = os.path.join(self.cache_dir, dir_or_name) 66 | tokenizer_path, path = cache_dir, cache_dir 67 | elif dir_or_name in self.available_models: 68 | cache_dir = os.path.join(self.cache_dir, dir_or_name) 69 | for filename in [ 70 | 'config.json', 'special_tokens_map.json', 'spiece.model', 'tokenizer_config.json', 71 | 'pytorch_model.bin.index.json', 'pytorch_model-00001-of-00002.bin', 'pytorch_model-00002-of-00002.bin' 72 | ]: 73 | hf_hub_download(repo_id=f'DeepFloyd/{dir_or_name}', filename=filename, cache_dir=cache_dir, 74 | force_filename=filename, token=self.hf_token) 75 | tokenizer_path, path = cache_dir, cache_dir 76 | else: 77 | cache_dir = os.path.join(self.cache_dir, 't5-v1_1-xxl') 78 | for filename in [ 79 | 'config.json', 'special_tokens_map.json', 'spiece.model', 'tokenizer_config.json', 80 | ]: 81 | hf_hub_download(repo_id='DeepFloyd/t5-v1_1-xxl', filename=filename, cache_dir=cache_dir, 82 | force_filename=filename, token=self.hf_token) 83 | tokenizer_path = cache_dir 84 | 85 | print(tokenizer_path) 86 | self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path) 87 | self.model = T5EncoderModel.from_pretrained(path, **t5_model_kwargs).eval() 88 | self.model_max_length = model_max_length 89 | 90 | def get_text_embeddings(self, texts): 91 | texts = [self.text_preprocessing(text) for text in texts] 92 | 93 | text_tokens_and_mask = self.tokenizer( 94 | texts, 95 | max_length=self.model_max_length, 96 | padding='max_length', 97 | truncation=True, 98 | return_attention_mask=True, 99 | add_special_tokens=True, 100 | return_tensors='pt' 101 | ) 102 | 103 | text_tokens_and_mask['input_ids'] = text_tokens_and_mask['input_ids'] 104 | text_tokens_and_mask['attention_mask'] = text_tokens_and_mask['attention_mask'] 105 | 106 | with torch.no_grad(): 107 | text_encoder_embs = self.model( 108 | input_ids=text_tokens_and_mask['input_ids'].to(self.device), 109 | attention_mask=text_tokens_and_mask['attention_mask'].to(self.device), 110 | )['last_hidden_state'].detach() 111 | return text_encoder_embs, text_tokens_and_mask['attention_mask'].to(self.device) 112 | 113 | def text_preprocessing(self, text): 114 | if self.use_text_preprocessing: 115 | # The exact text cleaning as was in the training stage: 116 | text = self.clean_caption(text) 117 | text = self.clean_caption(text) 118 | return text 119 | else: 120 | return text.lower().strip() 121 | 122 | @staticmethod 123 | def basic_clean(text): 124 | text = ftfy.fix_text(text) 125 | text = html.unescape(html.unescape(text)) 126 | return text.strip() 127 | 128 | def clean_caption(self, caption): 129 | caption = str(caption) 130 | caption = ul.unquote_plus(caption) 131 | caption = caption.strip().lower() 132 | caption = re.sub('', 'person', caption) 133 | # urls: 134 | caption = re.sub( 135 | r'\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))', # noqa 136 | '', caption) # regex for urls 137 | caption = re.sub( 138 | r'\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))', # noqa 139 | '', caption) # regex for urls 140 | # html: 141 | caption = BeautifulSoup(caption, features='html.parser').text 142 | 143 | # @ 144 | caption 
= re.sub(r'@[\w\d]+\b', '', caption) 145 | 146 | # 31C0—31EF CJK Strokes 147 | # 31F0—31FF Katakana Phonetic Extensions 148 | # 3200—32FF Enclosed CJK Letters and Months 149 | # 3300—33FF CJK Compatibility 150 | # 3400—4DBF CJK Unified Ideographs Extension A 151 | # 4DC0—4DFF Yijing Hexagram Symbols 152 | # 4E00—9FFF CJK Unified Ideographs 153 | caption = re.sub(r'[\u31c0-\u31ef]+', '', caption) 154 | caption = re.sub(r'[\u31f0-\u31ff]+', '', caption) 155 | caption = re.sub(r'[\u3200-\u32ff]+', '', caption) 156 | caption = re.sub(r'[\u3300-\u33ff]+', '', caption) 157 | caption = re.sub(r'[\u3400-\u4dbf]+', '', caption) 158 | caption = re.sub(r'[\u4dc0-\u4dff]+', '', caption) 159 | caption = re.sub(r'[\u4e00-\u9fff]+', '', caption) 160 | ####################################################### 161 | 162 | # все виды тире / all types of dash --> "-" 163 | caption = re.sub( 164 | r'[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+', # noqa 165 | '-', caption) 166 | 167 | # кавычки к одному стандарту 168 | caption = re.sub(r'[`´«»“”¨]', '"', caption) 169 | caption = re.sub(r'[‘’]', "'", caption) 170 | 171 | # " 172 | caption = re.sub(r'"?', '', caption) 173 | # & 174 | caption = re.sub(r'&', '', caption) 175 | 176 | # ip adresses: 177 | caption = re.sub(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', ' ', caption) 178 | 179 | # article ids: 180 | caption = re.sub(r'\d:\d\d\s+$', '', caption) 181 | 182 | # \n 183 | caption = re.sub(r'\\n', ' ', caption) 184 | 185 | # "#123" 186 | caption = re.sub(r'#\d{1,3}\b', '', caption) 187 | # "#12345.." 188 | caption = re.sub(r'#\d{5,}\b', '', caption) 189 | # "123456.." 190 | caption = re.sub(r'\b\d{6,}\b', '', caption) 191 | # filenames: 192 | caption = re.sub(r'[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)', '', caption) 193 | 194 | # 195 | caption = re.sub(r'[\"\']{2,}', r'"', caption) # """AUSVERKAUFT""" 196 | caption = re.sub(r'[\.]{2,}', r' ', caption) # """AUSVERKAUFT""" 197 | 198 | caption = re.sub(self.bad_punct_regex, r' ', caption) # ***AUSVERKAUFT***, #AUSVERKAUFT 199 | caption = re.sub(r'\s+\.\s+', r' ', caption) # " . " 200 | 201 | # this-is-my-cute-cat / this_is_my_cute_cat 202 | regex2 = re.compile(r'(?:\-|\_)') 203 | if len(re.findall(regex2, caption)) > 3: 204 | caption = re.sub(regex2, ' ', caption) 205 | 206 | caption = self.basic_clean(caption) 207 | 208 | caption = re.sub(r'\b[a-zA-Z]{1,3}\d{3,15}\b', '', caption) # jc6640 209 | caption = re.sub(r'\b[a-zA-Z]+\d+[a-zA-Z]+\b', '', caption) # jc6640vc 210 | caption = re.sub(r'\b\d+[a-zA-Z]+\d+\b', '', caption) # 6640vc231 211 | 212 | caption = re.sub(r'(worldwide\s+)?(free\s+)?shipping', '', caption) 213 | caption = re.sub(r'(free\s)?download(\sfree)?', '', caption) 214 | caption = re.sub(r'\bclick\b\s(?:for|on)\s\w+', '', caption) 215 | caption = re.sub(r'\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?', '', caption) 216 | caption = re.sub(r'\bpage\s+\d+\b', '', caption) 217 | 218 | caption = re.sub(r'\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b', r' ', caption) # j2d1a2a... 
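# size strings such as "1024x768" are stripped next; the character class [xх×] also matches the Cyrillic х and the × sign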
219 | 220 | caption = re.sub(r'\b\d+\.?\d*[xх×]\d+\.?\d*\b', '', caption) 221 | 222 | caption = re.sub(r'\b\s+\:\s+', r': ', caption) 223 | caption = re.sub(r'(\D[,\./])\b', r'\1 ', caption) 224 | caption = re.sub(r'\s+', ' ', caption) 225 | 226 | caption.strip() 227 | 228 | caption = re.sub(r'^[\"\']([\w\W]+)[\"\']$', r'\1', caption) 229 | caption = re.sub(r'^[\'\_,\-\:;]', r'', caption) 230 | caption = re.sub(r'[\'\_,\-\:\-\+]$', r'', caption) 231 | caption = re.sub(r'^\.\S+$', '', caption) 232 | 233 | return caption.strip() 234 | -------------------------------------------------------------------------------- /lumos_diffusion/model/timestep_sampler.py: -------------------------------------------------------------------------------- 1 | # Modified from OpenAI's diffusion repos 2 | # GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py 3 | # ADM: https://github.com/openai/guided-diffusion/blob/main/guided_diffusion 4 | # IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py 5 | 6 | from abc import ABC, abstractmethod 7 | 8 | import numpy as np 9 | import torch as th 10 | import torch.distributed as dist 11 | 12 | 13 | def create_named_schedule_sampler(name, diffusion): 14 | """ 15 | Create a ScheduleSampler from a library of pre-defined samplers. 16 | :param name: the name of the sampler. 17 | :param diffusion: the diffusion object to sample for. 18 | """ 19 | if name == "uniform": 20 | return UniformSampler(diffusion) 21 | elif name == "loss-second-moment": 22 | return LossSecondMomentResampler(diffusion) 23 | else: 24 | raise NotImplementedError(f"unknown schedule sampler: {name}") 25 | 26 | 27 | class ScheduleSampler(ABC): 28 | """ 29 | A distribution over timesteps in the diffusion process, intended to reduce 30 | variance of the objective. 31 | By default, samplers perform unbiased importance sampling, in which the 32 | objective's mean is unchanged. 33 | However, subclasses may override sample() to change how the resampled 34 | terms are reweighted, allowing for actual changes in the objective. 35 | """ 36 | 37 | @abstractmethod 38 | def weights(self): 39 | """ 40 | Get a numpy array of weights, one per diffusion step. 41 | The weights needn't be normalized, but must be positive. 42 | """ 43 | 44 | def sample(self, batch_size, device): 45 | """ 46 | Importance-sample timesteps for a batch. 47 | :param batch_size: the number of timesteps. 48 | :param device: the torch device to save to. 49 | :return: a tuple (timesteps, weights): 50 | - timesteps: a tensor of timestep indices. 51 | - weights: a tensor of weights to scale the resulting losses. 52 | """ 53 | w = self.weights() 54 | p = w / np.sum(w) 55 | indices_np = np.random.choice(len(p), size=(batch_size,), p=p) 56 | indices = th.from_numpy(indices_np).long().to(device) 57 | weights_np = 1 / (len(p) * p[indices_np]) 58 | weights = th.from_numpy(weights_np).float().to(device) 59 | return indices, weights 60 | 61 | 62 | class UniformSampler(ScheduleSampler): 63 | def __init__(self, diffusion): 64 | self.diffusion = diffusion 65 | self._weights = np.ones([diffusion.num_timesteps]) 66 | 67 | def weights(self): 68 | return self._weights 69 | 70 | 71 | class LossAwareSampler(ScheduleSampler): 72 | def update_with_local_losses(self, local_ts, local_losses): 73 | """ 74 | Update the reweighting using losses from a model. 75 | Call this method from each rank with a batch of timesteps and the 76 | corresponding losses for each of those timesteps. 
77 | This method will perform synchronization to make sure all of the ranks 78 | maintain the exact same reweighting. 79 | :param local_ts: an integer Tensor of timesteps. 80 | :param local_losses: a 1D Tensor of losses. 81 | """ 82 | batch_sizes = [ 83 | th.tensor([0], dtype=th.int32, device=local_ts.device) 84 | for _ in range(dist.get_world_size()) 85 | ] 86 | dist.all_gather( 87 | batch_sizes, 88 | th.tensor([len(local_ts)], dtype=th.int32, device=local_ts.device), 89 | ) 90 | 91 | # Pad all_gather batches to be the maximum batch size. 92 | batch_sizes = [x.item() for x in batch_sizes] 93 | max_bs = max(batch_sizes) 94 | 95 | timestep_batches = [th.zeros(max_bs, device=local_ts.device) for _ in batch_sizes] 96 | loss_batches = [th.zeros(max_bs, device=local_losses.device) for _ in batch_sizes] 97 | dist.all_gather(timestep_batches, local_ts) 98 | dist.all_gather(loss_batches, local_losses) 99 | timesteps = [ 100 | x.item() for y, bs in zip(timestep_batches, batch_sizes) for x in y[:bs] 101 | ] 102 | losses = [x.item() for y, bs in zip(loss_batches, batch_sizes) for x in y[:bs]] 103 | self.update_with_all_losses(timesteps, losses) 104 | 105 | @abstractmethod 106 | def update_with_all_losses(self, ts, losses): 107 | """ 108 | Update the reweighting using losses from a model. 109 | Sub-classes should override this method to update the reweighting 110 | using losses from the model. 111 | This method directly updates the reweighting without synchronizing 112 | between workers. It is called by update_with_local_losses from all 113 | ranks with identical arguments. Thus, it should have deterministic 114 | behavior to maintain state across workers. 115 | :param ts: a list of int timesteps. 116 | :param losses: a list of float losses, one per timestep. 117 | """ 118 | 119 | 120 | class LossSecondMomentResampler(LossAwareSampler): 121 | def __init__(self, diffusion, history_per_term=10, uniform_prob=0.001): 122 | self.diffusion = diffusion 123 | self.history_per_term = history_per_term 124 | self.uniform_prob = uniform_prob 125 | self._loss_history = np.zeros( 126 | [diffusion.num_timesteps, history_per_term], dtype=np.float64 127 | ) 128 | self._loss_counts = np.zeros([diffusion.num_timesteps], dtype=np.int) 129 | 130 | def weights(self): 131 | if not self._warmed_up(): 132 | return np.ones([self.diffusion.num_timesteps], dtype=np.float64) 133 | weights = np.sqrt(np.mean(self._loss_history ** 2, axis=-1)) 134 | weights /= np.sum(weights) 135 | weights *= 1 - self.uniform_prob 136 | weights += self.uniform_prob / len(weights) 137 | return weights 138 | 139 | def update_with_all_losses(self, ts, losses): 140 | for t, loss in zip(ts, losses): 141 | if self._loss_counts[t] == self.history_per_term: 142 | # Shift out the oldest loss term. 
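# (each timestep keeps a rolling window of its last `history_per_term` losses; weights() switches from uniform to the RMS of this window once every slot is filled)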
143 | self._loss_history[t, :-1] = self._loss_history[t, 1:] 144 | self._loss_history[t, -1] = loss 145 | else: 146 | self._loss_history[t, self._loss_counts[t]] = loss 147 | self._loss_counts[t] += 1 148 | 149 | def _warmed_up(self): 150 | return (self._loss_counts == self.history_per_term).all() 151 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | accelerate==0.29.2 2 | beautifulsoup4==4.12.3 3 | diffusers==0.27.2 4 | einops==0.8.0 5 | ftfy==6.3.1 6 | huggingface-hub==0.23.3 7 | mmcv==1.7.0 8 | numpy==1.23.5 9 | protobuf==5.28.3 10 | sentencepiece==0.2.0 11 | tqdm==4.66.4 12 | timm==0.9.16 13 | transformers==4.39.3 14 | gradio==4.40.0 -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .download import * 2 | from .resolution import * -------------------------------------------------------------------------------- /utils/download.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import os 3 | 4 | def find_model(model_name): 5 | """ 6 | Finds a pre-trained G.pt model, downloading it if necessary. Alternatively, loads a model from a local path. 7 | """ 8 | assert os.path.isfile(model_name), f'Could not find checkpoint at {model_name}' 9 | return torch.load(model_name, map_location=lambda storage, loc: storage) 10 | 11 | -------------------------------------------------------------------------------- /utils/resolution.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | ASPECT_RATIO_1024 = { 4 | '0.25': [512., 2048.], '0.26': [512., 1984.], '0.27': [512., 1920.], '0.28': [512., 1856.], 5 | '0.32': [576., 1792.], '0.33': [576., 1728.], '0.35': [576., 1664.], '0.4': [640., 1600.], 6 | '0.42': [640., 1536.], '0.48': [704., 1472.], '0.5': [704., 1408.], '0.52': [704., 1344.], 7 | '0.57': [768., 1344.], '0.6': [768., 1280.], '0.68': [832., 1216.], '0.72': [832., 1152.], 8 | '0.78': [896., 1152.], '0.82': [896., 1088.], '0.88': [960., 1088.], '0.94': [960., 1024.], 9 | '1.0': [1024., 1024.], '1.07': [1024., 960.], '1.13': [1088., 960.], '1.21': [1088., 896.], 10 | '1.29': [1152., 896.], '1.38': [1152., 832.], '1.46': [1216., 832.], '1.67': [1280., 768.], 11 | '1.75': [1344., 768.], '2.0': [1408., 704.], '2.09': [1472., 704.], '2.4': [1536., 640.], 12 | '2.5': [1600., 640.], '2.89': [1664., 576.], '3.0': [1728., 576.], '3.11': [1792., 576.], 13 | '3.62': [1856., 512.], '3.75': [1920., 512.], '3.88': [1984., 512.], '4.0': [2048., 512.], 14 | } 15 | 16 | ASPECT_RATIO_512 = { 17 | '0.25': [256.0, 1024.0], '0.26': [256.0, 992.0], '0.27': [256.0, 960.0], '0.28': [256.0, 928.0], 18 | '0.32': [288.0, 896.0], '0.33': [288.0, 864.0], '0.35': [288.0, 832.0], '0.4': [320.0, 800.0], 19 | '0.42': [320.0, 768.0], '0.48': [352.0, 736.0], '0.5': [352.0, 704.0], '0.52': [352.0, 672.0], 20 | '0.57': [384.0, 672.0], '0.6': [384.0, 640.0], '0.68': [416.0, 608.0], '0.72': [416.0, 576.0], 21 | '0.78': [448.0, 576.0], '0.82': [448.0, 544.0], '0.88': [480.0, 544.0], '0.94': [480.0, 512.0], 22 | '1.0': [512.0, 512.0], '1.07': [512.0, 480.0], '1.13': [544.0, 480.0], '1.21': [544.0, 448.0], 23 | '1.29': [576.0, 448.0], '1.38': [576.0, 416.0], '1.46': [608.0, 416.0], '1.67': [640.0, 384.0], 24 | '1.75': [672.0, 384.0], '2.0': [704.0, 352.0], '2.09': 
[736.0, 352.0], '2.4': [768.0, 320.0], 25 | '2.5': [800.0, 320.0], '2.89': [832.0, 288.0], '3.0': [864.0, 288.0], '3.11': [896.0, 288.0], 26 | '3.62': [928.0, 256.0], '3.75': [960.0, 256.0], '3.88': [992.0, 256.0], '4.0': [1024.0, 256.0] 27 | } 28 | 29 | ASPECT_RATIO_256 = { 30 | '0.25': [128.0, 512.0], '0.26': [128.0, 496.0], '0.27': [128.0, 480.0], '0.28': [128.0, 464.0], 31 | '0.32': [144.0, 448.0], '0.33': [144.0, 432.0], '0.35': [144.0, 416.0], '0.4': [160.0, 400.0], 32 | '0.42': [160.0, 384.0], '0.48': [176.0, 368.0], '0.5': [176.0, 352.0], '0.52': [176.0, 336.0], 33 | '0.57': [192.0, 336.0], '0.6': [192.0, 320.0], '0.68': [208.0, 304.0], '0.72': [208.0, 288.0], 34 | '0.78': [224.0, 288.0], '0.82': [224.0, 272.0], '0.88': [240.0, 272.0], '0.94': [240.0, 256.0], 35 | '1.0': [256.0, 256.0], '1.07': [256.0, 240.0], '1.13': [272.0, 240.0], '1.21': [272.0, 224.0], 36 | '1.29': [288.0, 224.0], '1.38': [288.0, 208.0], '1.46': [304.0, 208.0], '1.67': [320.0, 192.0], 37 | '1.75': [336.0, 192.0], '2.0': [352.0, 176.0], '2.09': [368.0, 176.0], '2.4': [384.0, 160.0], 38 | '2.5': [400.0, 160.0], '2.89': [416.0, 144.0], '3.0': [432.0, 144.0], '3.11': [448.0, 144.0], 39 | '3.62': [464.0, 128.0], '3.75': [480.0, 128.0], '3.88': [496.0, 128.0], '4.0': [512.0, 128.0] 40 | } 41 | 42 | ASPECT_RATIO_256_TEST = { 43 | '0.25': [128.0, 512.0], '0.28': [128.0, 464.0], 44 | '0.32': [144.0, 448.0], '0.33': [144.0, 432.0], '0.35': [144.0, 416.0], '0.4': [160.0, 400.0], 45 | '0.42': [160.0, 384.0], '0.48': [176.0, 368.0], '0.5': [176.0, 352.0], '0.52': [176.0, 336.0], 46 | '0.57': [192.0, 336.0], '0.6': [192.0, 320.0], '0.68': [208.0, 304.0], '0.72': [208.0, 288.0], 47 | '0.78': [224.0, 288.0], '0.82': [224.0, 272.0], '0.88': [240.0, 272.0], '0.94': [240.0, 256.0], 48 | '1.0': [256.0, 256.0], '1.07': [256.0, 240.0], '1.13': [272.0, 240.0], '1.21': [272.0, 224.0], 49 | '1.29': [288.0, 224.0], '1.38': [288.0, 208.0], '1.46': [304.0, 208.0], '1.67': [320.0, 192.0], 50 | '1.75': [336.0, 192.0], '2.0': [352.0, 176.0], '2.09': [368.0, 176.0], '2.4': [384.0, 160.0], 51 | '2.5': [400.0, 160.0], '3.0': [432.0, 144.0], 52 | '4.0': [512.0, 128.0] 53 | } 54 | 55 | ASPECT_RATIO_512_TEST = { 56 | '0.25': [256.0, 1024.0], '0.28': [256.0, 928.0], 57 | '0.32': [288.0, 896.0], '0.33': [288.0, 864.0], '0.35': [288.0, 832.0], '0.4': [320.0, 800.0], 58 | '0.42': [320.0, 768.0], '0.48': [352.0, 736.0], '0.5': [352.0, 704.0], '0.52': [352.0, 672.0], 59 | '0.57': [384.0, 672.0], '0.6': [384.0, 640.0], '0.68': [416.0, 608.0], '0.72': [416.0, 576.0], 60 | '0.78': [448.0, 576.0], '0.82': [448.0, 544.0], '0.88': [480.0, 544.0], '0.94': [480.0, 512.0], 61 | '1.0': [512.0, 512.0], '1.07': [512.0, 480.0], '1.13': [544.0, 480.0], '1.21': [544.0, 448.0], 62 | '1.29': [576.0, 448.0], '1.38': [576.0, 416.0], '1.46': [608.0, 416.0], '1.67': [640.0, 384.0], 63 | '1.75': [672.0, 384.0], '2.0': [704.0, 352.0], '2.09': [736.0, 352.0], '2.4': [768.0, 320.0], 64 | '2.5': [800.0, 320.0], '3.0': [864.0, 288.0], 65 | '4.0': [1024.0, 256.0] 66 | } 67 | 68 | ASPECT_RATIO_1024_TEST = { 69 | '0.25': [512., 2048.], '0.28': [512., 1856.], 70 | '0.32': [576., 1792.], '0.33': [576., 1728.], '0.35': [576., 1664.], '0.4': [640., 1600.], 71 | '0.42': [640., 1536.], '0.48': [704., 1472.], '0.5': [704., 1408.], '0.52': [704., 1344.], 72 | '0.57': [768., 1344.], '0.6': [768., 1280.], '0.68': [832., 1216.], '0.72': [832., 1152.], 73 | '0.78': [896., 1152.], '0.82': [896., 1088.], '0.88': [960., 1088.], '0.94': [960., 1024.], 74 | '1.0': [1024., 1024.], '1.07': [1024., 
960.], '1.13': [1088., 960.], '1.21': [1088., 896.], 75 | '1.29': [1152., 896.], '1.38': [1152., 832.], '1.46': [1216., 832.], '1.67': [1280., 768.], 76 | '1.75': [1344., 768.], '2.0': [1408., 704.], '2.09': [1472., 704.], '2.4': [1536., 640.], 77 | '2.5': [1600., 640.], '3.0': [1728., 576.], 78 | '4.0': [2048., 512.], 79 | } 80 | 81 | 82 | def get_chunks(lst, n): 83 | for i in range(0, len(lst), n): 84 | yield lst[i:i + n] 85 | 86 | def get_closest_ratio(height: float, width: float, ratios: dict): 87 | aspect_ratio = height / width 88 | closest_ratio = min(ratios.keys(), key=lambda ratio: abs(float(ratio) - aspect_ratio)) 89 | return ratios[closest_ratio], float(closest_ratio) 90 | --------------------------------------------------------------------------------
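Usage note (illustrative, not part of the repository files above): the aspect-ratio tables in utils/resolution.py are bucket lookups keyed by height/width ratio, and get_closest_ratio snaps an arbitrary image size to the nearest supported bucket before resizing. A minimal sketch, assuming it is run from the repository root; the variable names are hypothetical.

from utils.resolution import ASPECT_RATIO_1024_TEST, get_closest_ratio

height, width = 900.0, 1600.0  # e.g. an incoming 16:9-ish image
(target_h, target_w), ratio = get_closest_ratio(height, width, ASPECT_RATIO_1024_TEST)
# 900 / 1600 = 0.5625 -> closest key '0.57' -> bucket [768., 1344.]
print(ratio, target_h, target_w)  # 0.57 768.0 1344.0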