├── tag_content_process.py
└── README.md

/tag_content_process.py:
--------------------------------------------------------------------------------
import os

folder_path = '.../text-prompts'
# This folder contains the original text prompt files generated by BLIP.
save_folder_path = '.../processed-text-prompts'
# This folder receives the processed text prompt files
# (poor tags removed, trigger word added).

template = '360'

for file_name in os.listdir(folder_path):

    file_path = os.path.join(folder_path, file_name)
    file_save_path = os.path.join(save_folder_path, file_name)

    with open(file_path, 'r', encoding='utf-8') as file:
        line = file.readline()

    # Drop every tag that contains '360' (spaces are removed before the
    # check so spaced variants are caught too), then prepend the trigger
    # word '360-degree panoramic image'.
    kept_tags = [tag for tag in line.split(',')
                 if template not in tag.replace(' ', '')]
    new_line = '360-degree panoramic image, ' + ','.join(kept_tags)

    with open(file_save_path, 'w', encoding='utf-8') as updated_file:
        updated_file.write(new_line)
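
# A hypothetical example of the transformation above (the caption is
# invented for illustration): a BLIP prompt file whose first line is
#     a 360 view of a beach, palm trees, blue sky
# is saved as (up to spacing around the commas)
#     360-degree panoramic image, palm trees, blue sky
# i.e. the tag containing '360' is removed and the trigger word is prepended.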
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# StitchDiffusion (Continuously Updated)
Customizing 360-Degree Panoramas through Text-to-Image Diffusion Models \
[Hai Wang](https://littlewhitesea.github.io/), [Xiaoyu Xiang](https://xiaoyux1ang.github.io/), [Yuchen Fan](https://ychfan.github.io/), [Jing-Hao Xue](https://www.homepages.ucl.ac.uk/~ucakjxu/)

[![Project](https://img.shields.io/badge/Project-Website-orange)](https://littlewhitesea.github.io/stitchdiffusion.github.io/)
[![arXiv](https://img.shields.io/badge/arXiv-2310.18840-b31b1b.svg)](https://arxiv.org/abs/2310.18840)

### [Data](https://drive.google.com/file/d/1EgRwj5BqO7Y-PvdL8mrFwKsqmgN_N4_b/view?usp=sharing) | [Pretrained Model](https://drive.google.com/file/d/1MiaG8v0ZmkTwwrzIEFtVoBj-Jjqi_5lz/view?usp=sharing) | [8K Data](https://drive.google.com/file/d/1RFfLH6zzwsd3rlRRxuN-RtBWv6BsWXMO/view?usp=sharing)

## [Runnable code](https://github.com/lshus/stitchdiffusion-colab) based on diffusers, implemented by @[lshus](https://github.com/lshus)
~~[Colab](https://github.com/lshus/stitchdiffusion-colab) was implemented by @[lshus](https://github.com/lshus).~~

## StitchDiffusion Code

StitchDiffusion is a tailored generation (denoising) process for synthesizing 360-degree panoramas; we provide its core code here.

```python
## following MultiDiffusion: https://github.com/omerbt/MultiDiffusion/blob/master/panorama.py ##
## the window size is changed for 360-degree panorama generation ##
def get_views(panorama_height, panorama_width, window_size=[64, 128], stride=16):
    # work at latent resolution: the VAE downsamples by a factor of 8
    panorama_height /= 8
    panorama_width /= 8
    num_blocks_height = (panorama_height - window_size[0]) // stride + 1
    num_blocks_width = (panorama_width - window_size[1]) // stride + 1
    total_num_blocks = int(num_blocks_height * num_blocks_width)
    views = []
    for i in range(total_num_blocks):
        h_start = int((i // num_blocks_width) * stride)
        h_end = h_start + window_size[0]
        w_start = int((i % num_blocks_width) * stride)
        w_end = w_start + window_size[1]
        views.append((h_start, h_end, w_start, w_end))
    return views
```
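As a quick sanity check (illustrative, not part of the released code), for the 512x2048 setting used below `get_views` operates on a 64x256 latent and returns nine 64x128 windows sliding horizontally with stride 16, which together cover every latent column:

```python
# Hypothetical check of get_views for the panorama size used below.
views = get_views(512, 2048)
print(len(views))   # 9
print(views[0])     # (0, 64, 0, 128)
print(views[-1])    # (0, 64, 128, 256)

# every column of the 64x256 latent is covered by at least one window
covered = set()
for _, _, w_start, w_end in views:
    covered.update(range(w_start, w_end))
assert covered == set(range(256))
```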
```python
#####################
## StitchDiffusion ##
#####################

views_t = get_views(height, width)  # height = 512; width = 4 * height = 2048
count_t = torch.zeros_like(latents)
value_t = torch.zeros_like(latents)
# latents are sampled from a standard normal distribution (torch.randn())
# with a size of Bx4x64x256, where B denotes the batch size.

for i, t in enumerate(tqdm(timesteps)):

    count_t.zero_()
    value_t.zero_()

    # initialize the value of latent_view_t; .clone() keeps the in-place
    # writes below from modifying `latents` itself (slicing returns a view)
    latent_view_t = latents[:, :, :, 64:192].clone()

    #### pre-denoising operations twice on the stitch block ####
    for ii_md in range(2):

        latent_view_t[:, :, :, 0:64] = latents[:, :, :, 192:256]   # left part of the stitch block
        latent_view_t[:, :, :, 64:128] = latents[:, :, :, 0:64]    # right part of the stitch block

        # expand the latents if we are doing classifier-free guidance
        latent_model_input = latent_view_t.repeat((2, 1, 1, 1))

        # predict the noise residual
        noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings)['sample']

        # perform guidance
        noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
        noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

        # compute the denoising step with the reference (customized) model
        latent_view_denoised = self.scheduler.step(noise_pred, t, latent_view_t)['prev_sample']

        value_t[:, :, :, 192:256] += latent_view_denoised[:, :, :, 0:64]
        count_t[:, :, :, 192:256] += 1

        value_t[:, :, :, 0:64] += latent_view_denoised[:, :, :, 64:128]
        count_t[:, :, :, 0:64] += 1

    # same denoising operations as what MultiDiffusion does
    for h_start, h_end, w_start, w_end in views_t:

        latent_view_t = latents[:, :, h_start:h_end, w_start:w_end]

        # expand the latents if we are doing classifier-free guidance
        latent_model_input = latent_view_t.repeat((2, 1, 1, 1))
        latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

        # predict the noise residual
        noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings)['sample']

        # perform guidance
        noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
        noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

        # compute the denoising step with the reference (customized) model
        latent_view_denoised = self.scheduler.step(noise_pred, t, latent_view_t)['prev_sample']
        value_t[:, :, h_start:h_end, w_start:w_end] += latent_view_denoised
        count_t[:, :, h_start:h_end, w_start:w_end] += 1

    # average all windows covering each latent position (MultiDiffusion aggregation)
    latents = torch.where(count_t > 0, value_t / count_t, value_t)

latents = 1 / 0.18215 * latents  # undo the SD latent scaling factor
image = self.vae.decode(latents).sample
image = (image / 2 + 0.5).clamp(0, 1)

#### global cropping operation ####
# keep the central 1024 of the 2048 decoded pixel columns as the final panorama
image = image[:, :, :, 512:1536]
image = image.cpu().permute(0, 2, 3, 1).float().numpy()
```

## Useful Tools

[360 panoramic images viewer](https://renderstuff.com/tools/360-panorama-web-viewer/): can be used to view the synthesized 360-degree panoramas.

[Seamless Texture Checker](https://www.pycheung.com/checker/): can be used to check the continuity between the leftmost and rightmost sides of a generated image.

[clip-interrogator](https://github.com/pharmapsychotic/clip-interrogator?tab=readme-ov-file): provides a Google Colab of BLIP for generating text prompts.

[CLIP](https://github.com/OpenAI/CLIP): provides a Google Colab for calculating the CLIP score.

[FID](https://github.com/GaParmar/clean-fid): provides a Google Colab for calculating FID.

## Statement
This research was done by Hai Wang at University College London. The code and released models are owned by Hai Wang.

## Citation
If you find the code helpful in your research or work, please cite our paper:
```Bibtex
@inproceedings{wang2024customizing,
  title={Customizing 360-Degree Panoramas through Text-to-Image Diffusion Models},
  author={Wang, Hai and Xiang, Xiaoyu and Fan, Yuchen and Xue, Jing-Hao},
  booktitle={Proceedings of the IEEE/CVF Winter Conference on Applications of Computer Vision},
  pages={4933--4943},
  year={2024}
}
```
## Acknowledgments
We thank [MultiDiffusion](https://github.com/omerbt/MultiDiffusion); our work builds on their excellent code.
--------------------------------------------------------------------------------