├── IQA.py
├── README.md
├── Word2Vec
    └── .gitkeep
├── classification.py
├── examples
    ├── framework.png
    ├── ptp_utils.py
    └── seq_aligner.py
├── get_object_attention_mask.py
├── mini_100.txt
├── modified_clip.py
├── modified_stable_diffusion_pipeline.py
├── object_config.json
├── optim_utils.py
├── perceptrontagger_model
    └── averaged_perceptron_tagger.pickle
├── pos_tagger.py
├── referenced_images
    ├── objects
    │   ├── cock
    │   │   ├── 0.png
    │   │   ├── 1.png
    │   │   ├── 2.png
    │   │   ├── 3.png
    │   │   ├── 4.png
    │   │   ├── 5.png
    │   │   ├── 6.png
    │   │   ├── 7.png
    │   │   ├── 8.png
    │   │   └── 9.png
    │   ├── forbidden_words.txt
    │   ├── mushroom
    │   │   ├── 0.jpg
    │   │   ├── 1.jpg
    │   │   ├── 2.jpg
    │   │   ├── 3.jpg
    │   │   ├── 4.jpg
    │   │   ├── 5.jpg
    │   │   ├── 6.jpg
    │   │   ├── 7.jpg
    │   │   ├── 8.jpg
    │   │   └── 9.jpg
    │   ├── panda
    │   │   ├── 0.jpg
    │   │   ├── 1.jpg
    │   │   ├── 2.jpg
    │   │   ├── 3.jpg
    │   │   ├── 4.jpg
    │   │   ├── 5.jpg
    │   │   ├── 6.jpg
    │   │   ├── 7.jpg
    │   │   ├── 8.jpg
    │   │   └── 9.jpg
    │   ├── peony
    │   │   ├── 0.jpg
    │   │   ├── 1.jpg
    │   │   ├── 2.jpg
    │   │   ├── 3.jpg
    │   │   ├── 4.jpg
    │   │   ├── 5.jpg
    │   │   ├── 6.jpg
    │   │   ├── 7.jpg
    │   │   ├── 8.jpg
    │   │   └── 9.jpg
    │   ├── pizza
    │   │   ├── 0.jpg
    │   │   ├── 1.jpg
    │   │   ├── 2.jpg
    │   │   ├── 3.jpg
    │   │   ├── 4.jpg
    │   │   ├── 5.jpg
    │   │   ├── 6.jpg
    │   │   ├── 7.jpg
    │   │   ├── 8.jpg
    │   │   └── 9.jpg
    │   ├── target_object.txt
    │   ├── toucan
    │   │   ├── 0.jpg
    │   │   ├── 1.jpg
    │   │   ├── 2.jpg
    │   │   ├── 3.jpg
    │   │   ├── 4.jpg
    │   │   ├── 5.jpg
    │   │   ├── 6.jpg
    │   │   ├── 7.jpg
    │   │   ├── 8.jpg
    │   │   └── 9.jpg
    │   ├── tractor
    │   │   ├── 0.jpg
    │   │   ├── 1.jpg
    │   │   ├── 2.jpg
    │   │   ├── 3.jpg
    │   │   ├── 4.jpg
    │   │   ├── 5.jpg
    │   │   ├── 6.jpg
    │   │   ├── 7.jpg
    │   │   ├── 8.jpg
    │   │   └── 9.jpg
    │   ├── vampire
    │   │   ├── 0.jpg
    │   │   ├── 1.jpg
    │   │   ├── 2.jpg
    │   │   ├── 3.jpg
    │   │   ├── 4.jpg
    │   │   ├── 5.jpg
    │   │   ├── 6.jpg
    │   │   ├── 7.jpg
    │   │   ├── 8.jpg
    │   │   └── 9.jpg
    │   ├── warplane
    │   │   ├── 0.jpg
    │   │   ├── 1.jpg
    │   │   ├── 2.jpg
    │   │   ├── 3.jpg
    │   │   ├── 4.jpg
    │   │   ├── 5.jpg
    │   │   ├── 6.jpg
    │   │   ├── 7.jpg
    │   │   ├── 8.jpg
    │   │   └── 9.jpg
    │   └── zombie
    │   │   ├── 0.jpg
    │   │   ├── 1.jpg
    │   │   ├── 2.jpg
    │   │   ├── 3.jpg
    │   │   ├── 4.jpg
    │   │   ├── 5.jpg
    │   │   ├── 6.jpg
    │   │   ├── 7.jpg
    │   │   ├── 8.jpg
    │   │   └── 9.jpg
    └── style
    │   ├── animation
    │       ├── Amongst_the_ruins,_a_leopard_is_roaring.jpg
    │       ├── An_Indian_elephant_adorned_with_a_golden_saddle_paraded_through_the_forest.jpg
    │       ├── The_English_setter_gracefully_pointed_towards_hidden_game_in_the_tall_grass.jpg
    │       ├── The_king_snake_slithered_stealthily_through_the_fallen_leaves.jpg
    │       ├── The_leopard_stealthily_prowled_through_the_moonlit_castle_courtyard.jpg
    │       ├── The_ram_playfully_butted_heads_with_its_fellow_herd_members.jpg
    │       ├── a_photo_of_consomme.jpg
    │       ├── a_photo_of_garbage_truck.jpg
    │       ├── a_photo_of_hotdog.jpg
    │       ├── a_photo_of_laptop.jpg
    │       ├── a_photo_of_military_uniform.jpg
    │       ├── a_photo_of_submarine.jpg
    │       ├── a_photo_of_tractor.jpg
    │       ├── a_photo_of_violin.jpg
    │       └── a_ram_grazed_peacefully_under_the_stars.jpg
    │   ├── forbidden_words.txt
    │   ├── oil painting
    │       ├── Amongst_the_ruins,_a_leopard_is_roaring.jpg
    │       ├── An_Indian_elephant_adorned_with_a_golden_saddle_paraded_through_the_forest.jpg
    │       ├── The_English_setter_gracefully_pointed_towards_hidden_game_in_the_tall_grass.jpg
    │       ├── The_king_snake_slithered_stealthily_through_the_fallen_leaves.jpg
    │       ├── The_leopard_stealthily_prowled_through_the_moonlit_castle_courtyard.jpg
    │       ├── The_ram_playfully_butted_heads_with_its_fellow_herd_members.jpg
    │       ├── a_photo_of_consomme.jpg
    │       ├── a_photo_of_garbage_truck.jpg
    │       ├── a_photo_of_hotdog.jpg
    │       ├── a_photo_of_laptop.jpg
    │       ├── a_photo_of_military_uniform.jpg
    │       ├── a_photo_of_submarine.jpg
    │       ├── a_photo_of_tractor.jpg
    │       ├── a_photo_of_violin.jpg
    │       └── a_ram_grazed_peacefully_under_the_stars.jpg
    │   ├── sketch
    │       ├── Amongst_the_ruins,_a_leopard_is_roaring.jpg
    │       ├── An_Indian_elephant_paraded_through_the_forest.jpg
    │       ├── The_English_setter_gracefully_ran_in_the_tall_grass.jpg
    │       ├── The_king_snake_slithered_stealthily_through_the_fallen_leaves.jpg
    │       ├── The_leopard_stealthily_prowled_through_the_moonlit_castle_courtyard.jpg
    │       ├── The_ram_playfully_butted_heads_with_its_fellow_herd_members.jpg
    │       ├── a_photo_of_consomme.jpg
    │       ├── a_photo_of_garbage_truck.jpg
    │       ├── a_photo_of_hotdog.jpg
    │       ├── a_photo_of_laptop.jpg
    │       ├── a_photo_of_military_uniform.jpg
    │       ├── a_photo_of_submarine.jpg
    │       ├── a_photo_of_tractor.jpg
    │       ├── a_photo_of_violin.jpg
    │       └── a_ram_grazed_peacefully_under_the_stars.jpg
    │   ├── target_style.txt
    │   └── watercolor
    │       ├── Amongst_the_ruins,_a_leopard_is_roaring.jpg
    │       ├── An_Indian_elephant_paraded_through_the_forest.jpg
    │       ├── The_English_setter_gracefully_ran_in_the_tall_grass.jpg
    │       ├── The_king_snake_slithered_stealthily_through_the_fallen_leaves.jpg
    │       ├── The_leopard_stealthily_prowled_through_the_moonlit_castle_courtyard.jpg
    │       ├── The_ram_playfully_butted_heads_with_its_fellow_herd_members.jpg
    │       ├── a_photo_of_consomme.jpg
    │       ├── a_photo_of_garbage_truck.jpg
    │       ├── a_photo_of_hotdog.jpg
    │       ├── a_photo_of_laptop.jpg
    │       ├── a_photo_of_military_uniform.jpg
    │       ├── a_photo_of_submarine.jpg
    │       ├── a_photo_of_tractor.jpg
    │       ├── a_photo_of_violin.jpg
    │       └── a_ram_grazed_peacefully_under_the_stars.jpg
├── requirements.txt
├── run.py
├── simple_prompt.txt
├── style_config.json
├── synonym.py
├── test_object_multi.py
└── test_style_multi.py


/IQA.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import torch.nn as nn
  3 | from torchvision.models import inception_v3, Inception_V3_Weights
  4 | from torchvision.transforms import ToTensor
  5 | import argparse
  6 | import os
  7 | from PIL import Image
  8 | from shutil import move
  9 | import tqdm
 10 | from scipy import linalg
 11 | from test_style import metric
 12 | from classification import ClassificationModel
 13 | import numpy as np
 14 | import scipy
 15 | import pdb
 16 | from PIL import ImageFile
 17 | 
 18 | ImageFile.LOAD_TRUNCATED_IMAGES = True
 19 | 
 20 | 
 21 | class InceptionV3(nn.Module):
 22 |     def __init__(self):
 23 |         super().__init__()
 24 |         inception = inception_v3(weights=Inception_V3_Weights.IMAGENET1K_V1)
 25 |         self.block1 = nn.Sequential(
 26 |             inception.Conv2d_1a_3x3, inception.Conv2d_2a_3x3,
 27 |             inception.Conv2d_2b_3x3,
 28 |             nn.MaxPool2d(kernel_size=3, stride=2))
 29 |         self.block2 = nn.Sequential(
 30 |             inception.Conv2d_3b_1x1, inception.Conv2d_4a_3x3,
 31 |             nn.MaxPool2d(kernel_size=3, stride=2))
 32 |         self.block3 = nn.Sequential(
 33 |             inception.Mixed_5b, inception.Mixed_5c,
 34 |             inception.Mixed_5d, inception.Mixed_6a,
 35 |             inception.Mixed_6b, inception.Mixed_6c,
 36 |             inception.Mixed_6d, inception.Mixed_6e)
 37 |         self.block4 = nn.Sequential(
 38 |             inception.Mixed_7a, inception.Mixed_7b,
 39 |             inception.Mixed_7c,
 40 |             nn.AdaptiveAvgPool2d(output_size=(1, 1)))
 41 |         self.dropout = inception.dropout
 42 |         self.fc = inception.fc
 43 | 
 44 |     def forward(self, x):
 45 |         x = self.block1(x)
 46 |         x = self.block2(x)
 47 |         x = self.block3(x)
 48 |         x = self.block4(x)
 49 |         flatten = torch.flatten(self.dropout(x), 1)
 50 |         preds = self.fc(flatten)
 51 |         return x.view(x.size(0), -1), preds
 52 | 
 53 | 
 54 | def inception_score(images, batch_size=100, splits=10):
 55 |     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 56 |     model = inception_v3(weights=True, transform_input=False).to(device)
 57 |     model.eval()
 58 | 
 59 |     scores = []
 60 |     entropys = []
 61 |     num_batches = len(images) // batch_size
 62 | 
 63 |     with torch.no_grad():
 64 |         for i in tqdm.tqdm(range(num_batches)):
 65 |             batch = torch.stack([ToTensor()(img).to(device) for img in images[i * batch_size:(i + 1) * batch_size]])
 66 |             preds = nn.Softmax(dim=1)(model(batch))
 67 |             p_yx = preds.log()
 68 |             p_y = preds.mean(dim=0).log()
 69 |             entropy = torch.sum(preds * p_yx, dim=1).mean()
 70 |             kl_divergence = torch.sum(preds * (p_yx - p_y), dim=1).mean()
 71 |             entropys.append(torch.exp(entropy))
 72 |             scores.append(torch.exp(kl_divergence))
 73 |     entropys = torch.stack(entropys)
 74 |     mean_entropys = entropys.mean()
 75 |     std_entropys = entropys.std(dim=-1)
 76 | 
 77 |     scores = torch.stack(scores)
 78 |     mean_score = scores.mean()
 79 |     std_score = scores.std(dim=-1)
 80 |     return mean_score.item(), std_score.item(), mean_entropys, std_entropys
 81 | 
 82 | 
 83 | def calculate_activation_statistics(images, model):
 84 |     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 85 |     model.to(device)
 86 |     model.eval()
 87 | 
 88 |     act_values = []
 89 | 
 90 |     with torch.no_grad():
 91 |         for img in tqdm.tqdm(images):
 92 |             img = ToTensor()(img).unsqueeze(0).to(device)
 93 |             act = model(img).detach().cpu()
 94 |             act_values.append(act)
 95 | 
 96 |     act_values = torch.cat(act_values, dim=0).detach().numpy()
 97 |     mu = np.mean(act_values, axis=0)
 98 |     sigma = np.cov(act_values, rowvar=False)
 99 | 
100 |     return mu, sigma
101 | 
102 | 
103 | def frechet_distance(mu, cov, mu2, cov2):
104 |     cc, _ = linalg.sqrtm(np.dot(cov, cov2), disp=False)
105 |     dist = np.sum((mu - mu2) ** 2) + np.trace(cov + cov2 - 2 * cc)
106 |     return np.real(dist)
107 | 
108 | 
109 | def frechet_inception_distance(real_images, generated_images, batch_size=32):
110 |     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
111 |     model = InceptionV3().to(device)
112 |     model.eval()
113 |     print("get features of the real images")
114 |     mu_real, sigma_real = calculate_activation_statistics(real_images, model)
115 |     print("get features of the generated images")
116 |     mu_fake, sigma_fake = calculate_activation_statistics(generated_images, model)
117 | 
118 |     fid_score = frechet_distance(mu_real, sigma_real, mu_fake, sigma_fake)
119 |     return fid_score.item()
120 | 
121 | 
122 | def get_model_outputs(model, images, batch_size=100):
123 |     preds, act_values = [], []
124 |     num_batches = len(images) // batch_size
125 |     assert num_batches * batch_size == len(images)
126 |     with torch.no_grad():
127 |         for i in tqdm.tqdm(range(num_batches)):
128 |             batch = torch.stack([ToTensor()(img).to(device) for img in images[i * batch_size:(i + 1) * batch_size]])
129 |             act, pred = model(batch)
130 |             pred = nn.Softmax(dim=1)(pred)
131 |             act_values.append(act)
132 |             preds.append(pred)
133 | 
134 |     act_values = torch.stack(act_values, dim=0).view(-1, act.size(-1))
135 |     preds = torch.cat(preds, dim=0).view(-1, pred.size(-1))
136 | 
137 |     return act_values.cpu().numpy(), preds.cpu().numpy()
138 | 
139 | if __name__ == "__main__":
140 |     parser = argparse.ArgumentParser()
141 |     parser.add_argument('--gen_img_path', type=str, required=True, help='the path of generated images')
142 |     parser.add_argument('--task', type=str, default='object', help='object task or style task')
143 |     parser.add_argument('--attack_goal_path', type=str, default=None, help='the path of referenced images')
144 |     parser.add_argument('--metric', type=str, default='image_quality', help='[image_quality, attack_acc]')
145 |     parser.add_argument('--depth', type=int, default=4, help='3 or 4 for the dir depth')
146 |     args = parser.parse_args()
147 | 
148 |     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
149 |     model = InceptionV3().to(device)
150 |     model.eval()
151 | 
152 |     gen_imgs = []
153 |     gen_img_ori_labels = []
154 |     gen_img_goal_labels = []
155 |     if args.depth == 4:
156 |         for goal_dir in os.listdir(args.gen_img_path):
157 |             goal_path = os.path.join(args.gen_img_path, goal_dir)
158 |             if not os.path.isdir(goal_path):
159 |                 continue
160 |             image_num_per_goal = 0
161 |             if os.path.isdir(goal_path):
162 |                 for prompt_dir in os.listdir(goal_path):
163 |                     cur_path = os.path.join(goal_path, prompt_dir)
164 |                     if os.path.isdir(cur_path):
165 |                         for adv_prompt in os.listdir(cur_path):
166 |                             img_dir = os.path.join(cur_path, adv_prompt)
167 |                             for img_name in os.listdir(img_dir):
168 |                                 assert img_name.endswith(".png")
169 |                                 gen_imgs.append(Image.open(os.path.join(img_dir, img_name)))
170 |                                 gen_img_ori_labels.append(prompt_dir.split("_")[0])
171 |                                 gen_img_goal_labels.append(goal_dir)
172 |                                 image_num_per_goal += 1
173 | 
174 |                 print(f"{goal_dir} goal has {image_num_per_goal} images")
175 |     elif args.depth == 3:
176 |         for prompt_dir in os.listdir(args.gen_img_path):
177 |             cur_path = os.path.join(args.gen_img_path, prompt_dir)
178 |             if os.path.isdir(cur_path):
179 |                 for adv_prompt in os.listdir(cur_path):
180 |                     if prompt_dir.replace("-", "_-_").replace(" ", "_").lower() in adv_prompt:
181 |                         img_dir = os.path.join(cur_path, adv_prompt)
182 |                         print(f"{prompt_dir} has {len(os.listdir(img_dir))} images")
183 |                         for img_name in os.listdir(img_dir):
184 |                             assert img_name.endswith(".png")
185 |                             gen_imgs.append(Image.open(os.path.join(img_dir, img_name)))
186 |                             gen_img_ori_labels.append(prompt_dir)
187 |                             gen_img_goal_labels.append(args.gen_img_path.split("/")[-1])
188 |                     else:
189 |                         pdb.set_trace()
190 |     print(f"Total has {len(gen_imgs)} generated images")
191 | 
192 |     if args.metric == "image_quality":
193 |         gen_images_num_per_goal = 100 * 10  # 100 prompt, 10 images per prompt
194 |         goal_num = len(gen_imgs) // gen_images_num_per_goal
195 |         with open(args.attack_goal_path, "r") as f:
196 |             real_img_path = f.readlines()
197 |             goals = [goal_path.split("/")[-1].lower().strip() for goal_path in real_img_path]
198 |         assert goal_num == len(real_img_path), "The goal num of generate images do not equal to the category number " \
199 |                                                f"of target images, but get {goal_num} goal num and {len(real_img_path)} " \
200 |                                                f"category num"
201 |         total_IS, total_FID = [], []
202 |         for i in range(goal_num):
203 |             gen_images_per_goal = gen_imgs[i * gen_images_num_per_goal:(i + 1) * gen_images_num_per_goal]
204 |             gen_images_goal = gen_img_goal_labels[i * gen_images_num_per_goal:(i + 1) * gen_images_num_per_goal]
205 |             assert len(
206 |                 set(gen_images_goal)) == 1, f"multi goals happened in computing FID score, {set(gen_images_goal)}"
207 |             goal = gen_images_goal[0]
208 |             goal_img_path = real_img_path[goals.index(goal)].strip()
209 |             real_imgs = []
210 |             for img_name in os.listdir(goal_img_path):
211 |                 img_path = os.path.join(goal_img_path, img_name)
212 |                 real_imgs.append(Image.open(img_path))
213 |             print(f"Total has {len(real_imgs)} real images in {goal} goal")
214 | 
215 |             gen_acts, gen_preds = get_model_outputs(model, gen_images_per_goal, batch_size=10)
216 |             mu_gen = np.mean(gen_acts, axis=0)
217 |             sigma_gen = np.cov(gen_acts, rowvar=False)
218 | 
219 |             ## IS score
220 |             IS_batch_size = 10
221 |             IS_batch = len(gen_images_per_goal) // IS_batch_size
222 |             split_entropys, split_scores = [], []
223 |             for i in range(IS_batch):
224 |                 cur_preds = gen_preds[i * IS_batch_size: min((i + 1) * IS_batch_size, len(gen_images_per_goal))]
225 |                 py = np.mean(cur_preds, axis=0)
226 |                 scores, entropys = [], []
227 |                 for j in range(cur_preds.shape[0]):
228 |                     pyx = cur_preds[j, :]
229 |                     scores.append(scipy.stats.entropy(pyx, py))
230 |                     entropys.append(scipy.stats.entropy(pyx))
231 |                 split_scores.append(np.exp(np.mean(scores)))
232 |                 split_entropys.append(np.exp(np.mean(entropys)))
233 | 
234 |             mean_entropys = np.mean(split_entropys)
235 |             std_entropys = np.std(split_entropys)
236 | 
237 |             mean_score = np.mean(split_scores)
238 |             std_score = np.std(split_scores)
239 |             print(f"{goal} goal: Entropy: {mean_entropys}, IS: {mean_score}")
240 |             total_IS.append(mean_score)
241 | 
242 |             # FID
243 |             real_acts, real_preds = get_model_outputs(model, real_imgs, batch_size=len(real_imgs))
244 |             mu_real = np.mean(real_acts, axis=0)
245 |             sigma_real = np.cov(real_acts, rowvar=False)
246 |             fid_score = frechet_distance(mu_real, sigma_real, mu_gen, sigma_gen)
247 |             print(f"{goal} goal: FID: {fid_score.item()}")
248 |             total_FID.append(fid_score.item())
249 |         print(f"Mean IS: {sum(total_IS) / goal_num}, Mean FID: {sum(total_FID) / goal_num}")
250 | 
251 |     elif args.metric == "attack_acc":
252 |         device = "cuda" if torch.cuda.is_available() else "cpu"
253 |         if args.task == "object":
254 |             label_path = r"./mini_100.txt"
255 |             with open(label_path, "r") as f:
256 |                 label_infos = f.readlines()
257 |                 label_infos = [label.lower().strip() for label in label_infos]
258 |         elif args.task == "style":
259 |             # load style label
260 |             label_infos = ["oil painting", "watercolor", "sketch", "animation", "photorealistic"]
261 |             # load style classification model
262 |             style_classify_model = ClassificationModel(model_id="laion/CLIP-ViT-H-14-laion2B-s32B-b79K",
263 |                                                        label_txt=label_infos, device=device, mode=args.task)
264 |         else:
265 |             raise ValueError("Task must be object or task")
266 | 
267 |         # load the prompt label
268 |         object_path = "./mini_100.txt"
269 |         with open(object_path, 'r') as f:
270 |             object_infos = f.readlines()
271 |             object_infos = [obj.lower().strip() for obj in object_infos]
272 |         # load object classification model
273 |         object_classify_model = ClassificationModel(model_id="laion/CLIP-ViT-H-14-laion2B-s32B-b79K",
274 |                                                     label_txt=object_infos, device=device, mode='object')
275 |         with open(args.attack_goal_path, "r") as f:
276 |             attack_goals = f.readlines()
277 |             attack_goals = [goal.split("/")[-1].lower().strip() for goal in attack_goals]
278 | 
279 |         if args.task == "style":
280 |             for goal in attack_goals:
281 |                 if goal not in style_classify_model.labels:
282 |                     style_classify_model.add_label_param([goal.lower()])
283 |             object_goal = None
284 |         else:
285 |             for goal in attack_goals:
286 |                 if goal not in object_classify_model.labels:
287 |                     object_classify_model.add_label_param([goal.lower()])
288 | 
289 |         target_goal_num = len(attack_goals)
290 |         image_num_per_goal = len(gen_imgs) // target_goal_num
291 |         image_num_per_prompt = 10
292 |         prompt_num = len(gen_imgs) // image_num_per_prompt // target_goal_num
293 |         split_num_per_prmpt = 5
294 |         batch_num_per_prompt = image_num_per_prompt // split_num_per_prmpt
295 | 
296 |         total_5_acc_style, total_10_acc_style, total_acc_style = 0, 0, 0
297 |         total_5_acc_obj, total_10_acc_obj, total_acc_obj = 0, 0, 0
298 |         for k, goal in enumerate(attack_goals):
299 |             images_per_goal = gen_imgs[k * image_num_per_goal: (k + 1) * image_num_per_goal]
300 |             labels_per_goal = gen_img_ori_labels[k * image_num_per_goal: (k + 1) * image_num_per_goal]
301 |             each_5_acc_style, each_10_acc_style, each_acc_style = 0, 0, 0
302 |             each_5_acc_obj, each_10_acc_obj, each_acc_obj = 0, 0, 0
303 |             for i in range(prompt_num):
304 |                 images_per_prompt = images_per_goal[i * image_num_per_prompt: (i + 1) * image_num_per_prompt]
305 |                 labels_per_prompt = labels_per_goal[i * image_num_per_prompt: (i + 1) * image_num_per_prompt]
306 |                 if args.task == "style":
307 |                     assert len(set(labels_per_prompt)) == 1, "one prompt contains multi-class images!! Attention!"
308 |                     object_label = object_classify_model.labels.index(labels_per_prompt[0].lower())
309 |                     style_label = style_classify_model.labels.index(goal.lower())
310 |                 else:
311 |                     object_label = object_classify_model.labels.index(goal.lower())
312 | 
313 |                 cur_5_acc_style, cur_10_acc_style, avg_acc_style = 0, 0, 0
314 |                 cur_5_acc_obj, cur_10_acc_obj, avg_acc_obj = 0, 0, 0
315 |                 for j in range(batch_num_per_prompt):
316 |                     images_per_batch = images_per_prompt[j * split_num_per_prmpt: (j + 1) * split_num_per_prmpt]
317 | 
318 |                     # style acc
319 |                     if args.task == "style":
320 |                         probs_style = style_classify_model(images_per_batch)
321 |                         acc_num = metric(probs_style, style_label)
322 |                         if (j + 1) * split_num_per_prmpt <= 5 and acc_num > 0:
323 |                             cur_5_acc_style = 1
324 |                         if acc_num > 0:
325 |                             cur_10_acc_style = 1
326 |                         avg_acc_style += acc_num
327 | 
328 |                     # obj acc
329 |                     probs_obj = object_classify_model(images_per_batch)
330 |                     acc_num = metric(probs_obj, object_label)
331 |                     avg_acc_obj += acc_num
332 | 
333 |                     if (j + 1) * split_num_per_prmpt <= 5 and acc_num > 0:
334 |                         cur_5_acc_obj = 1
335 |                     if acc_num > 0:
336 |                         cur_10_acc_obj = 1
337 | 
338 |                 each_5_acc_obj += cur_5_acc_obj
339 |                 each_10_acc_obj += cur_10_acc_obj
340 |                 each_acc_obj += avg_acc_obj
341 |                 print(f"{i}^th 5_acc_obj is: {cur_5_acc_obj}, 10_acc_obj is {cur_10_acc_obj}, "
342 |                       f"avg_acc_obj is {avg_acc_obj}")
343 |                 if args.task == "style":
344 |                     each_5_acc_style += cur_5_acc_style
345 |                     each_10_acc_style += cur_10_acc_style
346 |                     each_acc_style += avg_acc_style
347 |                     print(f"{i}^th 5_acc_style is: {cur_5_acc_style}, 10_acc_style is {cur_10_acc_style}, "
348 |                           f"avg_acc_style is {avg_acc_style}")
349 |             if args.task == "style":
350 |                 print("Each 5 acc style is {:.3f}%".format(each_5_acc_style * 100 / prompt_num))
351 |                 print("Each 10 acc style is {:.3f}%".format(each_10_acc_style * 100 / prompt_num))
352 |                 print("Each acc style is {:.3f}%".format(each_acc_style * 100 / image_num_per_prompt / prompt_num))
353 |             print("Each 5 acc obj is {:.3f}%".format(each_5_acc_obj * 100 / prompt_num))
354 |             print("Each 10 acc obj is {:.3f}%".format(each_10_acc_obj * 100 / prompt_num))
355 |             print("Each acc obj is {:.3f}%".format(each_acc_obj * 100 / image_num_per_prompt / prompt_num))
356 |             if args.task == "style":
357 |                 total_5_acc_style += each_5_acc_style * 100 / prompt_num
358 |                 total_10_acc_style += each_10_acc_style * 100 / prompt_num
359 |                 total_acc_style += each_acc_style * 100 / image_num_per_prompt / prompt_num
360 |             total_5_acc_obj += each_5_acc_obj * 100 / prompt_num
361 |             total_10_acc_obj += each_10_acc_obj * 100 / prompt_num
362 |             total_acc_obj += each_acc_obj * 100 / image_num_per_prompt / prompt_num
363 |         print("Final 5 acc style is {:.3f}%".format(total_5_acc_style / target_goal_num))
364 |         print("Final 10 acc style is {:.3f}%".format(total_10_acc_style / target_goal_num))
365 |         print("Total acc style is {:.3f}%".format(total_acc_style / target_goal_num))
366 |         print("Final 5 acc obj is {:.3f}%".format(total_5_acc_obj / target_goal_num))
367 |         print("Final 10 acc obj is {:.3f}%".format(total_10_acc_obj / target_goal_num))
368 |         print("Total acc obj is {:.3f}%".format(total_acc_obj / target_goal_num))
369 |     else:
370 |         raise ValueError("metric must be {image_quality, attack_acc}")


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Revealing Vulnerabilities in Stable Diffusion via Targeted Attacks
 2 | 
 3 | <img src=examples/framework.png  width="70%" height="40%">
 4 | 
 5 | ## Dependencies
 6 | 
 7 | - PyTorch == 2.0.1
 8 | - transformers == 4.23.1
 9 | - diffusers == 0.11.1
10 | - ftfy==6.1.1
11 | - accelerate=0.22.0
12 | - python==3.8.13
13 | 
14 | ## Usage
15 | 
16 | 1. Download the [word2id.pkl and wordvec.pkl](https://drive.google.com/drive/folders/1tNa91aGf5Y9D0usBN1M-XxbJ8hR4x_Jq?usp=sharing) for the synonym model, and put download files into the Word2Vec dir.
17 | 
18 | 2. A script is provided to perform targeted attacks for Stable Diffusion
19 | 
20 | ```sh
21 | # Traning for generating the adversarial prompts
22 | python run.py --config_path ./object_config.json  # Object attacks
23 | python run.py --config_path ./style_config.json  # Style attacks
24 | # Testing for evaluating the attack success rate
25 | python test_object_multi.py --config_path ./object_config.json  # Object attack 
26 | python test_style_multi.py --config_path ./style_config.json # Style attack
27 | # Testing for evaluating FID score of generated images
28 | python IQA.py --gen_img_path [the root of generated images] --task [object or style] --attack_goal_path [the path of referenced images] --metric image_quality 
29 | ```
30 | 
31 | ## Parameters
32 | 
33 | Config can be loaded from a JSON file. 
34 | 
35 | Config has the following parameters:
36 | 
37 | - `add_suffix_num`: the number of suffixes in the word addition perturbation strategy. The default is 5.
38 | - `replace_type`: a list for specifying the word types in the word substitution strategy. The default is ['all'] that represent replace all words except the noun. Optional: ["verb", "adj", "adv", "prep"]
39 | - `synonym_num`: The forbidden number of synonyms. The default is 10.
40 | - `iter`: the total number of iterations. The default is 500.
41 | - `lr`: the learning weight for the [optimizer](https://pytorch.org/docs/stable/generated/torch.optim.AdamW.html). The default is 0.1
42 | - `weight_decay`: the weight decay for the optimizer.
43 | - `loss_weight`: The weight of MSE loss in style attacks.
44 | - `print_step`: The number of steps to print a line giving current status
45 | - `batch_size`: number of referenced images used for each iteration.
46 | - `clip_model`: the name of the CLiP model for use with . `"laion/CLIP-ViT-H-14-laion2B-s32B-b79K"` is the model used in SD 2.1.
47 | - `prompt_path`: The path of clean prompt file.
48 | - `task`: The targeted attack task. Optional: `"object"`or `"style"`
49 | - `forbidden_words`: A txt file for representing the forbidden words for each target goal.
50 | - `target_path`: The file path of referenced images.
51 | - `output_dir`: The path for saving the learned adversarial prompts.
52 | 
53 | 
54 | 
55 | ## Adversarial Attack Dataset
56 | 
57 | We public our adversarial attack dataset that is used to achieve object attacks on Stable Diffusion. The dataset is available at [[Link]](https://github.com/datar001/Attack-Pattern-on-T2I/tree/main/Adversarial_Attack_Dataset).
58 | 
59 | ## Citation
60 | 
61 | If you find the repo useful, please consider citing.
62 | 
63 | ```
64 | @article{zhang2024revealing,
65 |   title={Revealing Vulnerabilities in Stable Diffusion via Targeted Attacks},
66 |   author={Zhang, Chenyu and Wang, Lanjun and Liu, Anan},
67 |   journal={arXiv preprint arXiv:2401.08725},
68 |   year={2024}
69 | }
70 | ```
71 | 
72 | 


--------------------------------------------------------------------------------
/Word2Vec/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/Word2Vec/.gitkeep


--------------------------------------------------------------------------------
/classification.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torch.nn as nn
 3 | from transformers import CLIPModel, AutoTokenizer, AutoProcessor
 4 | import os, json
 5 | 
 6 | 
 7 | class ClassificationModel(nn.Module):
 8 |     def __init__(self, model_id, label_txt=None, device='cpu', mode='style'):
 9 |         super(ClassificationModel, self).__init__()
10 |         self.device = device
11 |         self.model = CLIPModel.from_pretrained(model_id)
12 |         self.tokenizer = AutoTokenizer.from_pretrained(model_id)
13 |         self.image_processor = AutoProcessor.from_pretrained(model_id)
14 |         self.Softmax = nn.Softmax(dim=-1)
15 |         self.mode = mode
16 |         self.init_fc_param(label_txt)
17 |         self.fix_backbobe()
18 |         self.to(self.device)
19 | 
20 |     def init_fc_param(self, label_path=None):
21 |         if label_path is None:
22 |             raise ValueError('Please input the path of ImageNet1K annotations')
23 |         if type(label_path) == str:
24 |             with open(label_path, 'r') as f:
25 |                 infos = f.readlines()
26 |         else:
27 |             infos = label_path
28 |         prompts = []
29 |         labels = []
30 |         for cla in infos:
31 |             labels.append(cla.lower().strip())
32 |             if self.mode == "style":
33 |                 prompts.append(f"a photo with the {cla.lower().strip()} style")  #
34 |             elif self.mode == 'object':
35 |                 prompts.append(f"a photo of {cla.lower().strip()}")
36 |             else:
37 |                 raise ValueError('Please supply the classification mode')
38 |         # pdb.set_trace()
39 |         with torch.no_grad():
40 |             inputs = self.tokenizer(prompts, padding=True, return_tensors="pt")
41 |             text_embeds = self.model.get_text_features(**inputs)
42 |             text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)
43 |         self.text_embeds = text_embeds.to(self.device)
44 |         self.labels = labels
45 | 
46 |     def add_label_param(self, label: list):
47 |         if self.mode == 'style':
48 |             inputs = [f"a photo with the {ll} style" for ll in label]
49 |         else:
50 |             inputs = [f"a photo of {ll}" for ll in label]
51 |         with torch.no_grad():
52 |             inputs = self.tokenizer(inputs, padding=True, return_tensors="pt")
53 |             for k, v in inputs.items():
54 |                 inputs[k] = v.to(self.device)
55 |             text_embeds = self.model.get_text_features(**inputs)
56 |             text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)
57 |             text_embeds = text_embeds.to(self.device)
58 |         self.text_embeds = torch.cat((self.text_embeds, text_embeds), dim=0)
59 |         self.labels.extend(ll.lower().strip() for ll in label)
60 | 
61 |     def get_scores(self, image_embeds):
62 |         logit_scale = self.model.logit_scale.exp()
63 |         logit_scale = logit_scale.to(self.device)
64 |         logits_per_image = torch.matmul(image_embeds, self.text_embeds.t()) * logit_scale
65 |         return logits_per_image
66 | 
67 |     def forward(self, image):
68 |         with torch.no_grad():
69 |             inputs = self.image_processor(images=image, return_tensors="pt")
70 |             # pdb.set_trace()
71 |             for key, value in inputs.items():
72 |                 if torch.is_tensor(value):
73 |                     inputs[key] = value.to(self.device)
74 |             image_embeds = self.model.get_image_features(**inputs)
75 |             # normalized features
76 |             image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True)
77 |             scores = self.get_scores(image_embeds)
78 |             probs = self.Softmax(scores)
79 |         return probs
80 | 
81 |     def fix_backbobe(self):
82 |         for p in self.model.parameters():
83 |             p.requires_grad = False
84 | 
85 |     def get_params(self):
86 |         params = []
87 |         params.append({'params': self.model.parameters()})
88 |         return params
89 | 
90 |     def to(self, device):
91 |         self.model = self.model.to(device)


--------------------------------------------------------------------------------
/examples/framework.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/examples/framework.png


--------------------------------------------------------------------------------
/examples/ptp_utils.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2022 Google LLC
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #      http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | 
 15 | import numpy as np
 16 | import torch
 17 | from PIL import Image, ImageDraw, ImageFont
 18 | import cv2
 19 | from typing import Optional, Union, Tuple, List, Callable, Dict
 20 | from IPython.display import display
 21 | from tqdm.notebook import tqdm
 22 | import random
 23 | 
 24 | 
 25 | def text_under_image(image: np.ndarray, text: str, text_color: Tuple[int, int, int] = (0, 0, 0)):
 26 |     h, w, c = image.shape
 27 |     offset = int(h * .2)
 28 |     img = np.ones((h + offset, w, c), dtype=np.uint8) * 255
 29 |     font = cv2.FONT_HERSHEY_SIMPLEX
 30 |     # font = ImageFont.truetype("/usr/share/fonts/truetype/noto/NotoMono-Regular.ttf", font_size)
 31 |     img[:h] = image
 32 |     textsize = cv2.getTextSize(text, font, 1, 2)[0]
 33 |     text_x, text_y = (w - textsize[0]) // 2, h + offset - textsize[1] // 2
 34 |     cv2.putText(img, text, (text_x, text_y ), font, 1, text_color, 2)
 35 |     return img
 36 | 
 37 | def set_random_seed(seed=0):
 38 |     torch.manual_seed(seed + 0)
 39 |     torch.cuda.manual_seed(seed + 1)
 40 |     torch.cuda.manual_seed_all(seed + 2)
 41 |     np.random.seed(seed + 3)
 42 |     torch.cuda.manual_seed_all(seed + 4)
 43 |     random.seed(seed + 5)
 44 | 
 45 | def view_images(images, num_rows=1, offset_ratio=0.02):
 46 |     if type(images) is list:
 47 |         num_empty = len(images) % num_rows
 48 |     elif images.ndim == 4:
 49 |         num_empty = images.shape[0] % num_rows
 50 |     else:
 51 |         images = [images]
 52 |         num_empty = 0
 53 | 
 54 |     empty_images = np.ones(images[0].shape, dtype=np.uint8) * 255
 55 |     images = [image.astype(np.uint8) for image in images] + [empty_images] * num_empty
 56 |     num_items = len(images)
 57 | 
 58 |     h, w, c = images[0].shape
 59 |     offset = int(h * offset_ratio)
 60 |     num_cols = num_items // num_rows
 61 |     image_ = np.ones((h * num_rows + offset * (num_rows - 1),
 62 |                       w * num_cols + offset * (num_cols - 1), 3), dtype=np.uint8) * 255
 63 |     for i in range(num_rows):
 64 |         for j in range(num_cols):
 65 |             image_[i * (h + offset): i * (h + offset) + h:, j * (w + offset): j * (w + offset) + w] = images[
 66 |                 i * num_cols + j]
 67 | 
 68 |     pil_img = Image.fromarray(image_)
 69 |     display(pil_img)
 70 | 
 71 | 
 72 | def diffusion_step(model, controller, latents, context, t, guidance_scale, low_resource=False):
 73 |     if low_resource:
 74 |         noise_pred_uncond = model.unet(latents, t, encoder_hidden_states=context[0])["sample"]
 75 |         noise_prediction_text = model.unet(latents, t, encoder_hidden_states=context[1])["sample"]
 76 |     else:
 77 |         latents_input = torch.cat([latents] * 2)
 78 |         noise_pred = model.unet(latents_input, t, encoder_hidden_states=context)["sample"]
 79 |         noise_pred_uncond, noise_prediction_text = noise_pred.chunk(2)
 80 |     noise_pred = noise_pred_uncond + guidance_scale * (noise_prediction_text - noise_pred_uncond)
 81 |     latents = model.scheduler.step(noise_pred, t, latents)["prev_sample"]
 82 |     latents = controller.step_callback(latents)
 83 |     return latents
 84 | 
 85 | 
 86 | def latent2image(vae, latents):
 87 |     latents = 1 / 0.18215 * latents
 88 |     image = vae.decode(latents)['sample']
 89 |     image = (image / 2 + 0.5).clamp(0, 1)
 90 |     image = image.cpu().permute(0, 2, 3, 1).numpy()
 91 |     image = (image * 255).astype(np.uint8)
 92 |     return image
 93 | 
 94 | 
 95 | def init_latent(latent, model, height, width, generator, batch_size):
 96 |     if latent is None:
 97 |         latent = torch.randn(
 98 |             (1, model.unet.in_channels, height // 8, width // 8),
 99 |             generator=generator,
100 |         )
101 |     latents = latent.expand(batch_size,  model.unet.in_channels, height // 8, width // 8).to(model.device)
102 |     return latent, latents
103 | 
104 | 
105 | @torch.no_grad()
106 | def text2image_ldm(
107 |     model,
108 |     prompt:  List[str],
109 |     controller,
110 |     num_inference_steps: int = 50,
111 |     guidance_scale: Optional[float] = 7.,
112 |     generator: Optional[torch.Generator] = None,
113 |     latent: Optional[torch.FloatTensor] = None,
114 | ):
115 |     register_attention_control(model, controller)
116 |     height = width = 256
117 |     batch_size = len(prompt)
118 |     
119 |     uncond_input = model.tokenizer([""] * batch_size, padding="max_length", max_length=77, return_tensors="pt")
120 |     uncond_embeddings = model.bert(uncond_input.input_ids.to(model.device))[0]
121 |     
122 |     text_input = model.tokenizer(prompt, padding="max_length", max_length=77, return_tensors="pt")
123 |     text_embeddings = model.bert(text_input.input_ids.to(model.device))[0]
124 |     latent, latents = init_latent(latent, model, height, width, generator, batch_size)
125 |     context = torch.cat([uncond_embeddings, text_embeddings])
126 |     
127 |     model.scheduler.set_timesteps(num_inference_steps)
128 |     for t in tqdm(model.scheduler.timesteps):
129 |         latents = diffusion_step(model, controller, latents, context, t, guidance_scale)
130 |     
131 |     image = latent2image(model.vqvae, latents)
132 |    
133 |     return image, latent
134 | 
135 | 
136 | @torch.no_grad()
137 | def text2image_ldm_stable(
138 |     model,
139 |     prompt: List[str],
140 |     controller,
141 |     num_inference_steps: int = 50,
142 |     num_images_per_prompt: int=1,
143 |     guidance_scale: float = 7.5,
144 |     delete_id=None,
145 |     gen_intermediate=False,
146 |     generator: Optional[torch.Generator] = None,
147 |     latent: Optional[torch.FloatTensor] = None,
148 |     low_resource: bool = False,
149 | ):
150 |     register_attention_control(model, controller)
151 |     height = width = 512
152 |     if delete_id is not None:
153 |         images = model.gen_images_by_selected_token_id(
154 |             prompt,
155 |             delete_id,
156 |             num_images_per_prompt=num_images_per_prompt,
157 |             guidance_scale=guidance_scale,
158 |             num_inference_steps=num_inference_steps,
159 |             height=height,
160 |             width=width,
161 |         ).images
162 |     elif gen_intermediate:
163 |         images = model.forward_intermediate(
164 |             prompt,
165 |             delete_id,
166 |             num_images_per_prompt=num_images_per_prompt,
167 |             guidance_scale=guidance_scale,
168 |             num_inference_steps=num_inference_steps,
169 |             height=height,
170 |             width=width,
171 |         ).images
172 |     else:
173 |         images = model(
174 |             prompt,
175 |             num_images_per_prompt=num_images_per_prompt,
176 |             guidance_scale=guidance_scale,
177 |             num_inference_steps=num_inference_steps,
178 |             height=height,
179 |             width=width,
180 |         ).images
181 |   
182 |     return images, None # latent
183 | 
184 | 
185 | def register_attention_control(model, controller):
186 |     def ca_forward(self, place_in_unet):
187 |         to_out = self.to_out
188 |         if type(to_out) is torch.nn.modules.container.ModuleList:
189 |             to_out = self.to_out[0]
190 |         else:
191 |             to_out = self.to_out
192 | 
193 |         def forward(x, encoder_hidden_states=None, attention_mask=None):
194 |             # pdb.set_trace()
195 |             batch_size, sequence_length, dim = x.shape
196 |             h = self.heads
197 |             q = self.to_q(x)
198 |             is_cross = encoder_hidden_states is not None
199 |             encoder_hidden_states = encoder_hidden_states if is_cross else x
200 |             k = self.to_k(encoder_hidden_states)
201 |             v = self.to_v(encoder_hidden_states)
202 |             q = self.reshape_heads_to_batch_dim(q)
203 |             k = self.reshape_heads_to_batch_dim(k)
204 |             v = self.reshape_heads_to_batch_dim(v)
205 | 
206 |             sim = torch.einsum("b i d, b j d -> b i j", q, k) * self.scale
207 | 
208 |             if attention_mask is not None:
209 |                 attention_mask = attention_mask.reshape(batch_size, -1)
210 |                 max_neg_value = -torch.finfo(sim.dtype).max
211 |                 attention_mask = attention_mask[:, None, :].repeat(h, 1, 1)
212 |                 sim.masked_fill_(~attention_mask, max_neg_value)
213 | 
214 |             # attention, what we cannot get enough of
215 |             attn = sim.softmax(dim=-1)
216 |             attn = controller(attn, is_cross, place_in_unet)
217 |             out = torch.einsum("b i j, b j d -> b i d", attn, v)
218 |             out = self.reshape_batch_dim_to_heads(out)
219 |             return to_out(out)
220 | 
221 |         return forward
222 | 
223 |     # def ca_forward(self, place_in_unet):
224 |     #     def attn(query, key, value, place_in_unet, is_cross=False, attention_mask=None):
225 |     #         if self.upcast_attention:
226 |     #             query = query.float()
227 |     #             key = key.float()
228 |     #
229 |     #         attention_scores = torch.baddbmm(
230 |     #             torch.empty(query.shape[0], query.shape[1], key.shape[1], dtype=query.dtype, device=query.device),
231 |     #             query,
232 |     #             key.transpose(-1, -2),
233 |     #             beta=0,
234 |     #             alpha=self.scale,
235 |     #         )
236 |     #
237 |     #         if attention_mask is not None:
238 |     #             attention_scores = attention_scores + attention_mask
239 |     #
240 |     #         if self.upcast_softmax:
241 |     #             attention_scores = attention_scores.float()
242 |     #
243 |     #         attention_probs = attention_scores.softmax(dim=-1)
244 |     #         attention_probs = controller(attention_probs, is_cross, place_in_unet)
245 |     #
246 |     #         # cast back to the original dtype
247 |     #         attention_probs = attention_probs.to(value.dtype)
248 |     #
249 |     #         # compute attention output
250 |     #         hidden_states = torch.bmm(attention_probs, value)
251 |     #
252 |     #         # reshape hidden_states
253 |     #         hidden_states = self.reshape_batch_dim_to_heads(hidden_states)
254 |     #         return hidden_states
255 |     #
256 |     #     def forward(hidden_states, encoder_hidden_states=None, attention_mask=None):
257 |     #         batch_size, sequence_length, _ = hidden_states.shape
258 |     #
259 |     #         encoder_hidden_states = encoder_hidden_states
260 |     #
261 |     #         if self.group_norm is not None:
262 |     #             hidden_states = self.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
263 |     #
264 |     #         query = self.to_q(hidden_states)
265 |     #         dim = query.shape[-1]
266 |     #         query = self.reshape_heads_to_batch_dim(query)
267 |     #
268 |     #         if self.added_kv_proj_dim is not None:
269 |     #             key = self.to_k(hidden_states)
270 |     #             value = self.to_v(hidden_states)
271 |     #             is_cross = False
272 |     #             encoder_hidden_states_key_proj = self.add_k_proj(encoder_hidden_states)
273 |     #             encoder_hidden_states_value_proj = self.add_v_proj(encoder_hidden_states)
274 |     #
275 |     #             key = self.reshape_heads_to_batch_dim(key)
276 |     #             value = self.reshape_heads_to_batch_dim(value)
277 |     #             encoder_hidden_states_key_proj = self.reshape_heads_to_batch_dim(encoder_hidden_states_key_proj)
278 |     #             encoder_hidden_states_value_proj = self.reshape_heads_to_batch_dim(encoder_hidden_states_value_proj)
279 |     #
280 |     #             key = torch.concat([encoder_hidden_states_key_proj, key], dim=1)
281 |     #             value = torch.concat([encoder_hidden_states_value_proj, value], dim=1)
282 |     #         else:
283 |     #             is_cross = True if encoder_hidden_states is not None else False
284 |     #             encoder_hidden_states = encoder_hidden_states if encoder_hidden_states is not None else hidden_states
285 |     #             key = self.to_k(encoder_hidden_states)
286 |     #             value = self.to_v(encoder_hidden_states)
287 |     #
288 |     #             key = self.reshape_heads_to_batch_dim(key)
289 |     #             value = self.reshape_heads_to_batch_dim(value)
290 |     #
291 |     #         if attention_mask is not None:
292 |     #             if attention_mask.shape[-1] != query.shape[1]:
293 |     #                 target_length = query.shape[1]
294 |     #                 attention_mask = F.pad(attention_mask, (0, target_length), value=0.0)
295 |     #                 attention_mask = attention_mask.repeat_interleave(self.heads, dim=0)
296 |     #
297 |     #         # attention, what we cannot get enough of
298 |     #         if self._use_memory_efficient_attention_xformers:
299 |     #             hidden_states = self._memory_efficient_attention_xformers(query, key, value, attention_mask)
300 |     #             # Some versions of xformers return output in fp32, cast it back to the dtype of the input
301 |     #             hidden_states = hidden_states.to(query.dtype)
302 |     #         else:
303 |     #             if self._slice_size is None or query.shape[0] // self._slice_size == 1:
304 |     #                 if is_cross:
305 |     #                     hidden_states = attn(query, key, value, place_in_unet, is_cross, attention_mask)
306 |     #                 else:
307 |     #                     hidden_states = self._attention(query, key, value, attention_mask)
308 |     #             else:
309 |     #                 hidden_states = self._sliced_attention(query, key, value, sequence_length, dim, attention_mask)
310 |     #
311 |     #         # linear proj
312 |     #         hidden_states = self.to_out[0](hidden_states)
313 |     #
314 |     #         # dropout
315 |     #         hidden_states = self.to_out[1](hidden_states)
316 |     #         return hidden_states
317 |     #     return forward
318 | 
319 | 
320 |     class DummyController:
321 | 
322 |         def __call__(self, *args):
323 |             return args[0]
324 | 
325 |         def __init__(self):
326 |             self.num_att_layers = 0
327 | 
328 |     if controller is None:
329 |         controller = DummyController()
330 | 
331 |     def register_recr(net_, count, place_in_unet):
332 |         if net_.__class__.__name__ == 'CrossAttention':
333 |             net_.forward = ca_forward(net_, place_in_unet)
334 |             return count + 1
335 |         elif hasattr(net_, 'children'):
336 |             for net__ in net_.children():
337 |                 count = register_recr(net__, count, place_in_unet)
338 |         return count
339 | 
340 |     cross_att_count = 0
341 |     sub_nets = model.unet.named_children()
342 | 
343 |     for net in sub_nets:
344 |         if "down" in net[0]:
345 |             cross_att_count += register_recr(net[1], 0, "down")
346 |         elif "up" in net[0]:
347 |             cross_att_count += register_recr(net[1], 0, "up")
348 |         elif "mid" in net[0]:
349 |             cross_att_count += register_recr(net[1], 0, "mid")
350 | 
351 |     controller.num_att_layers = cross_att_count
352 | 
353 |     
354 | def get_word_inds(text: str, word_place: int, tokenizer):
355 |     split_text = text.split(" ")
356 |     if type(word_place) is str:
357 |         word_place = [i for i, word in enumerate(split_text) if word_place == word]
358 |     elif type(word_place) is int:
359 |         word_place = [word_place]
360 |     out = []
361 |     if len(word_place) > 0:
362 |         words_encode = [tokenizer.decode([item]).strip("#") for item in tokenizer.encode(text)][1:-1]
363 |         cur_len, ptr = 0, 0
364 | 
365 |         for i in range(len(words_encode)):
366 |             cur_len += len(words_encode[i])
367 |             if ptr in word_place:
368 |                 out.append(i + 1)
369 |             if cur_len >= len(split_text[ptr]):
370 |                 ptr += 1
371 |                 cur_len = 0
372 |     return np.array(out)
373 | 
374 | 
375 | def update_alpha_time_word(alpha, bounds: Union[float, Tuple[float, float]], prompt_ind: int,
376 |                            word_inds: Optional[torch.Tensor]=None):
377 |     if type(bounds) is float:
378 |         bounds = 0, bounds
379 |     start, end = int(bounds[0] * alpha.shape[0]), int(bounds[1] * alpha.shape[0])
380 |     if word_inds is None:
381 |         word_inds = torch.arange(alpha.shape[2])
382 |     alpha[: start, prompt_ind, word_inds] = 0
383 |     alpha[start: end, prompt_ind, word_inds] = 1
384 |     alpha[end:, prompt_ind, word_inds] = 0
385 |     return alpha
386 | 
387 | 
388 | def get_time_words_attention_alpha(prompts, num_steps,
389 |                                    cross_replace_steps: Union[float, Dict[str, Tuple[float, float]]],
390 |                                    tokenizer, max_num_words=77):
391 |     if type(cross_replace_steps) is not dict:
392 |         cross_replace_steps = {"default_": cross_replace_steps}
393 |     if "default_" not in cross_replace_steps:
394 |         cross_replace_steps["default_"] = (0., 1.)
395 |     alpha_time_words = torch.zeros(num_steps + 1, len(prompts) - 1, max_num_words)
396 |     for i in range(len(prompts) - 1):
397 |         alpha_time_words = update_alpha_time_word(alpha_time_words, cross_replace_steps["default_"],
398 |                                                   i)
399 |     for key, item in cross_replace_steps.items():
400 |         if key != "default_":
401 |              inds = [get_word_inds(prompts[i], key, tokenizer) for i in range(1, len(prompts))]
402 |              for i, ind in enumerate(inds):
403 |                  if len(ind) > 0:
404 |                     alpha_time_words = update_alpha_time_word(alpha_time_words, item, i, ind)
405 |     alpha_time_words = alpha_time_words.reshape(num_steps + 1, len(prompts) - 1, 1, 1, max_num_words)
406 |     return alpha_time_words
407 | 


--------------------------------------------------------------------------------
/examples/seq_aligner.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2022 Google LLC
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #      http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | import torch
 15 | import numpy as np
 16 | 
 17 | 
 18 | class ScoreParams:
 19 | 
 20 |     def __init__(self, gap, match, mismatch):
 21 |         self.gap = gap
 22 |         self.match = match
 23 |         self.mismatch = mismatch
 24 | 
 25 |     def mis_match_char(self, x, y):
 26 |         if x != y:
 27 |             return self.mismatch
 28 |         else:
 29 |             return self.match
 30 |         
 31 |     
 32 | def get_matrix(size_x, size_y, gap):
 33 |     matrix = []
 34 |     for i in range(len(size_x) + 1):
 35 |         sub_matrix = []
 36 |         for j in range(len(size_y) + 1):
 37 |             sub_matrix.append(0)
 38 |         matrix.append(sub_matrix)
 39 |     for j in range(1, len(size_y) + 1):
 40 |         matrix[0][j] = j*gap
 41 |     for i in range(1, len(size_x) + 1):
 42 |         matrix[i][0] = i*gap
 43 |     return matrix
 44 | 
 45 | 
 46 | def get_matrix(size_x, size_y, gap):
 47 |     matrix = np.zeros((size_x + 1, size_y + 1), dtype=np.int32)
 48 |     matrix[0, 1:] = (np.arange(size_y) + 1) * gap
 49 |     matrix[1:, 0] = (np.arange(size_x) + 1) * gap
 50 |     return matrix
 51 | 
 52 | 
 53 | def get_traceback_matrix(size_x, size_y):
 54 |     matrix = np.zeros((size_x + 1, size_y +1), dtype=np.int32)
 55 |     matrix[0, 1:] = 1
 56 |     matrix[1:, 0] = 2
 57 |     matrix[0, 0] = 4
 58 |     return matrix
 59 | 
 60 | 
 61 | def global_align(x, y, score):
 62 |     matrix = get_matrix(len(x), len(y), score.gap)
 63 |     trace_back = get_traceback_matrix(len(x), len(y))
 64 |     for i in range(1, len(x) + 1):
 65 |         for j in range(1, len(y) + 1):
 66 |             left = matrix[i, j - 1] + score.gap
 67 |             up = matrix[i - 1, j] + score.gap
 68 |             diag = matrix[i - 1, j - 1] + score.mis_match_char(x[i - 1], y[j - 1])
 69 |             matrix[i, j] = max(left, up, diag)
 70 |             if matrix[i, j] == left:
 71 |                 trace_back[i, j] = 1
 72 |             elif matrix[i, j] == up:
 73 |                 trace_back[i, j] = 2
 74 |             else:
 75 |                 trace_back[i, j] = 3
 76 |     return matrix, trace_back
 77 | 
 78 | 
 79 | def get_aligned_sequences(x, y, trace_back):
 80 |     x_seq = []
 81 |     y_seq = []
 82 |     i = len(x)
 83 |     j = len(y)
 84 |     mapper_y_to_x = []
 85 |     while i > 0 or j > 0:
 86 |         if trace_back[i, j] == 3:
 87 |             x_seq.append(x[i-1])
 88 |             y_seq.append(y[j-1])
 89 |             i = i-1
 90 |             j = j-1
 91 |             mapper_y_to_x.append((j, i))
 92 |         elif trace_back[i][j] == 1:
 93 |             x_seq.append('-')
 94 |             y_seq.append(y[j-1])
 95 |             j = j-1
 96 |             mapper_y_to_x.append((j, -1))
 97 |         elif trace_back[i][j] == 2:
 98 |             x_seq.append(x[i-1])
 99 |             y_seq.append('-')
100 |             i = i-1
101 |         elif trace_back[i][j] == 4:
102 |             break
103 |     mapper_y_to_x.reverse()
104 |     return x_seq, y_seq, torch.tensor(mapper_y_to_x, dtype=torch.int64)
105 | 
106 | 
107 | def get_mapper(x: str, y: str, tokenizer, max_len=77):
108 |     x_seq = tokenizer.encode(x)
109 |     y_seq = tokenizer.encode(y)
110 |     score = ScoreParams(0, 1, -1)
111 |     matrix, trace_back = global_align(x_seq, y_seq, score)
112 |     mapper_base = get_aligned_sequences(x_seq, y_seq, trace_back)[-1]
113 |     alphas = torch.ones(max_len)
114 |     alphas[: mapper_base.shape[0]] = mapper_base[:, 1].ne(-1).float()
115 |     mapper = torch.zeros(max_len, dtype=torch.int64)
116 |     mapper[:mapper_base.shape[0]] = mapper_base[:, 1]
117 |     mapper[mapper_base.shape[0]:] = len(y_seq) + torch.arange(max_len - len(y_seq))
118 |     return mapper, alphas
119 | 
120 | 
121 | def get_refinement_mapper(prompts, tokenizer, max_len=77):
122 |     x_seq = prompts[0]
123 |     mappers, alphas = [], []
124 |     for i in range(1, len(prompts)):
125 |         mapper, alpha = get_mapper(x_seq, prompts[i], tokenizer, max_len)
126 |         mappers.append(mapper)
127 |         alphas.append(alpha)
128 |     return torch.stack(mappers), torch.stack(alphas)
129 | 
130 | 
131 | def get_word_inds(text: str, word_place: int, tokenizer):
132 |     split_text = text.split(" ")
133 |     if type(word_place) is str:
134 |         word_place = [i for i, word in enumerate(split_text) if word_place == word]
135 |     elif type(word_place) is int:
136 |         word_place = [word_place]
137 |     out = []
138 |     if len(word_place) > 0:
139 |         words_encode = [tokenizer.decode([item]).strip("#") for item in tokenizer.encode(text)][1:-1]
140 |         cur_len, ptr = 0, 0
141 | 
142 |         for i in range(len(words_encode)):
143 |             cur_len += len(words_encode[i])
144 |             if ptr in word_place:
145 |                 out.append(i + 1)
146 |             if cur_len >= len(split_text[ptr]):
147 |                 ptr += 1
148 |                 cur_len = 0
149 |     return np.array(out)
150 | 
151 | 
152 | def get_replacement_mapper_(x: str, y: str, tokenizer, max_len=77):
153 |     words_x = x.split(' ')
154 |     words_y = y.split(' ')
155 |     if len(words_x) != len(words_y):
156 |         raise ValueError(f"attention replacement edit can only be applied on prompts with the same length"
157 |                          f" but prompt A has {len(words_x)} words and prompt B has {len(words_y)} words.")
158 |     inds_replace = [i for i in range(len(words_y)) if words_y[i] != words_x[i]]
159 |     inds_source = [get_word_inds(x, i, tokenizer) for i in inds_replace]
160 |     inds_target = [get_word_inds(y, i, tokenizer) for i in inds_replace]
161 |     mapper = np.zeros((max_len, max_len))
162 |     i = j = 0
163 |     cur_inds = 0
164 |     while i < max_len and j < max_len:
165 |         if cur_inds < len(inds_source) and inds_source[cur_inds][0] == i:
166 |             inds_source_, inds_target_ = inds_source[cur_inds], inds_target[cur_inds]
167 |             if len(inds_source_) == len(inds_target_):
168 |                 mapper[inds_source_, inds_target_] = 1
169 |             else:
170 |                 ratio = 1 / len(inds_target_)
171 |                 for i_t in inds_target_:
172 |                     mapper[inds_source_, i_t] = ratio
173 |             cur_inds += 1
174 |             i += len(inds_source_)
175 |             j += len(inds_target_)
176 |         elif cur_inds < len(inds_source):
177 |             mapper[i, j] = 1
178 |             i += 1
179 |             j += 1
180 |         else:
181 |             mapper[j, j] = 1
182 |             i += 1
183 |             j += 1
184 | 
185 |     return torch.from_numpy(mapper).float()
186 | 
187 | 
188 | 
189 | def get_replacement_mapper(prompts, tokenizer, max_len=77):
190 |     x_seq = prompts[0]
191 |     mappers = []
192 |     for i in range(1, len(prompts)):
193 |         mapper = get_replacement_mapper_(x_seq, prompts[i], tokenizer, max_len)
194 |         mappers.append(mapper)
195 |     return torch.stack(mappers)
196 | 
197 | 


--------------------------------------------------------------------------------
/get_object_attention_mask.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torch.nn as nn
 3 | from transformers import CLIPModel, AutoTokenizer, AutoProcessor
 4 | import numpy as np
 5 | import pdb
 6 | 
 7 | class OSModel(nn.Module):
 8 |     def __init__(self, model, clip_id_or_path, object_path, device):
 9 |         super(OSModel, self).__init__()
10 |         with open(object_path, "r") as f:
11 |             objects = f.readlines()
12 |         self.objects = [obj.strip() for obj in objects]
13 |         self.object_num = len(self.objects)
14 | 
15 |         self.model = model
16 |         self.tokenizer = AutoTokenizer.from_pretrained(clip_id_or_path)
17 |         self.model.to(device)
18 | 
19 |         self.device = device
20 | 
21 |     def replace_object(self, ori_prompt, object_id_or_name, ref_num=10):
22 |         if type(object_id_or_name) == int:
23 |             ori_object = self.objects[object_id_or_name]
24 |         else:
25 |             ori_object = object_id_or_name
26 | 
27 |         assert ref_num < self.object_num
28 |         selected_index = np.random.choice(np.arange(self.object_num), ref_num, replace=False)
29 |         ref_prompts = []
30 |         for id in selected_index:
31 |             ref_prompts.append(ori_prompt.replace(ori_object, self.objects[id]))
32 |         return ref_prompts
33 | 
34 |     def get_prompt_feature(self, prompts):
35 |         with torch.no_grad():
36 |             inputs = self.tokenizer(prompts, padding=True, return_tensors="pt")
37 |             text_embeds = self.model.get_text_features(inputs.input_ids.to(self.device))
38 |             text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)
39 |             # text_embeds = text_embeds.to(self.device)
40 |         return text_embeds
41 | 
42 |     def forward(self, ori_prompt, object_id_or_name, ref_num=10, thres=None):
43 |         ref_prompts = self.replace_object(ori_prompt, object_id_or_name, ref_num)
44 |         ref_features = self.get_prompt_feature(ref_prompts)
45 |         ori_feature = self.get_prompt_feature(ori_prompt)
46 |         
47 |         diff_features = ref_features - ori_feature.repeat(ref_num, 1)
48 |         diff_sign = torch.sign(diff_features).sum(0)
49 |         if thres is None:
50 |             thres = ref_num -1
51 |         mask = torch.zeros_like(diff_sign).to(self.device)
52 |         mask[abs(diff_sign) <= thres] = 1
53 |         object_ratio = 1 - mask.sum() / diff_sign.size(-1)
54 |         return mask.unsqueeze(0), object_ratio
55 |     
56 | if __name__ == "__main__":
57 |     clip_id = "laion/CLIP-ViT-H-14-laion2B-s32B-b79K"
58 |     # load CLIP model
59 |     device = "cuda" if torch.cuda.is_available() else "cpu"
60 |     model = CLIPModel.from_pretrained(clip_id).to(device)
61 | 
62 |     object_style_model = OSModel(model, clip_id, "./mini_100.txt", device)
63 |     object_mask, object_ratio = object_style_model.forward("a photo of orange", "orange")
64 |     print("ratio of mask is: {}".format(object_ratio))


--------------------------------------------------------------------------------
/mini_100.txt:
--------------------------------------------------------------------------------
  1 | dalmatian
  2 | nematode
  3 | unicycle
  4 | reel
  5 | upright
  6 | Arctic fox
  7 | photocopier
  8 | aircraft carrier
  9 | combination lock
 10 | orange
 11 | cliff
 12 | three-toed sloth
 13 | theater curtain
 14 | horizontal bar
 15 | tile roof
 16 | dishrag
 17 | snorkel
 18 | cocktail shaker
 19 | rhinoceros beetle
 20 | trifle
 21 | prayer rug
 22 | dugong
 23 | school bus
 24 | slot
 25 | organ
 26 | oboe
 27 | bookshop
 28 | hourglass
 29 | boxer
 30 | ear
 31 | lipstick
 32 | file
 33 | electric guitar
 34 | harvestman
 35 | coral reef
 36 | Ibizan hound
 37 | African hunting dog
 38 | spider web
 39 | missile
 40 | holocanthus tricolor
 41 | cuirass
 42 | scoreboard
 43 | hotdog
 44 | beer bottle
 45 | Walker hound
 46 | gong
 47 | triceratops
 48 | house finch
 49 | clog
 50 | mixing bowl
 51 | toucan
 52 | carton
 53 | bolete
 54 | Tibetan mastiff
 55 | Gordon setter
 56 | garbage truck
 57 | yawl
 58 | robin
 59 | vase
 60 | barrel
 61 | street sign
 62 | goose
 63 | solar dish
 64 | malamute
 65 | consomme
 66 | Saluki
 67 | iPod
 68 | parallel bars
 69 | miniature poodle
 70 | poncho
 71 | ant
 72 | meerkat
 73 | ladybug
 74 | French bulldog
 75 | miniskirt
 76 | king crab
 77 | dome
 78 | golden retriever
 79 | ashcan
 80 | green mamba
 81 | hair slide
 82 | komondor
 83 | cannon
 84 | tank
 85 | fire screen
 86 | carousel
 87 | crate
 88 | frying pan
 89 | stage
 90 | holster
 91 | tobacco shop
 92 | black-footed ferret
 93 | white wolf
 94 | worm fence
 95 | jellyfish
 96 | wok
 97 | Newfoundland
 98 | pencil box
 99 | lion
100 | catamaran
101 | 


--------------------------------------------------------------------------------
/modified_clip.py:
--------------------------------------------------------------------------------
 1 | from transformers import CLIPModel, CLIPConfig
 2 | from transformers.modeling_outputs import BaseModelOutputWithPooling
 3 | import torch.nn
 4 | import torch
 5 | import pdb
 6 | 
 7 | class Modified_ClipModel(CLIPModel):
 8 |     def __init__(self, config: CLIPConfig):
 9 |         super(Modified_ClipModel, self).__init__(config)
10 | 
11 |     def encode_text_feature(self, hidden_states, input_ids, attention_mask=None):
12 |         output_attentions = self.text_model.config.output_attentions
13 |         output_hidden_states = (
14 |             self.text_model.config.output_hidden_states
15 |         )
16 |         return_dict = self.text_model.config.use_return_dict
17 | 
18 |         # hidden_states = self.text_encoder.text_model.embeddings(inputs_embeds=prompt_embeddings)
19 |         hidden_states = hidden_states[:, :input_ids.argmax(-1)+1]
20 |         input_ids = input_ids[:, :input_ids.argmax(-1)+1]
21 |         position_ids = self.text_model.embeddings.position_ids[:, :input_ids.argmax(-1)+1]
22 |         hidden_states = hidden_states + self.text_model.embeddings.position_embedding(position_ids)
23 | 
24 |         bsz, seq_len = input_ids.size()
25 |         # CLIP's text model uses causal mask, prepare it here.
26 |         # https://github.com/openai/CLIP/blob/cfcffb90e69f37bf2ff1e988237a0fbe41f33c04/clip/model.py#L324
27 |         causal_attention_mask = self.text_model._build_causal_attention_mask(
28 |                     bsz, seq_len, hidden_states.dtype).to(hidden_states.device)
29 | 
30 |         encoder_outputs = self.text_model.encoder(
31 |             inputs_embeds=hidden_states,
32 |             attention_mask=attention_mask,
33 |             causal_attention_mask=causal_attention_mask,
34 |             output_attentions=output_attentions,
35 |             output_hidden_states=output_hidden_states,
36 |             return_dict=return_dict,
37 |         )
38 | 
39 |         last_hidden_state = encoder_outputs[0]
40 |         last_hidden_state = self.text_model.final_layer_norm(last_hidden_state)
41 | 
42 |         # text_embeds.shape = [batch_size, sequence_length, transformer.width]
43 |         # take features from the eot embedding (eot_token is the highest number in each sequence)
44 |         # casting to torch.int for onnx compatibility: argmax doesn't support int64 inputs with opset 14
45 |         pooled_output = last_hidden_state[
46 |             torch.arange(last_hidden_state.shape[0], device=input_ids.device),
47 |             input_ids.to(torch.int).argmax(dim=-1)
48 |         ]
49 | 
50 |         if not return_dict:
51 |             return (last_hidden_state, pooled_output) + encoder_outputs[1:]
52 | 
53 |         return BaseModelOutputWithPooling(
54 |             last_hidden_state=last_hidden_state,
55 |             pooler_output=pooled_output,
56 |             hidden_states=encoder_outputs.hidden_states,
57 |             attentions=encoder_outputs.attentions,
58 |         )
59 | 
60 |     def get_text_feature_by_embedding(self, hidden_states, input_ids):
61 |         text_outputs = self.encode_text_feature(hidden_states, input_ids)
62 |         pooled_output = text_outputs[1]
63 |         text_features = self.text_projection(pooled_output)
64 | 
65 |         return text_features
66 | 
67 |     def forward_text_embedding(self, embeddings, ids, image_features,
68 |                                object_mask=None,
69 |                                ori_feature=None):
70 |         text_features = self.get_text_feature_by_embedding(embeddings, ids)
71 |         mse = torch.nn.MSELoss(reduction="sum").to(embeddings.device)
72 | 
73 |         # normalized features
74 |         image_features = image_features / image_features.norm(dim=1, keepdim=True)
75 |         text_features = text_features / text_features.norm(dim=1, keepdim=True)
76 | 
77 |         if object_mask is not None:  # keep the ori object feature
78 |             assert ori_feature is not None, "style task must input original prompt feature when computing the object loss"
79 |             mse_l = mse(text_features * (1 - object_mask), ori_feature * (1 - object_mask))
80 |             # text_features.register_hook(lambda grad: grad * mask.float())
81 |         else:
82 |             mse_l = mse(image_features.mean(dim=0, keepdim=True), text_features)
83 | 
84 |         logits_per_image = image_features @ text_features.t()
85 |         logits_per_text = logits_per_image.t()
86 | 
87 |         return logits_per_image, logits_per_text, mse_l
88 | 
89 | 
90 | 


--------------------------------------------------------------------------------
/modified_stable_diffusion_pipeline.py:
--------------------------------------------------------------------------------
  1 | from typing import Callable, List, Optional, Union
  2 | 
  3 | import torch
  4 | from diffusers import StableDiffusionPipeline
  5 | from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
  6 | from diffusers.utils import logging
  7 | from transformers.modeling_outputs import BaseModelOutputWithPooling
  8 | 
  9 | logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 10 | 
 11 | class ModifiedStableDiffusionPipeline(StableDiffusionPipeline):
 12 |     def __init__(self,
 13 |         vae,
 14 |         text_encoder,
 15 |         tokenizer,
 16 |         unet,
 17 |         scheduler,
 18 |         safety_checker,
 19 |         feature_extractor,
 20 |         requires_safety_checker: bool = True,
 21 |     ):
 22 |         super(ModifiedStableDiffusionPipeline, self).__init__(vae,
 23 |                 text_encoder,
 24 |                 tokenizer,
 25 |                 unet,
 26 |                 scheduler,
 27 |                 safety_checker,
 28 |                 feature_extractor,
 29 |                 requires_safety_checker)
 30 | 
 31 |     def _encode_embeddings(self, prompt, prompt_embeddings, attention_mask=None):
 32 |         output_attentions = self.text_encoder.text_model.config.output_attentions
 33 |         output_hidden_states = (
 34 |             self.text_encoder.text_model.config.output_hidden_states
 35 |         )
 36 |         return_dict = self.text_encoder.text_model.config.use_return_dict
 37 | 
 38 |         hidden_states = self.text_encoder.text_model.embeddings(inputs_embeds=prompt_embeddings)
 39 | 
 40 |         bsz, seq_len = prompt.shape[0], prompt.shape[1]
 41 |         # CLIP's text model uses causal mask, prepare it here.
 42 |         # https://github.com/openai/CLIP/blob/cfcffb90e69f37bf2ff1e988237a0fbe41f33c04/clip/model.py#L324
 43 |         causal_attention_mask = self.text_encoder.text_model._build_causal_attention_mask(bsz, seq_len, hidden_states.dtype).to(
 44 |             hidden_states.device
 45 |         )
 46 |         # expand attention_mask
 47 |         if attention_mask is not None:
 48 |             # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
 49 |             attention_mask = self.text_encoder.text_model._expand_mask(attention_mask, hidden_states.dtype)
 50 | 
 51 |         encoder_outputs = self.text_encoder.text_model.encoder(
 52 |             inputs_embeds=hidden_states,
 53 |             attention_mask=attention_mask,
 54 |             causal_attention_mask=causal_attention_mask,
 55 |             output_attentions=output_attentions,
 56 |             output_hidden_states=output_hidden_states,
 57 |             return_dict=return_dict,
 58 |         )
 59 | 
 60 |         last_hidden_state = encoder_outputs[0]
 61 |         last_hidden_state = self.text_encoder.text_model.final_layer_norm(last_hidden_state)
 62 | 
 63 |         # text_embeds.shape = [batch_size, sequence_length, transformer.width]
 64 |         # take features from the eot embedding (eot_token is the highest number in each sequence)
 65 |         # casting to torch.int for onnx compatibility: argmax doesn't support int64 inputs with opset 14
 66 |         pooled_output = last_hidden_state[
 67 |             torch.arange(last_hidden_state.shape[0], device=prompt.device), prompt.to(torch.int).argmax(dim=-1)
 68 |         ]
 69 | 
 70 |         if not return_dict:
 71 |             return (last_hidden_state, pooled_output) + encoder_outputs[1:]
 72 | 
 73 |         return BaseModelOutputWithPooling(
 74 |             last_hidden_state=last_hidden_state,
 75 |             pooler_output=pooled_output,
 76 |             hidden_states=encoder_outputs.hidden_states,
 77 |             attentions=encoder_outputs.attentions,
 78 |         )
 79 | 
 80 |     def _get_text_embedding_with_embeddings(self, prompt_ids, prompt_embeddings, attention_mask=None):
 81 |         text_embeddings = self._encode_embeddings(
 82 |             prompt_ids,
 83 |             prompt_embeddings,
 84 |             attention_mask=attention_mask,
 85 |         )
 86 |         
 87 |         return text_embeddings[0]
 88 | 
 89 |     def _new_encode_prompt(self, prompt, device, num_images_per_prompt, do_classifier_free_guidance,
 90 |                            negative_prompt, prompt_ids=None, prompt_embeddings=None, delete_id=None):
 91 |         r"""
 92 |         Encodes the prompt into text encoder hidden states.
 93 | 
 94 |         Args:
 95 |             prompt (`str` or `list(int)`):
 96 |                 prompt to be encoded
 97 |             device: (`torch.device`):
 98 |                 torch device
 99 |             num_images_per_prompt (`int`):
100 |                 number of images that should be generated per prompt
101 |             do_classifier_free_guidance (`bool`):
102 |                 whether to use classifier free guidance or not
103 |             negative_prompt (`str` or `List[str]`):
104 |                 The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
105 |                 if `guidance_scale` is less than `1`).
106 |         """
107 |         batch_size = len(prompt) if isinstance(prompt, list) else 1
108 | 
109 |         if prompt_embeddings is not None:
110 |             if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
111 |                 attention_mask = text_inputs.attention_mask.to(device)
112 |             else:
113 |                 attention_mask = None
114 | 
115 |             text_embeddings = self._encode_embeddings(
116 |                 prompt_ids,
117 |                 prompt_embeddings,
118 |                 attention_mask=attention_mask,
119 |             )
120 |             text_input_ids = prompt_ids
121 |         else:
122 |             text_inputs = self.tokenizer(
123 |                 prompt,
124 |                 padding="max_length",
125 |                 max_length=self.tokenizer.model_max_length,
126 |                 truncation=True,
127 |                 return_tensors="pt",
128 |             )
129 |             text_input_ids = text_inputs.input_ids
130 |             untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
131 | 
132 |             if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
133 |                 removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1])
134 |                 logger.warning(
135 |                     "The following part of your input was truncated because CLIP can only handle sequences up to"
136 |                     f" {self.tokenizer.model_max_length} tokens: {removed_text}"
137 |                 )
138 | 
139 |             if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
140 |                 attention_mask = text_inputs.attention_mask.to(device)
141 |             else:
142 |                 attention_mask = None
143 | 
144 |             text_embeddings = self.text_encoder(
145 |                 text_input_ids.to(device),
146 |                 attention_mask=attention_mask,
147 |             )
148 |         text_embeddings = text_embeddings[0]
149 | 
150 |         # duplicate text embeddings for each generation per prompt, using mps friendly method
151 |         bs_embed, seq_len, _ = text_embeddings.shape
152 |         text_embeddings = text_embeddings.repeat(1, num_images_per_prompt, 1)
153 |         text_embeddings = text_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)
154 | 
155 |         # get unconditional embeddings for classifier free guidance
156 |         if do_classifier_free_guidance:
157 |             uncond_tokens: List[str]
158 |             if negative_prompt is None:
159 |                 uncond_tokens = [""] * batch_size
160 |             elif type(prompt) is not type(negative_prompt):
161 |                 raise TypeError(
162 |                     f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
163 |                     f" {type(prompt)}."
164 |                 )
165 |             elif isinstance(negative_prompt, str):
166 |                 uncond_tokens = [negative_prompt]
167 |             elif batch_size != len(negative_prompt):
168 |                 raise ValueError(
169 |                     f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
170 |                     f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
171 |                     " the batch size of `prompt`."
172 |                 )
173 |             else:
174 |                 uncond_tokens = negative_prompt
175 | 
176 |             max_length = text_input_ids.shape[-1]
177 |             uncond_input = self.tokenizer(
178 |                 uncond_tokens,
179 |                 padding="max_length",
180 |                 max_length=max_length,
181 |                 truncation=True,
182 |                 return_tensors="pt",
183 |             )
184 | 
185 |             if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
186 |                 attention_mask = uncond_input.attention_mask.to(device)
187 |             else:
188 |                 attention_mask = None
189 | 
190 |             uncond_embeddings = self.text_encoder(
191 |                 uncond_input.input_ids.to(device),
192 |                 attention_mask=attention_mask,
193 |             )
194 |             uncond_embeddings = uncond_embeddings[0]
195 | 
196 |             # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
197 |             seq_len = uncond_embeddings.shape[1]
198 |             uncond_embeddings = uncond_embeddings.repeat(1, num_images_per_prompt, 1)
199 |             uncond_embeddings = uncond_embeddings.view(batch_size * num_images_per_prompt, seq_len, -1)
200 | 
201 |             # For classifier free guidance, we need to do two forward passes.
202 |             # Here we concatenate the unconditional and text embeddings into a single batch
203 |             # to avoid doing two forward passes
204 |             text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
205 | 
206 |         if delete_id is not None:
207 |             mask = torch.ones(77).bool().to(text_embeddings.device)
208 |             id_mask = torch.LongTensor(delete_id).to(text_embeddings.device)
209 |             mask[id_mask] = False
210 |             text_embeddings = text_embeddings[:,mask,:]
211 | 
212 |         return text_embeddings
213 | 
214 |     @torch.no_grad()
215 |     def __call__(
216 |         self,
217 |         prompt: Union[str, List[str]],
218 |         height: Optional[int] = None,
219 |         width: Optional[int] = None,
220 |         num_inference_steps: int = 50,
221 |         guidance_scale: float = 7.5,
222 |         negative_prompt: Optional[Union[str, List[str]]] = None,
223 |         num_images_per_prompt: Optional[int] = 1,
224 |         eta: float = 0.0,
225 |         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
226 |         latents: Optional[torch.FloatTensor] = None,
227 |         output_type: Optional[str] = "pil",
228 |         return_dict: bool = True,
229 |         callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
230 |         callback_steps: Optional[int] = 1,
231 |         prompt_ids = None,
232 |         prompt_embeddings = None,
233 |         return_latents = False,
234 |     ):
235 |         r"""
236 |         Function invoked when calling the pipeline for generation.
237 |         Args:
238 |             prompt (`str` or `List[str]`):
239 |                 The prompt or prompts to guide the image generation.
240 |             height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
241 |                 The height in pixels of the generated image.
242 |             width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
243 |                 The width in pixels of the generated image.
244 |             num_inference_steps (`int`, *optional*, defaults to 50):
245 |                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
246 |                 expense of slower inference.
247 |             guidance_scale (`float`, *optional*, defaults to 7.5):
248 |                 Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
249 |                 `guidance_scale` is defined as `w` of equation 2. of [Imagen
250 |                 Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
251 |                 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
252 |                 usually at the expense of lower image quality.
253 |             negative_prompt (`str` or `List[str]`, *optional*):
254 |                 The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
255 |                 if `guidance_scale` is less than `1`).
256 |             num_images_per_prompt (`int`, *optional*, defaults to 1):
257 |                 The number of images to generate per prompt.
258 |             eta (`float`, *optional*, defaults to 0.0):
259 |                 Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
260 |                 [`schedulers.DDIMScheduler`], will be ignored for others.
261 |             generator (`torch.Generator`, *optional*):
262 |                 One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
263 |                 to make generation deterministic.
264 |             latents (`torch.FloatTensor`, *optional*):
265 |                 Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
266 |                 generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
267 |                 tensor will ge generated by sampling using the supplied random `generator`.
268 |             output_type (`str`, *optional*, defaults to `"pil"`):
269 |                 The output format of the generate image. Choose between
270 |                 [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
271 |             return_dict (`bool`, *optional*, defaults to `True`):
272 |                 Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
273 |                 plain tuple.
274 |             callback (`Callable`, *optional*):
275 |                 A function that will be called every `callback_steps` steps during inference. The function will be
276 |                 called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
277 |             callback_steps (`int`, *optional*, defaults to 1):
278 |                 The frequency at which the `callback` function will be called. If not specified, the callback will be
279 |                 called at every step.
280 |         Examples:
281 |         Returns:
282 |             [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
283 |             [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
284 |             When returning a tuple, the first element is a list with the generated images, and the second element is a
285 |             list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
286 |             (nsfw) content, according to the `safety_checker`.
287 |         """
288 |         # 0. Default height and width to unet
289 |         height = height or self.unet.config.sample_size * self.vae_scale_factor
290 |         width = width or self.unet.config.sample_size * self.vae_scale_factor
291 | 
292 |         # 1. Check inputs. Raise error if not correct
293 |         self.check_inputs(prompt, height, width, callback_steps)
294 | 
295 |         # 2. Define call parameters
296 |         batch_size = 1 if isinstance(prompt, str) else len(prompt)
297 |         device = self._execution_device
298 |         # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
299 |         # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
300 |         # corresponds to doing no classifier free guidance.
301 |         do_classifier_free_guidance = guidance_scale > 1.0
302 | 
303 |         # 3. Encode input prompt
304 |         text_embeddings = self._new_encode_prompt(
305 |             prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt, prompt_ids, prompt_embeddings
306 |         )
307 | 
308 |         # 4. Prepare timesteps
309 |         self.scheduler.set_timesteps(num_inference_steps, device=device)
310 |         timesteps = self.scheduler.timesteps
311 | 
312 |         # 5. Prepare latent variables
313 |         num_channels_latents = self.unet.in_channels
314 |         latents = self.prepare_latents(
315 |             batch_size * num_images_per_prompt,
316 |             num_channels_latents,
317 |             height,
318 |             width,
319 |             text_embeddings.dtype,
320 |             device,
321 |             generator,
322 |             latents,
323 |         )
324 | 
325 |         # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
326 |         extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
327 | 
328 |         # save latents
329 |         intermediate_latents = []
330 |         intermediate_latents.append(latents)
331 |         # 7. Denoising loop
332 |         num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
333 |         with self.progress_bar(total=num_inference_steps) as progress_bar:
334 |             for i, t in enumerate(timesteps):
335 |                 # expand the latents if we are doing classifier free guidance
336 |                 latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
337 |                 latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
338 | 
339 |                 # predict the noise residual
340 |                 noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample
341 | 
342 |                 # perform guidance
343 |                 if do_classifier_free_guidance:
344 |                     noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
345 |                     noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
346 | 
347 |                 # compute the previous noisy sample x_t -> x_t-1
348 |                 latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
349 |                 intermediate_latents.append(latents)
350 |                 # call the callback, if provided
351 |                 if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
352 |                     progress_bar.update()
353 |                     if callback is not None and i % callback_steps == 0:
354 |                         callback(i, t, latents)
355 | 
356 |         if return_latents:
357 |             return latents
358 | 
359 |         # 8. Post-processing
360 |         image = self.decode_latents(latents)
361 | 
362 |         # 9. Run safety checker
363 |         # image, has_nsfw_concept = self.run_safety_checker(image, device, text_embeddings.dtype)
364 |         has_nsfw_concept = None
365 |         # 10. Convert to PIL
366 |         if output_type == "pil":
367 |             image = self.numpy_to_pil(image)
368 | 
369 |         if not return_dict:
370 |             return (image, None)
371 | 
372 |         return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=None)
373 | 
374 |     @torch.no_grad()
375 |     def forward_intermediate(
376 |             self,
377 |             prompt: Union[str, List[str]],
378 |             delete_id: Optional[List] = None,
379 |             height: Optional[int] = None,
380 |             width: Optional[int] = None,
381 |             num_inference_steps: int = 50,
382 |             guidance_scale: float = 7.5,
383 |             negative_prompt: Optional[Union[str, List[str]]] = None,
384 |             num_images_per_prompt: Optional[int] = 1,
385 |             eta: float = 0.0,
386 |             generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
387 |             latents: Optional[torch.FloatTensor] = None,
388 |             output_type: Optional[str] = "pil",
389 |             return_dict: bool = True,
390 |             callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
391 |             callback_steps: Optional[int] = 1,
392 |             prompt_ids=None,
393 |             prompt_embeddings=None,
394 |             return_latents=False,
395 |     ):
396 |         r"""
397 |         Function invoked when calling the pipeline for generation.
398 |         Args:
399 |             prompt (`str` or `List[str]`):
400 |                 The prompt or prompts to guide the image generation.
401 |             height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
402 |                 The height in pixels of the generated image.
403 |             width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
404 |                 The width in pixels of the generated image.
405 |             num_inference_steps (`int`, *optional*, defaults to 50):
406 |                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
407 |                 expense of slower inference.
408 |             guidance_scale (`float`, *optional*, defaults to 7.5):
409 |                 Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
410 |                 `guidance_scale` is defined as `w` of equation 2. of [Imagen
411 |                 Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
412 |                 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
413 |                 usually at the expense of lower image quality.
414 |             negative_prompt (`str` or `List[str]`, *optional*):
415 |                 The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
416 |                 if `guidance_scale` is less than `1`).
417 |             num_images_per_prompt (`int`, *optional*, defaults to 1):
418 |                 The number of images to generate per prompt.
419 |             eta (`float`, *optional*, defaults to 0.0):
420 |                 Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
421 |                 [`schedulers.DDIMScheduler`], will be ignored for others.
422 |             generator (`torch.Generator`, *optional*):
423 |                 One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
424 |                 to make generation deterministic.
425 |             latents (`torch.FloatTensor`, *optional*):
426 |                 Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
427 |                 generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
428 |                 tensor will ge generated by sampling using the supplied random `generator`.
429 |             output_type (`str`, *optional*, defaults to `"pil"`):
430 |                 The output format of the generate image. Choose between
431 |                 [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
432 |             return_dict (`bool`, *optional*, defaults to `True`):
433 |                 Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
434 |                 plain tuple.
435 |             callback (`Callable`, *optional*):
436 |                 A function that will be called every `callback_steps` steps during inference. The function will be
437 |                 called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
438 |             callback_steps (`int`, *optional*, defaults to 1):
439 |                 The frequency at which the `callback` function will be called. If not specified, the callback will be
440 |                 called at every step.
441 |         Examples:
442 |         Returns:
443 |             [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
444 |             [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
445 |             When returning a tuple, the first element is a list with the generated images, and the second element is a
446 |             list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
447 |             (nsfw) content, according to the `safety_checker`.
448 |         """
449 |         # 0. Default height and width to unet
450 |         height = height or self.unet.config.sample_size * self.vae_scale_factor
451 |         width = width or self.unet.config.sample_size * self.vae_scale_factor
452 | 
453 |         # 1. Check inputs. Raise error if not correct
454 |         self.check_inputs(prompt, height, width, callback_steps)
455 | 
456 |         # 2. Define call parameters
457 |         batch_size = 1 if isinstance(prompt, str) else len(prompt)
458 |         device = self._execution_device
459 |         # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
460 |         # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
461 |         # corresponds to doing no classifier free guidance.
462 |         do_classifier_free_guidance = guidance_scale > 1.0
463 | 
464 |         # 3. Encode input prompt
465 |         text_embeddings = self._new_encode_prompt(
466 |             prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt, prompt_ids,
467 |             prompt_embeddings, delete_id
468 |         )
469 | 
470 |         # 4. Prepare timesteps
471 |         self.scheduler.set_timesteps(num_inference_steps, device=device)
472 |         timesteps = self.scheduler.timesteps
473 | 
474 |         # 5. Prepare latent variables
475 |         num_channels_latents = self.unet.in_channels
476 |         latents = self.prepare_latents(
477 |             batch_size * num_images_per_prompt,
478 |             num_channels_latents,
479 |             height,
480 |             width,
481 |             text_embeddings.dtype,
482 |             device,
483 |             generator,
484 |             latents,
485 |         )
486 | 
487 |         # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
488 |         extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
489 | 
490 |         # save latents
491 |         intermediate_latents = []
492 |         intermediate_latents.append(latents)
493 |         # 7. Denoising loop
494 |         num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
495 |         with self.progress_bar(total=num_inference_steps) as progress_bar:
496 |             for i, t in enumerate(timesteps):
497 |                 # expand the latents if we are doing classifier free guidance
498 |                 latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
499 |                 latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
500 | 
501 |                 # predict the noise residual
502 |                 noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample
503 | 
504 |                 # perform guidance
505 |                 if do_classifier_free_guidance:
506 |                     noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
507 |                     noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
508 | 
509 |                 # compute the previous noisy sample x_t -> x_t-1
510 |                 latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
511 |                 intermediate_latents.append(latents)
512 |                 # call the callback, if provided
513 |                 if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
514 |                     progress_bar.update()
515 |                     if callback is not None and i % callback_steps == 0:
516 |                         callback(i, t, latents)
517 | 
518 |         if return_latents:
519 |             return latents
520 | 
521 |         # 8. Post-processing
522 |         images = []
523 |         for latents in intermediate_latents:
524 |             image = self.decode_latents(latents)
525 |             images.append(image)
526 | 
527 |         # 9. Run safety checker
528 |         # image, has_nsfw_concept = self.run_safety_checker(image, device, text_embeddings.dtype)
529 |         has_nsfw_concept = None
530 |         # 10. Convert to PIL
531 |         pil_images = []
532 |         if output_type == "pil":
533 |             for image in images:
534 |                 image = self.numpy_to_pil(image)
535 |                 pil_images.append(image)
536 | 
537 |         if not return_dict:
538 |             return (pil_images, None)
539 | 
540 |         return StableDiffusionPipelineOutput(images=pil_images, nsfw_content_detected=None)
541 | 
542 |     @torch.no_grad()
543 |     def gen_images_by_selected_token_id(
544 |             self,
545 |             prompt: Union[str, List[str]],
546 |             delete_id: Optional[List]=None,
547 |             exchange_latent_step: Optional[int]=None,
548 |             height: Optional[int] = None,
549 |             width: Optional[int] = None,
550 |             num_inference_steps: int = 50,
551 |             guidance_scale: float = 7.5,
552 |             negative_prompt: Optional[Union[str, List[str]]] = None,
553 |             num_images_per_prompt: Optional[int] = 1,
554 |             eta: float = 0.0,
555 |             generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
556 |             latents: Optional[torch.FloatTensor] = None,
557 |             output_type: Optional[str] = "pil",
558 |             return_dict: bool = True,
559 |             callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
560 |             callback_steps: Optional[int] = 1,
561 |             prompt_ids=None,
562 |             prompt_embeddings=None,
563 |             return_latents=False,
564 |     ):
565 |         r"""
566 |         Function invoked when calling the pipeline for generation.
567 |         Args:
568 |             prompt (`str` or `List[str]`):
569 |                 The prompt or prompts to guide the image generation.
570 |             height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
571 |                 The height in pixels of the generated image.
572 |             width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
573 |                 The width in pixels of the generated image.
574 |             num_inference_steps (`int`, *optional*, defaults to 50):
575 |                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
576 |                 expense of slower inference.
577 |             guidance_scale (`float`, *optional*, defaults to 7.5):
578 |                 Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
579 |                 `guidance_scale` is defined as `w` of equation 2. of [Imagen
580 |                 Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
581 |                 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
582 |                 usually at the expense of lower image quality.
583 |             negative_prompt (`str` or `List[str]`, *optional*):
584 |                 The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
585 |                 if `guidance_scale` is less than `1`).
586 |             num_images_per_prompt (`int`, *optional*, defaults to 1):
587 |                 The number of images to generate per prompt.
588 |             eta (`float`, *optional*, defaults to 0.0):
589 |                 Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
590 |                 [`schedulers.DDIMScheduler`], will be ignored for others.
591 |             generator (`torch.Generator`, *optional*):
592 |                 One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
593 |                 to make generation deterministic.
594 |             latents (`torch.FloatTensor`, *optional*):
595 |                 Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
596 |                 generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
597 |                 tensor will ge generated by sampling using the supplied random `generator`.
598 |             output_type (`str`, *optional*, defaults to `"pil"`):
599 |                 The output format of the generate image. Choose between
600 |                 [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
601 |             return_dict (`bool`, *optional*, defaults to `True`):
602 |                 Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
603 |                 plain tuple.
604 |             callback (`Callable`, *optional*):
605 |                 A function that will be called every `callback_steps` steps during inference. The function will be
606 |                 called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
607 |             callback_steps (`int`, *optional*, defaults to 1):
608 |                 The frequency at which the `callback` function will be called. If not specified, the callback will be
609 |                 called at every step.
610 |         Examples:
611 |         Returns:
612 |             [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
613 |             [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
614 |             When returning a tuple, the first element is a list with the generated images, and the second element is a
615 |             list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
616 |             (nsfw) content, according to the `safety_checker`.
617 |         """
618 |         # 0. Default height and width to unet
619 |         height = height or self.unet.config.sample_size * self.vae_scale_factor
620 |         width = width or self.unet.config.sample_size * self.vae_scale_factor
621 | 
622 |         # 1. Check inputs. Raise error if not correct
623 |         self.check_inputs(prompt, height, width, callback_steps)
624 | 
625 |         # 2. Define call parameters
626 |         batch_size = 1 if isinstance(prompt, str) else len(prompt)
627 |         device = self._execution_device
628 |         # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
629 |         # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
630 |         # corresponds to doing no classifier free guidance.
631 |         do_classifier_free_guidance = guidance_scale > 1.0
632 | 
633 |         # 3. Encode input prompt
634 |         text_embeddings = self._new_encode_prompt(
635 |             prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt, prompt_ids,
636 |             prompt_embeddings, delete_id
637 |         )
638 | 
639 |         # 4. Prepare timesteps
640 |         self.scheduler.set_timesteps(num_inference_steps, device=device)
641 |         timesteps = self.scheduler.timesteps
642 | 
643 |         # 5. Prepare latent variables
644 |         num_channels_latents = self.unet.in_channels
645 |         latents = self.prepare_latents(
646 |             batch_size * num_images_per_prompt,
647 |             num_channels_latents,
648 |             height,
649 |             width,
650 |             text_embeddings.dtype,
651 |             device,
652 |             generator,
653 |             latents,
654 |         )
655 | 
656 |         # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
657 |         extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
658 | 
659 |         # save latents
660 |         intermediate_latents = []
661 |         intermediate_latents.append(latents)
662 |         # 7. Denoising loop
663 |         num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
664 |         with self.progress_bar(total=num_inference_steps) as progress_bar:
665 |             for i, t in enumerate(timesteps):
666 |                 # expand the latents if we are doing classifier free guidance
667 |                 latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
668 |                 latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
669 | 
670 |                 # predict the noise residual
671 |                 noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample
672 | 
673 |                 # perform guidance
674 |                 if do_classifier_free_guidance:
675 |                     noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
676 |                     noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
677 | 
678 |                 # compute the previous noisy sample x_t -> x_t-1
679 |                 latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
680 |                 intermediate_latents.append(latents)
681 |                 # call the callback, if provided
682 |                 if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
683 |                     progress_bar.update()
684 |                     if callback is not None and i % callback_steps == 0:
685 |                         callback(i, t, latents)
686 |                 if exchange_latent_step is not None:
687 |                     import pdb
688 |                     pdb.set_trace()
689 |                     if i == exchange_latent_step:
690 |                         pass
691 | 
692 |         if return_latents:
693 |             return latents
694 | 
695 |         # 8. Post-processing
696 |         image = self.decode_latents(latents)
697 | 
698 |         # 9. Run safety checker
699 |         # image, has_nsfw_concept = self.run_safety_checker(image, device, text_embeddings.dtype)
700 |         has_nsfw_concept = None
701 |         # 10. Convert to PIL
702 |         if output_type == "pil":
703 |             image = self.numpy_to_pil(image)
704 | 
705 |         if not return_dict:
706 |             return (image, None)
707 | 
708 |         return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=None)


--------------------------------------------------------------------------------
/object_config.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "add_suffix_num": 5,
 3 |     "replace_type": [],
 4 |     "synonym_num": 10,
 5 |     "iter": 500,
 6 |     "lr": 0.1,
 7 |     "weight_decay": 0.1,
 8 |     "loss_weight": 1.0,
 9 |     "print_step": 100,
10 |     "batch_size": 1,
11 |     "clip_model": "laion/CLIP-ViT-H-14-laion2B-s32B-b79K",
12 |     "prompt_path": "simple_prompt.txt",
13 |     "task":"object",
14 |     "forbidden_words": "./referenced_images/objects/forbidden_words.txt",
15 |     "target_path":"./referenced_images/objects/target_object.txt",
16 |     "output_dir": "../runs/attack/formal_experiment/object/submit_test"
17 | }


--------------------------------------------------------------------------------
/optim_utils.py:
--------------------------------------------------------------------------------
  1 | import random
  2 | import numpy as np
  3 | from PIL import Image
  4 | import copy
  5 | import json
  6 | from typing import Any, Mapping
  7 | import torch
  8 | from synonym import is_english, get_token_english_mask
  9 | import pdb
 10 | 
 11 | def read_json(filename: str) -> Mapping[str, Any]:
 12 |     """Returns a Python dict representation of JSON object at input file."""
 13 |     with open(filename) as fp:
 14 |         return json.load(fp)
 15 | 
 16 | 
 17 | def nn_project(curr_embeds, embedding_layer, forbidden_mask=None, forbidden_set=None, english_mask=None):
 18 |     with torch.no_grad():
 19 |         bsz,seq_len,emb_dim = curr_embeds.shape
 20 | 
 21 |         curr_embeds = curr_embeds.reshape((-1,emb_dim))
 22 |         curr_embeds = curr_embeds / curr_embeds.norm(dim=1, keepdim=True) # queries
 23 | 
 24 |         embedding_matrix = embedding_layer.weight
 25 |         embedding_matrix = embedding_matrix / embedding_matrix.norm(dim=1, keepdim=True)
 26 | 
 27 |         sims = torch.mm(curr_embeds, embedding_matrix.transpose(0, 1))
 28 | 
 29 |         if forbidden_mask is not None:
 30 |             sims[:, forbidden_mask] = -1e+8
 31 |             forbidden_num = len(forbidden_set)
 32 |         else:
 33 |             forbidden_num = 0
 34 | 
 35 |         if english_mask is not None:
 36 |             sims[:, english_mask] = -1e+8
 37 | 
 38 |         cos_scores_top_k_values, cos_scores_top_k_idx = torch.topk(sims, max(1, forbidden_num+1), dim=1, largest=True, sorted=False)  #
 39 |         queries_result_list = []
 40 |         for idx in range(seq_len):
 41 |             cur_query_idx_topk = sorted([[top_k_id.item(), top_k_value.item()] for top_k_id, top_k_value in
 42 |                                          zip(cos_scores_top_k_idx[idx], cos_scores_top_k_values[idx])],
 43 |                                         key=lambda x:x[1], reverse=True)
 44 |             for token_id, sim_value in cur_query_idx_topk:
 45 |                 if token_id not in forbidden_set:
 46 |                     queries_result_list.append(token_id)
 47 |                     break
 48 |         nn_indices = torch.tensor(queries_result_list,
 49 |                                   device=curr_embeds.device).reshape((bsz, seq_len)).long()
 50 | 
 51 |         projected_embeds = embedding_layer(nn_indices)
 52 | 
 53 |     return projected_embeds, nn_indices
 54 | 
 55 | def forbidden_mask(forbidden_words, tokenizer, device):
 56 | 
 57 |     mask = torch.zeros(len(tokenizer.encoder)).bool().to(device)
 58 |     squeeze_forbidden = set()
 59 |     if forbidden_words is not None:
 60 |         forbidden_token = [tokenizer._convert_token_to_id(word) for word in forbidden_words]
 61 |         forbidden_token.extend([tokenizer._convert_token_to_id(word + "</w>") for word in forbidden_words])
 62 |         for token in forbidden_token:
 63 |             squeeze_forbidden.add(token)
 64 |             mask[token] = 1
 65 |     return mask, squeeze_forbidden
 66 | 
 67 | 
 68 | def set_random_seed(seed=0):
 69 |     torch.manual_seed(seed + 0)
 70 |     torch.cuda.manual_seed(seed + 1)
 71 |     torch.cuda.manual_seed_all(seed + 2)
 72 |     np.random.seed(seed + 3)
 73 |     torch.cuda.manual_seed_all(seed + 4)
 74 |     random.seed(seed + 5)
 75 | 
 76 | 
 77 | def decode_ids(input_ids, tokenizer):
 78 |     input_ids = input_ids.detach().cpu().numpy()
 79 |     texts = []
 80 |     token_text = []
 81 |     for ids in input_ids:
 82 |         tokens = [tokenizer._convert_id_to_token(int(id_)) for id_ in ids]
 83 |         texts.append(tokenizer.convert_tokens_to_string(tokens))
 84 |         token_text.append([token.replace("</w>", " ") for token in tokens])
 85 | 
 86 |     return texts, token_text
 87 | 
 88 | 
 89 | def get_target_feature(model, preprocess, tokenizer, device, target_images=None, target_prompts=None):
 90 |     if target_images is not None:
 91 |         with torch.no_grad():
 92 |             images = preprocess(images=target_images, return_tensors="pt").pixel_values
 93 |             image_features = model.get_image_features(pixel_values=images.to(device))
 94 |             # normalized features
 95 |             all_target_features = image_features / image_features.norm(p=2, dim=-1, keepdim=True)
 96 |     else:
 97 |         text_input = tokenizer(
 98 |             target_prompts, padding=True, return_tensors="pt")
 99 |         with torch.no_grad():
100 |             all_target_features = model.get_text_features(text_input.input_ids.to(device))
101 |             all_target_features = all_target_features / all_target_features.norm(p=2, dim=-1, keepdim=True)
102 |     return all_target_features
103 | 
104 | def get_id(prompt, suffix_num, tokenizer, device, token_embedding):
105 | 
106 |     dummy_ids = [tokenizer._convert_token_to_id(token) for token in tokenizer._tokenize(prompt)]
107 |     prompt_len = len(dummy_ids)
108 |     assert prompt_len + suffix_num < 76, "opti_num + len(prompt) must < 77"
109 |     padded_template_text = '{}'.format(" ".join(["<|startoftext|>"] * (suffix_num)))
110 |     # <|startoftext|> for transformer clip Autotokenizer
111 |     # <start_of_text> for openclip
112 | 
113 |     # dummy_ids.extend(tokenizer.encode(padded_template_text))
114 |     dummy_ids.extend([tokenizer.encoder[token] for token in tokenizer._tokenize(padded_template_text)])
115 |     dummy_ids = [i if i != 49406 else -1 for i in dummy_ids]
116 |     dummy_ids = [49406] + dummy_ids + [49407]
117 |     # dummy_ids += [0] * (77 - len(dummy_ids))
118 |     dummy_ids = torch.tensor([dummy_ids]).to(device)
119 | 
120 |     # for getting dummy embeds; -1 won't work for token_embedding
121 |     tmp_dummy_ids = copy.deepcopy(dummy_ids)
122 |     tmp_dummy_ids[tmp_dummy_ids == -1] = 0
123 |     dummy_embeds = token_embedding(tmp_dummy_ids).detach()  # get embedding of initial template, no grad
124 |     dummy_embeds.requires_grad = False
125 | 
126 |     opti_num = (dummy_ids == -1).sum()
127 |     prompt_ids = torch.randint(len(tokenizer.encoder), (1, opti_num)).to(device)
128 |     prompt_embeds = token_embedding(prompt_ids).detach()
129 |     prompt_embeds.requires_grad = True
130 | 
131 |     return prompt_embeds, dummy_embeds, dummy_ids, prompt_len + suffix_num
132 | 
133 | 
134 | def optimize_prompt_loop(model, tokenizer, all_target_features,
135 |                          args, device, ori_prompt=None, forbidden_words=None,
136 |                          suffix_num=10, object_mask=None, english_mask=None,
137 |                          ori_feature=None, writer=None):
138 |     assert ori_prompt is not None
139 |     opt_iters = args.iter
140 |     lr = args.lr
141 |     weight_decay = args.weight_decay
142 |     print_step = args.print_step
143 |     batch_size = args.batch_size
144 |     top_k = 50
145 |     prompt_topk = [[-1, 0, "", ""] for _ in range(top_k)]
146 |     save_prompts = []
147 |     top_k_sim_min = 0.
148 | 
149 | 
150 |     token_embedding = model.text_model.embeddings.token_embedding
151 | 
152 |     # forbidden
153 |     forbidden_mask_, squeeze_forbidden = forbidden_mask(forbidden_words, tokenizer, device)
154 | 
155 |     # initialize prompt
156 |     prompt_embeds, dummy_embeds, dummy_ids, max_char_num = get_id(ori_prompt, suffix_num, tokenizer, device, token_embedding)
157 |     p_bs, p_len, p_dim = prompt_embeds.shape
158 | 
159 |     # get optimizer
160 |     input_optimizer = torch.optim.AdamW([prompt_embeds], lr=lr, weight_decay=weight_decay)
161 | 
162 |     best_sim = -1000 * args.loss_weight
163 |     best_text = ""
164 | 
165 |     for step in range(opt_iters):
166 |         # randomly sample sample images and get features
167 |         if batch_size is None:
168 |             target_features = all_target_features
169 |         else:
170 |             curr_indx = torch.randperm(len(all_target_features))
171 |             target_features = all_target_features[curr_indx][0:batch_size]
172 |             
173 |         
174 |         # forward projection
175 |         projected_embeds, nn_indices = nn_project(prompt_embeds, token_embedding,
176 |                                                   forbidden_mask=forbidden_mask_, english_mask=english_mask,
177 |                                                   forbidden_set=squeeze_forbidden)
178 | 
179 |         # get cosine similarity score with all target features
180 |         with torch.no_grad():
181 |             # padded_embeds = copy.deepcopy(dummy_embeds)
182 |             padded_embeds = dummy_embeds.detach().clone()
183 |             padded_embeds[dummy_ids == -1] = projected_embeds.reshape(-1, p_dim)
184 |             logits_per_image, _, mse_loss = model.forward_text_embedding(padded_embeds,
185 |                                                                          dummy_ids,
186 |                                                                          target_features,
187 |                                                                          object_mask=object_mask,
188 |                                                                          ori_feature=ori_feature)
189 |             scores_per_prompt = logits_per_image.mean(dim=0)
190 |             universal_cosim_score = scores_per_prompt.max().item()  # max
191 |             best_indx = scores_per_prompt.argmax().item()
192 |         
193 |         # tmp_embeds = copy.deepcopy(prompt_embeds)
194 |         tmp_embeds = prompt_embeds.detach().clone()
195 |         tmp_embeds.data = projected_embeds.data
196 |         tmp_embeds.requires_grad = True
197 |         
198 |         # padding
199 |         # padded_embeds = copy.deepcopy(dummy_embeds)
200 |         padded_embeds = dummy_embeds.detach().clone()
201 |         padded_embeds[dummy_ids == -1] = tmp_embeds.reshape(-1, p_dim)
202 |         
203 |         logits_per_image, _, mse_loss = model.forward_text_embedding(padded_embeds,
204 |                                                                      dummy_ids,
205 |                                                                      target_features,
206 |                                                                      object_mask=object_mask,
207 |                                                                      ori_feature=ori_feature)
208 |         cosim_scores = logits_per_image
209 |         loss = 1 - cosim_scores.mean()
210 |         if object_mask is not None:
211 |             loss = loss + mse_loss  * args.loss_weight
212 |         if writer is not None:
213 |             writer.add_scalar('loss', loss.item(), step)
214 |         
215 |         prompt_embeds.grad, = torch.autograd.grad(loss, [tmp_embeds])
216 |         
217 |         input_optimizer.step()
218 |         input_optimizer.zero_grad()
219 | 
220 |         curr_lr = input_optimizer.param_groups[0]["lr"]
221 |         cosim_scores = cosim_scores.mean().item()
222 | 
223 |         target_id = dummy_ids.detach().clone()
224 |         target_id[dummy_ids == -1] = nn_indices.reshape(1, -1)
225 |         target_id = target_id[:, 1:max_char_num + 1]
226 |         decoded_texts, decoded_tokens = decode_ids(target_id, tokenizer)
227 |         decoded_text, decoded_token = decoded_texts[best_indx], decoded_tokens[best_indx]
228 | 
229 |         # save top k prompt
230 |         if cosim_scores >= top_k_sim_min and decoded_text not in save_prompts:
231 |             prompt_topk[0] = [cosim_scores, mse_loss.item(), decoded_text, decoded_token]
232 |             prompt_topk = sorted(prompt_topk, key=lambda x:x[0])
233 |             top_k_sim_min = prompt_topk[0][0]
234 |             save_prompts.append(decoded_text)
235 | 
236 |         if print_step is not None and (step % print_step == 0 or step == opt_iters-1):
237 |             per_step_message = f"step: {step}, lr: {curr_lr}"
238 |             per_step_message = f"\n{per_step_message}, " \
239 |                                f"mse: {mse_loss.item():.3f}, " \
240 |                                f"cosim: {cosim_scores:.3f}," \
241 |                                f" text: {decoded_text}, " \
242 |                                 f"token: {decoded_token}"
243 |             print(per_step_message)
244 | 
245 |         if best_sim * args.loss_weight < cosim_scores * args.loss_weight:
246 |             best_sim = cosim_scores
247 |             best_text = decoded_text
248 | 
249 |     return best_text, best_sim, prompt_topk
250 | 
251 | 
252 | def optimize_prompt(model, preprocess, tokenizer, args, device, target_images=None, target_prompts=None, ori_prompt=None,
253 |                     forbidden_words=None, suffix_num=10, object_mask=None, only_english_words=False, writer=None):
254 | 
255 |     # get target features
256 |     all_target_features = get_target_feature(model, preprocess, tokenizer, device, target_images=target_images,
257 |                                              target_prompts=target_prompts)
258 |     # get original prompt feature
259 |     with torch.no_grad():
260 |         text_input = tokenizer(
261 |             ori_prompt, padding=True, return_tensors="pt")
262 |         ori_feature = model.get_text_features(text_input.input_ids.to(device))
263 |         ori_feature = ori_feature / ori_feature.norm(p=2, dim=-1, keepdim=True)
264 | 
265 |     # only choose english
266 |     if only_english_words:
267 |         english_mask = get_token_english_mask(tokenizer.encoder, device)
268 |     else:
269 |         english_mask = None
270 | 
271 |     # optimize prompt
272 |     learned_prompt = optimize_prompt_loop(model, tokenizer,
273 |                                           all_target_features,
274 |                                           args, device, ori_prompt=ori_prompt,
275 |                                           forbidden_words=forbidden_words,
276 |                                           suffix_num=suffix_num, object_mask=object_mask,
277 |                                           english_mask=english_mask,
278 |                                           ori_feature=ori_feature,
279 |                                           writer=writer)
280 | 
281 |     return learned_prompt


--------------------------------------------------------------------------------
/perceptrontagger_model/averaged_perceptron_tagger.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/perceptrontagger_model/averaged_perceptron_tagger.pickle


--------------------------------------------------------------------------------
/pos_tagger.py:
--------------------------------------------------------------------------------
 1 | import nltk
 2 | import os
 3 | import torch.nn as nn
 4 | import numpy as np
 5 | import pdb
 6 | 
 7 | 
 8 | _POS_MAPPING = {
 9 |     "JJ": "adj",
10 |     "VB": "verb",
11 |     "NN": "noun",
12 |     "RB": "adv",
13 |     "IN": "prep",
14 |     "DT": "a(n)",
15 | }
16 | 
17 | class word_pos(nn.Module):
18 |     def __init__(self, model_path):
19 |         super(word_pos, self).__init__()
20 |         self.ret = nltk.tag.PerceptronTagger(load=False)
21 |         self.ret.load("file:" + os.path.join(model_path))
22 | 
23 | 
24 |     def modify_word(self, prompt, ori_object, pad_word, replace_type):
25 | 
26 |         prompt_words = prompt.replace(ori_object, "<pad>")  # fix the ori object
27 |         prompt_words = prompt_words.split()
28 |         for i, (word, pos) in enumerate(self.ret.tag(prompt_words)):
29 |             if replace_type[0] == "all":
30 |                 if word == "<pad>" or pos[:2] == "NN" or pos[:2] == "DT": # or pos[:2] == "IN"
31 |                     continue
32 |                 else:
33 |                     prompt_words[i] = pad_word
34 |             else:
35 |                 if word == "<pad>" or pos[:2] not in _POS_MAPPING or _POS_MAPPING[pos[:2]] not in replace_type:
36 |                     continue
37 |                 else:
38 |                     prompt_words[i] = pad_word
39 |         modified_prompt = " ".join(prompt_words)
40 |         modified_prompt = modified_prompt.replace("<pad>", ori_object)  # recover the ori object
41 | 
42 |         return modified_prompt
43 | 
44 | 
45 | if __name__ == "__main__":
46 |     pos_model = word_pos(model_path="./perceptrontagger_model/averaged_perceptron_tagger.pickle")
47 |     pos_model.modify_word("a photo of cat", "cat", "<start_of_text>", replace_type=["verb", "adj", "adv"])
48 | 


--------------------------------------------------------------------------------
/referenced_images/objects/cock/0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/cock/0.png


--------------------------------------------------------------------------------
/referenced_images/objects/cock/1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/cock/1.png


--------------------------------------------------------------------------------
/referenced_images/objects/cock/2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/cock/2.png


--------------------------------------------------------------------------------
/referenced_images/objects/cock/3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/cock/3.png


--------------------------------------------------------------------------------
/referenced_images/objects/cock/4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/cock/4.png


--------------------------------------------------------------------------------
/referenced_images/objects/cock/5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/cock/5.png


--------------------------------------------------------------------------------
/referenced_images/objects/cock/6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/cock/6.png


--------------------------------------------------------------------------------
/referenced_images/objects/cock/7.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/cock/7.png


--------------------------------------------------------------------------------
/referenced_images/objects/cock/8.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/cock/8.png


--------------------------------------------------------------------------------
/referenced_images/objects/cock/9.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/cock/9.png


--------------------------------------------------------------------------------
/referenced_images/objects/forbidden_words.txt:
--------------------------------------------------------------------------------
 1 | cock,rooster,chicken
 2 | toucan
 3 | mushroom
 4 | panda
 5 | peony
 6 | pizza
 7 | tractor
 8 | vampire
 9 | warplane
10 | zombie


--------------------------------------------------------------------------------
/referenced_images/objects/mushroom/0.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/mushroom/0.jpg


--------------------------------------------------------------------------------
/referenced_images/objects/mushroom/1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/mushroom/1.jpg


--------------------------------------------------------------------------------
/referenced_images/objects/mushroom/2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/mushroom/2.jpg


--------------------------------------------------------------------------------
/referenced_images/objects/mushroom/3.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/mushroom/3.jpg


--------------------------------------------------------------------------------
/referenced_images/objects/mushroom/4.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/mushroom/4.jpg


--------------------------------------------------------------------------------
/referenced_images/objects/mushroom/5.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/mushroom/5.jpg


--------------------------------------------------------------------------------
/referenced_images/objects/mushroom/6.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/mushroom/6.jpg


--------------------------------------------------------------------------------
/referenced_images/objects/mushroom/7.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/mushroom/7.jpg


--------------------------------------------------------------------------------
/referenced_images/objects/mushroom/8.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/mushroom/8.jpg


--------------------------------------------------------------------------------
/referenced_images/objects/mushroom/9.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/mushroom/9.jpg


--------------------------------------------------------------------------------
/referenced_images/objects/panda/0.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/panda/0.jpg


--------------------------------------------------------------------------------
/referenced_images/objects/panda/1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/panda/1.jpg


--------------------------------------------------------------------------------
/referenced_images/objects/panda/2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/panda/2.jpg


--------------------------------------------------------------------------------
/referenced_images/objects/panda/3.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/panda/3.jpg


--------------------------------------------------------------------------------
/referenced_images/objects/panda/4.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/panda/4.jpg


--------------------------------------------------------------------------------
/referenced_images/objects/panda/5.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/panda/5.jpg


--------------------------------------------------------------------------------
/referenced_images/objects/panda/6.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/panda/6.jpg


--------------------------------------------------------------------------------
/referenced_images/objects/panda/7.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/panda/7.jpg


--------------------------------------------------------------------------------
/referenced_images/objects/panda/8.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/panda/8.jpg


--------------------------------------------------------------------------------
/referenced_images/objects/panda/9.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/panda/9.jpg


--------------------------------------------------------------------------------
/referenced_images/objects/peony/0.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/peony/0.jpg


--------------------------------------------------------------------------------
/referenced_images/objects/peony/1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/peony/1.jpg


--------------------------------------------------------------------------------
/referenced_images/objects/peony/2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/peony/2.jpg


--------------------------------------------------------------------------------
/referenced_images/objects/peony/3.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/peony/3.jpg


--------------------------------------------------------------------------------
/referenced_images/objects/peony/4.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/peony/4.jpg


--------------------------------------------------------------------------------
/referenced_images/objects/peony/5.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/peony/5.jpg


--------------------------------------------------------------------------------
/referenced_images/objects/peony/6.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/peony/6.jpg


--------------------------------------------------------------------------------
/referenced_images/objects/peony/7.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/peony/7.jpg


--------------------------------------------------------------------------------
/referenced_images/objects/peony/8.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/peony/8.jpg


--------------------------------------------------------------------------------
/referenced_images/objects/peony/9.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/peony/9.jpg


--------------------------------------------------------------------------------
/referenced_images/objects/pizza/0.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/pizza/0.jpg


--------------------------------------------------------------------------------
/referenced_images/objects/pizza/1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/pizza/1.jpg


--------------------------------------------------------------------------------
/referenced_images/objects/pizza/2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/pizza/2.jpg


--------------------------------------------------------------------------------
/referenced_images/objects/pizza/3.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/pizza/3.jpg


--------------------------------------------------------------------------------
/referenced_images/objects/pizza/4.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/pizza/4.jpg


--------------------------------------------------------------------------------
/referenced_images/objects/pizza/5.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/pizza/5.jpg


--------------------------------------------------------------------------------
/referenced_images/objects/pizza/6.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/pizza/6.jpg


--------------------------------------------------------------------------------
/referenced_images/objects/pizza/7.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/pizza/7.jpg


--------------------------------------------------------------------------------
/referenced_images/objects/pizza/8.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/pizza/8.jpg


--------------------------------------------------------------------------------
/referenced_images/objects/pizza/9.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/pizza/9.jpg


--------------------------------------------------------------------------------
/referenced_images/objects/target_object.txt:
--------------------------------------------------------------------------------
 1 | /home/zcy/attack/diffusion_outputs/objects/cock
 2 | /home/zcy/attack/diffusion_outputs/objects/toucan
 3 | /home/zcy/attack/diffusion_outputs/objects/mushroom
 4 | /home/zcy/attack/diffusion_outputs/objects/panda
 5 | /home/zcy/attack/diffusion_outputs/objects/peony
 6 | /home/zcy/attack/diffusion_outputs/objects/pizza
 7 | /home/zcy/attack/diffusion_outputs/objects/tractor
 8 | /home/zcy/attack/diffusion_outputs/objects/vampire
 9 | /home/zcy/attack/diffusion_outputs/objects/warplane
10 | /home/zcy/attack/diffusion_outputs/objects/zombie


--------------------------------------------------------------------------------
/referenced_images/objects/toucan/0.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/toucan/0.jpg


--------------------------------------------------------------------------------
/referenced_images/objects/toucan/1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/toucan/1.jpg


--------------------------------------------------------------------------------
/referenced_images/objects/toucan/2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/toucan/2.jpg


--------------------------------------------------------------------------------
/referenced_images/objects/toucan/3.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/toucan/3.jpg


--------------------------------------------------------------------------------
/referenced_images/objects/toucan/4.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/toucan/4.jpg


--------------------------------------------------------------------------------
/referenced_images/objects/toucan/5.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/toucan/5.jpg


--------------------------------------------------------------------------------
/referenced_images/objects/toucan/6.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/toucan/6.jpg


--------------------------------------------------------------------------------
/referenced_images/objects/toucan/7.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/toucan/7.jpg


--------------------------------------------------------------------------------
/referenced_images/objects/toucan/8.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/toucan/8.jpg


--------------------------------------------------------------------------------
/referenced_images/objects/toucan/9.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/toucan/9.jpg


--------------------------------------------------------------------------------
/referenced_images/objects/tractor/0.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/tractor/0.jpg


--------------------------------------------------------------------------------
/referenced_images/objects/tractor/1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/tractor/1.jpg


--------------------------------------------------------------------------------
/referenced_images/objects/tractor/2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/tractor/2.jpg


--------------------------------------------------------------------------------
/referenced_images/objects/tractor/3.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/tractor/3.jpg


--------------------------------------------------------------------------------
/referenced_images/objects/tractor/4.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/tractor/4.jpg


--------------------------------------------------------------------------------
/referenced_images/objects/tractor/5.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/tractor/5.jpg


--------------------------------------------------------------------------------
/referenced_images/objects/tractor/6.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/tractor/6.jpg


--------------------------------------------------------------------------------
/referenced_images/objects/tractor/7.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/tractor/7.jpg


--------------------------------------------------------------------------------
/referenced_images/objects/tractor/8.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/tractor/8.jpg


--------------------------------------------------------------------------------
/referenced_images/objects/tractor/9.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/tractor/9.jpg


--------------------------------------------------------------------------------
/referenced_images/objects/vampire/0.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/vampire/0.jpg


--------------------------------------------------------------------------------
/referenced_images/objects/vampire/1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/vampire/1.jpg


--------------------------------------------------------------------------------
/referenced_images/objects/vampire/2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/vampire/2.jpg


--------------------------------------------------------------------------------
/referenced_images/objects/vampire/3.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/vampire/3.jpg


--------------------------------------------------------------------------------
/referenced_images/objects/vampire/4.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/vampire/4.jpg


--------------------------------------------------------------------------------
/referenced_images/objects/vampire/5.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/vampire/5.jpg


--------------------------------------------------------------------------------
/referenced_images/objects/vampire/6.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/vampire/6.jpg


--------------------------------------------------------------------------------
/referenced_images/objects/vampire/7.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/vampire/7.jpg


--------------------------------------------------------------------------------
/referenced_images/objects/vampire/8.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/vampire/8.jpg


--------------------------------------------------------------------------------
/referenced_images/objects/vampire/9.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/vampire/9.jpg


--------------------------------------------------------------------------------
/referenced_images/objects/warplane/0.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/warplane/0.jpg


--------------------------------------------------------------------------------
/referenced_images/objects/warplane/1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/warplane/1.jpg


--------------------------------------------------------------------------------
/referenced_images/objects/warplane/2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/warplane/2.jpg


--------------------------------------------------------------------------------
/referenced_images/objects/warplane/3.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/warplane/3.jpg


--------------------------------------------------------------------------------
/referenced_images/objects/warplane/4.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/warplane/4.jpg


--------------------------------------------------------------------------------
/referenced_images/objects/warplane/5.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/warplane/5.jpg


--------------------------------------------------------------------------------
/referenced_images/objects/warplane/6.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/warplane/6.jpg


--------------------------------------------------------------------------------
/referenced_images/objects/warplane/7.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/warplane/7.jpg


--------------------------------------------------------------------------------
/referenced_images/objects/warplane/8.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/warplane/8.jpg


--------------------------------------------------------------------------------
/referenced_images/objects/warplane/9.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/warplane/9.jpg


--------------------------------------------------------------------------------
/referenced_images/objects/zombie/0.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/zombie/0.jpg


--------------------------------------------------------------------------------
/referenced_images/objects/zombie/1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/zombie/1.jpg


--------------------------------------------------------------------------------
/referenced_images/objects/zombie/2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/zombie/2.jpg


--------------------------------------------------------------------------------
/referenced_images/objects/zombie/3.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/zombie/3.jpg


--------------------------------------------------------------------------------
/referenced_images/objects/zombie/4.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/zombie/4.jpg


--------------------------------------------------------------------------------
/referenced_images/objects/zombie/5.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/zombie/5.jpg


--------------------------------------------------------------------------------
/referenced_images/objects/zombie/6.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/zombie/6.jpg


--------------------------------------------------------------------------------
/referenced_images/objects/zombie/7.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/zombie/7.jpg


--------------------------------------------------------------------------------
/referenced_images/objects/zombie/8.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/zombie/8.jpg


--------------------------------------------------------------------------------
/referenced_images/objects/zombie/9.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/objects/zombie/9.jpg


--------------------------------------------------------------------------------
/referenced_images/style/animation/Amongst_the_ruins,_a_leopard_is_roaring.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/style/animation/Amongst_the_ruins,_a_leopard_is_roaring.jpg


--------------------------------------------------------------------------------
/referenced_images/style/animation/An_Indian_elephant_adorned_with_a_golden_saddle_paraded_through_the_forest.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/style/animation/An_Indian_elephant_adorned_with_a_golden_saddle_paraded_through_the_forest.jpg


--------------------------------------------------------------------------------
/referenced_images/style/animation/The_English_setter_gracefully_pointed_towards_hidden_game_in_the_tall_grass.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/style/animation/The_English_setter_gracefully_pointed_towards_hidden_game_in_the_tall_grass.jpg


--------------------------------------------------------------------------------
/referenced_images/style/animation/The_king_snake_slithered_stealthily_through_the_fallen_leaves.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/style/animation/The_king_snake_slithered_stealthily_through_the_fallen_leaves.jpg


--------------------------------------------------------------------------------
/referenced_images/style/animation/The_leopard_stealthily_prowled_through_the_moonlit_castle_courtyard.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/style/animation/The_leopard_stealthily_prowled_through_the_moonlit_castle_courtyard.jpg


--------------------------------------------------------------------------------
/referenced_images/style/animation/The_ram_playfully_butted_heads_with_its_fellow_herd_members.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/style/animation/The_ram_playfully_butted_heads_with_its_fellow_herd_members.jpg


--------------------------------------------------------------------------------
/referenced_images/style/animation/a_photo_of_consomme.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/style/animation/a_photo_of_consomme.jpg


--------------------------------------------------------------------------------
/referenced_images/style/animation/a_photo_of_garbage_truck.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/style/animation/a_photo_of_garbage_truck.jpg


--------------------------------------------------------------------------------
/referenced_images/style/animation/a_photo_of_hotdog.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/style/animation/a_photo_of_hotdog.jpg


--------------------------------------------------------------------------------
/referenced_images/style/animation/a_photo_of_laptop.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/style/animation/a_photo_of_laptop.jpg


--------------------------------------------------------------------------------
/referenced_images/style/animation/a_photo_of_military_uniform.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/style/animation/a_photo_of_military_uniform.jpg


--------------------------------------------------------------------------------
/referenced_images/style/animation/a_photo_of_submarine.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/style/animation/a_photo_of_submarine.jpg


--------------------------------------------------------------------------------
/referenced_images/style/animation/a_photo_of_tractor.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/style/animation/a_photo_of_tractor.jpg


--------------------------------------------------------------------------------
/referenced_images/style/animation/a_photo_of_violin.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/style/animation/a_photo_of_violin.jpg


--------------------------------------------------------------------------------
/referenced_images/style/animation/a_ram_grazed_peacefully_under_the_stars.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/style/animation/a_ram_grazed_peacefully_under_the_stars.jpg


--------------------------------------------------------------------------------
/referenced_images/style/forbidden_words.txt:
--------------------------------------------------------------------------------
1 | animation
2 | oil painting
3 | sketch
4 | watercolor


--------------------------------------------------------------------------------
/referenced_images/style/oil painting/Amongst_the_ruins,_a_leopard_is_roaring.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/style/oil painting/Amongst_the_ruins,_a_leopard_is_roaring.jpg


--------------------------------------------------------------------------------
/referenced_images/style/oil painting/An_Indian_elephant_adorned_with_a_golden_saddle_paraded_through_the_forest.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/style/oil painting/An_Indian_elephant_adorned_with_a_golden_saddle_paraded_through_the_forest.jpg


--------------------------------------------------------------------------------
/referenced_images/style/oil painting/The_English_setter_gracefully_pointed_towards_hidden_game_in_the_tall_grass.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/style/oil painting/The_English_setter_gracefully_pointed_towards_hidden_game_in_the_tall_grass.jpg


--------------------------------------------------------------------------------
/referenced_images/style/oil painting/The_king_snake_slithered_stealthily_through_the_fallen_leaves.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/style/oil painting/The_king_snake_slithered_stealthily_through_the_fallen_leaves.jpg


--------------------------------------------------------------------------------
/referenced_images/style/oil painting/The_leopard_stealthily_prowled_through_the_moonlit_castle_courtyard.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/style/oil painting/The_leopard_stealthily_prowled_through_the_moonlit_castle_courtyard.jpg


--------------------------------------------------------------------------------
/referenced_images/style/oil painting/The_ram_playfully_butted_heads_with_its_fellow_herd_members.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/style/oil painting/The_ram_playfully_butted_heads_with_its_fellow_herd_members.jpg


--------------------------------------------------------------------------------
/referenced_images/style/oil painting/a_photo_of_consomme.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/style/oil painting/a_photo_of_consomme.jpg


--------------------------------------------------------------------------------
/referenced_images/style/oil painting/a_photo_of_garbage_truck.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/style/oil painting/a_photo_of_garbage_truck.jpg


--------------------------------------------------------------------------------
/referenced_images/style/oil painting/a_photo_of_hotdog.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/style/oil painting/a_photo_of_hotdog.jpg


--------------------------------------------------------------------------------
/referenced_images/style/oil painting/a_photo_of_laptop.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/style/oil painting/a_photo_of_laptop.jpg


--------------------------------------------------------------------------------
/referenced_images/style/oil painting/a_photo_of_military_uniform.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/style/oil painting/a_photo_of_military_uniform.jpg


--------------------------------------------------------------------------------
/referenced_images/style/oil painting/a_photo_of_submarine.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/style/oil painting/a_photo_of_submarine.jpg


--------------------------------------------------------------------------------
/referenced_images/style/oil painting/a_photo_of_tractor.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/style/oil painting/a_photo_of_tractor.jpg


--------------------------------------------------------------------------------
/referenced_images/style/oil painting/a_photo_of_violin.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/style/oil painting/a_photo_of_violin.jpg


--------------------------------------------------------------------------------
/referenced_images/style/oil painting/a_ram_grazed_peacefully_under_the_stars.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/style/oil painting/a_ram_grazed_peacefully_under_the_stars.jpg


--------------------------------------------------------------------------------
/referenced_images/style/sketch/Amongst_the_ruins,_a_leopard_is_roaring.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/style/sketch/Amongst_the_ruins,_a_leopard_is_roaring.jpg


--------------------------------------------------------------------------------
/referenced_images/style/sketch/An_Indian_elephant_paraded_through_the_forest.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/style/sketch/An_Indian_elephant_paraded_through_the_forest.jpg


--------------------------------------------------------------------------------
/referenced_images/style/sketch/The_English_setter_gracefully_ran_in_the_tall_grass.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/style/sketch/The_English_setter_gracefully_ran_in_the_tall_grass.jpg


--------------------------------------------------------------------------------
/referenced_images/style/sketch/The_king_snake_slithered_stealthily_through_the_fallen_leaves.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/style/sketch/The_king_snake_slithered_stealthily_through_the_fallen_leaves.jpg


--------------------------------------------------------------------------------
/referenced_images/style/sketch/The_leopard_stealthily_prowled_through_the_moonlit_castle_courtyard.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/style/sketch/The_leopard_stealthily_prowled_through_the_moonlit_castle_courtyard.jpg


--------------------------------------------------------------------------------
/referenced_images/style/sketch/The_ram_playfully_butted_heads_with_its_fellow_herd_members.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/style/sketch/The_ram_playfully_butted_heads_with_its_fellow_herd_members.jpg


--------------------------------------------------------------------------------
/referenced_images/style/sketch/a_photo_of_consomme.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/style/sketch/a_photo_of_consomme.jpg


--------------------------------------------------------------------------------
/referenced_images/style/sketch/a_photo_of_garbage_truck.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/style/sketch/a_photo_of_garbage_truck.jpg


--------------------------------------------------------------------------------
/referenced_images/style/sketch/a_photo_of_hotdog.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/style/sketch/a_photo_of_hotdog.jpg


--------------------------------------------------------------------------------
/referenced_images/style/sketch/a_photo_of_laptop.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/style/sketch/a_photo_of_laptop.jpg


--------------------------------------------------------------------------------
/referenced_images/style/sketch/a_photo_of_military_uniform.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/style/sketch/a_photo_of_military_uniform.jpg


--------------------------------------------------------------------------------
/referenced_images/style/sketch/a_photo_of_submarine.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/style/sketch/a_photo_of_submarine.jpg


--------------------------------------------------------------------------------
/referenced_images/style/sketch/a_photo_of_tractor.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/style/sketch/a_photo_of_tractor.jpg


--------------------------------------------------------------------------------
/referenced_images/style/sketch/a_photo_of_violin.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/style/sketch/a_photo_of_violin.jpg


--------------------------------------------------------------------------------
/referenced_images/style/sketch/a_ram_grazed_peacefully_under_the_stars.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/style/sketch/a_ram_grazed_peacefully_under_the_stars.jpg


--------------------------------------------------------------------------------
/referenced_images/style/target_style.txt:
--------------------------------------------------------------------------------
1 | /home/zcy/attack/diffusion_outputs/style/animation
2 | /home/zcy/attack/diffusion_outputs/style/oil painting
3 | /home/zcy/attack/diffusion_outputs/style/sketch
4 | /home/zcy/attack/diffusion_outputs/style/watercolor


--------------------------------------------------------------------------------
/referenced_images/style/watercolor/Amongst_the_ruins,_a_leopard_is_roaring.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/style/watercolor/Amongst_the_ruins,_a_leopard_is_roaring.jpg


--------------------------------------------------------------------------------
/referenced_images/style/watercolor/An_Indian_elephant_paraded_through_the_forest.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/style/watercolor/An_Indian_elephant_paraded_through_the_forest.jpg


--------------------------------------------------------------------------------
/referenced_images/style/watercolor/The_English_setter_gracefully_ran_in_the_tall_grass.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/style/watercolor/The_English_setter_gracefully_ran_in_the_tall_grass.jpg


--------------------------------------------------------------------------------
/referenced_images/style/watercolor/The_king_snake_slithered_stealthily_through_the_fallen_leaves.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/style/watercolor/The_king_snake_slithered_stealthily_through_the_fallen_leaves.jpg


--------------------------------------------------------------------------------
/referenced_images/style/watercolor/The_leopard_stealthily_prowled_through_the_moonlit_castle_courtyard.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/style/watercolor/The_leopard_stealthily_prowled_through_the_moonlit_castle_courtyard.jpg


--------------------------------------------------------------------------------
/referenced_images/style/watercolor/The_ram_playfully_butted_heads_with_its_fellow_herd_members.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/style/watercolor/The_ram_playfully_butted_heads_with_its_fellow_herd_members.jpg


--------------------------------------------------------------------------------
/referenced_images/style/watercolor/a_photo_of_consomme.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/style/watercolor/a_photo_of_consomme.jpg


--------------------------------------------------------------------------------
/referenced_images/style/watercolor/a_photo_of_garbage_truck.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/style/watercolor/a_photo_of_garbage_truck.jpg


--------------------------------------------------------------------------------
/referenced_images/style/watercolor/a_photo_of_hotdog.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/style/watercolor/a_photo_of_hotdog.jpg


--------------------------------------------------------------------------------
/referenced_images/style/watercolor/a_photo_of_laptop.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/style/watercolor/a_photo_of_laptop.jpg


--------------------------------------------------------------------------------
/referenced_images/style/watercolor/a_photo_of_military_uniform.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/style/watercolor/a_photo_of_military_uniform.jpg


--------------------------------------------------------------------------------
/referenced_images/style/watercolor/a_photo_of_submarine.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/style/watercolor/a_photo_of_submarine.jpg


--------------------------------------------------------------------------------
/referenced_images/style/watercolor/a_photo_of_tractor.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/style/watercolor/a_photo_of_tractor.jpg


--------------------------------------------------------------------------------
/referenced_images/style/watercolor/a_photo_of_violin.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/style/watercolor/a_photo_of_violin.jpg


--------------------------------------------------------------------------------
/referenced_images/style/watercolor/a_ram_grazed_peacefully_under_the_stars.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datar001/Revealing-Vulnerabilities-in-Stable-Diffusion-via-Targeted-Attacks/21eb35f16697529388a43bf1504089eccc6bc432/referenced_images/style/watercolor/a_ram_grazed_peacefully_under_the_stars.jpg


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | PyTorch == 2.0.1
2 | transformers == 4.23.1
3 | diffusers == 0.11.1


--------------------------------------------------------------------------------
/run.py:
--------------------------------------------------------------------------------
  1 | import sys
  2 | from PIL import Image
  3 | import os
  4 | import time
  5 | from pos_tagger import word_pos
  6 | import argparse
  7 | from optim_utils import *
  8 | from transformers import CLIPModel, CLIPTokenizer, AutoProcessor
  9 | from modified_clip import Modified_ClipModel
 10 | from get_object_attention_mask import OSModel
 11 | from synonym import Synonym
 12 | import pdb
 13 | 
 14 | def write_log(file, text, print_console=True):
 15 |     file.write(text + "\n")
 16 |     if print_console:
 17 |         print(text)
 18 | 
 19 | def save_top_k_results(outputdir, ori_prompt, prompt_topk):
 20 |     save_file = open(os.path.join(outputdir, ori_prompt + '.txt'), "w")
 21 |     for k, (sim, mse, prompt, token) in enumerate(prompt_topk):
 22 |         if k > len(prompt_topk) - 10:
 23 |             print_console = True
 24 |         else:
 25 |             print_console = False
 26 |         write_log(save_file, "sim: {:.3f}, mse: {:.3f}".format(sim, mse), print_console)
 27 |         write_log(save_file, "prompt: {}".format(prompt), print_console)
 28 |         write_log(save_file, "token: {}".format(token), print_console)
 29 | 
 30 | if __name__ == "__main__":
 31 | 
 32 |     parser = argparse.ArgumentParser()
 33 |     parser.add_argument('--config_path', type=str, required=True, help='experiment configuration')
 34 | 
 35 |     # load args
 36 |     print("Initializing...")
 37 |     args = argparse.Namespace()
 38 |     args.__dict__.update(read_json(parser.parse_args().config_path))
 39 | 
 40 |     # output logger setting
 41 |     output_dir = args.output_dir
 42 |     if os.path.exists(output_dir):
 43 |         replace_type = input("The output path has existed, replace all? (yes/no) ")
 44 |         if replace_type == "no":
 45 |             exit()
 46 |         elif replace_type == "yes":
 47 |             pass
 48 |         else:
 49 |             raise ValueError("Answer must be yes or no")
 50 |     os.makedirs(output_dir, exist_ok=True)
 51 | 
 52 |     # load CLIP model
 53 |     device = "cuda" if torch.cuda.is_available() else "cpu"
 54 |     model = Modified_ClipModel.from_pretrained(args.clip_model).to(device)
 55 |     tokenizer = CLIPTokenizer.from_pretrained(args.clip_model)
 56 |     preprocess = AutoProcessor.from_pretrained(args.clip_model)
 57 | 
 58 |     # load synonym detection model
 59 |     synonym_model = Synonym(word_path="./Word2Vec/", device=device)
 60 | 
 61 |     # load all target goals
 62 |     with open(args.target_path, 'r') as f:
 63 |         target_goals = f.readlines()
 64 |     target_goals = [goal.strip() for goal in target_goals]
 65 | 
 66 |     # load all forbidden words
 67 |     with open(args.forbidden_words, "r") as f:
 68 |         forbidden_words = f.readlines()
 69 |     forbidden_words = [words.strip().split(',') for words in forbidden_words]
 70 | 
 71 |     assert len(target_goals) == len(forbidden_words), "The number of target goals must equal to the number " \
 72 |                                                       f"of forbidden words, but get {len(target_goals)} target goals " \
 73 |                                                       f"and {len(forbidden_words)} forbidden words"
 74 | 
 75 |     # load imagenet-mini label
 76 |     object_path = r"./mini_100.txt"
 77 |     with open(object_path, "r") as f:
 78 |         objects = f.readlines()
 79 |     objects = [obj.strip() for obj in objects]
 80 | 
 81 |     # load word property model
 82 |     word_prop_model = word_pos(model_path="./perceptrontagger_model/averaged_perceptron_tagger.pickle")
 83 |     # load the object decouple model
 84 |     if args.task == "style":
 85 |         object_style_model = OSModel(model, args.clip_model, object_path, device)
 86 |         
 87 |     # load original prompts
 88 |     with open(args.prompt_path, 'r') as f:
 89 |         original_prompts = f.readlines()
 90 | 
 91 |     # training for each attack goal
 92 |     for cur_forbidden_words, goal_path in zip(forbidden_words, target_goals):
 93 |         target_object = goal_path.split('/')[-1]
 94 |         print('\n\tStart to train a new goal: {}\n'.format(target_object))
 95 | 
 96 |         # load the target image
 97 |         orig_images = [Image.open(os.path.join(goal_path, image_name)) for image_name in os.listdir(goal_path)]
 98 |         cur_output_dir = os.path.join(output_dir, target_object)
 99 |         if os.path.exists(cur_output_dir):
100 |             replace_type = input("The adv prompt output path has existed, replace all? (yes/no) ")
101 |             if replace_type == "no":
102 |                 exit()
103 |             elif replace_type == "yes":
104 |                 pass
105 |             else:
106 |                 raise ValueError("Answer must be yes or no")
107 |         os.makedirs(cur_output_dir, exist_ok=True)
108 |         args.cur_output_dir = cur_output_dir
109 | 
110 |         # define the output dir for each target goal
111 |         writer_result = open(os.path.join(cur_output_dir, "results.txt"), "w")
112 |         writer_logger = open(os.path.join(cur_output_dir, "logger.txt"), "w")
113 |         topk_prompt_dir = os.path.join(cur_output_dir, "topk_results")
114 |         os.makedirs(topk_prompt_dir, exist_ok=True)
115 | 
116 |         # print the parameter
117 |         write_log(writer_logger, "======  Current Parameter =======")
118 |         for para in args.__dict__:
119 |             write_log(writer_logger, para + ': ' + str(args.__dict__[para]))
120 | 
121 |         # choose forbidden words
122 |         synonym_words = synonym_model.get_synonym(cur_forbidden_words, k=args.synonym_num)
123 |         for word in cur_forbidden_words:
124 |             if len(word.split()) > 1:
125 |                 cur_forbidden_words.extend(word.split())
126 |         cur_forbidden_words.extend([word[0] for word in synonym_words])
127 |         write_log(writer_result, "the forbidden words is {}".format(cur_forbidden_words))
128 | 
129 |         init_time = time.time()
130 |         for i in range(len(original_prompts)):
131 |             start_time = time.time()
132 |             ori_object = objects[i].lower()
133 |             ori_prompt = original_prompts[i].strip().lower()
134 |             assert ori_object in ori_prompt, "Not match the ori object and the ori prompt, " \
135 |                                              f"obj: {ori_object}, prompt: {ori_prompt}"
136 | 
137 |             if len(args.replace_type) > 0:
138 |                 # replace words
139 |                 input_prompt = word_prop_model.modify_word(ori_prompt, ori_object, pad_word="<|startoftext|>",
140 |                                                            replace_type=args.replace_type)
141 |             else:
142 |                 input_prompt = ori_prompt
143 | 
144 |             if args.task == "style":
145 |                 object_mask, object_ratio = object_style_model.forward(ori_prompt, ori_object, ref_num=10)
146 |                 write_log(writer_logger, f"object ratio is {object_ratio.item()}")
147 |             else:
148 |                 object_mask = None
149 | 
150 |             write_log(writer_logger, "Start to train {}^th object: {}, \n "
151 |                                      "target goal: {},  the prompt: {},  \n the input prompt: {}".format(
152 |                                     i, ori_object, target_object, ori_prompt,
153 |                                     input_prompt + ' ' + ' '.join(["<|startoftext|>"] * args.add_suffix_num)))
154 | 
155 |             assert ori_object in input_prompt, "Not match the ori object and the input prompt"
156 |             learned_prompt, best_sim, prompt_topk = optimize_prompt(
157 |                                                         model, preprocess, tokenizer, args, device,
158 |                                                         ori_prompt=input_prompt,
159 |                                                         target_images=orig_images, forbidden_words=cur_forbidden_words,
160 |                                                         suffix_num=args.add_suffix_num, object_mask=object_mask,
161 |                                                         only_english_words=True)
162 |             end_time = time.time()
163 |             write_log(writer_logger, "The final prompt is {}".format(learned_prompt))
164 |             write_log(writer_logger, "The best sim is {:.3f}".format(best_sim))
165 |             write_log(writer_logger, "Spent time: {:.3f}s".format(end_time-start_time))
166 |             save_top_k_results(topk_prompt_dir, ori_prompt, prompt_topk)
167 |             writer_result.write(learned_prompt + "\n")
168 |         finish_time = time.time()
169 |         all_time = finish_time - init_time
170 |         write_log(writer_logger, "Cong!! Finish the experiment of {}, spent time is {}h{}m".format(
171 |             target_object, all_time//3600, (all_time%3600)//60))
172 | 


--------------------------------------------------------------------------------
/simple_prompt.txt:
--------------------------------------------------------------------------------
  1 | A dalmatian is playfully chasing a ball in the park
  2 | A nematode is burrowing deep in the garden soil
  3 | A unicycle is skillfully balanced by a street performer
  4 | A fishing reel whirs as it casts into the tranquil lake
  5 | An upright piano is serenading a room with classical music
  6 | An Arctic fox is camouflaged against the snowy tundra
  7 | A photocopier is duplicating important documents in the office
  8 | An aircraft carrier is docked in the naval base
  9 | A combination lock secures the entrance to a secret chamber
 10 | An orange is rolling off the kitchen counter
 11 | The cliff offers breathtaking views of the ocean waves below
 12 | A three-toed sloth leisurely hangs from a rainforest tree branch
 13 | The theater curtain rises, revealing the grand stage performance
 14 | A gymnast gracefully swings on the horizontal bar in the gym
 15 | The tile roof glistens under the midday sun
 16 | A dishrag diligently scrubs away dinner's remnants
 17 | A snorkel allows the diver to explore the vibrant coral reef
 18 | The bartender deftly shakes a cocktail shaker for a customer
 19 | A rhinoceros beetle crawls across a fallen leaf
 20 | A trifle dessert is served in a crystal dish
 21 | A prayer rug lies on the floor for daily devotion
 22 | A dugong peacefully grazes on seagrass in the ocean
 23 | A school bus is parked peacefully in the schoolyard
 24 | A slot is placed carefully somewhere secure.
 25 | The organ fills the church with majestic music
 26 | An oboe player performs a haunting melody
 27 | A bookshop is filled with the aroma of new and old books
 28 | An hourglass is measuring the passage of time on the mantelpiece
 29 | A boxer boxer trains rigorously in the gym
 30 | An earring dangles from her delicate earlobe
 31 | The lipstick rests gracefully on the vanity
 32 | A file neatly organizes documents in the drawer
 33 | An electric guitar wails with rock and roll power
 34 | A harvestman crawls on a tree trunk
 35 | The coral reef teems with colorful fish and marine life
 36 | An Ibizan hound races across the open field
 37 | An African hunting dog is runing in the savannah
 38 | A spider web glistens with morning dew
 39 | A missile soars through the sky on a mission
 40 | A holocanthus tricolor gracefully swims amidst coral reefs
 41 | A cuirass protects the knight in shining armor
 42 | The scoreboard displays the game's current score
 43 | A hotdog sizzles on the grill at the barbecue
 44 | A beer bottle is opened to celebrate with friends
 45 | A Walker hound barks excitedly in the forest
 46 | A gong hangs serenely in the meditation room
 47 | A triceratops roams the prehistoric landscape
 48 | A house finch chirps cheerfully from the garden
 49 | A clog adorns the dancer's foot on stage
 50 | A mixing bowl combines ingredients for a delicious recipe
 51 | A toucan perches gracefully on a lush rainforest branch
 52 | A carton rests quietly on the kitchen shelf
 53 | A bolete mushroom thrives beneath the ancient oak tree
 54 | A Tibetan mastiff guards diligently at the monastery entrance
 55 | A Gordon setter romps happily in the sunlit backyard
 56 | A garbage truck collects waste efficiently on suburban streets
 57 | A yawl sails gracefully on the serene blue ocean waters
 58 | A robin perches on a tree branch, singing a cheerful melody
 59 | A vase displays vibrant flowers, brightening the room's decor
 60 | A barrel rolls gently down the brewery's wooden ramp
 61 | A street sign points to the city center
 62 | A goose waddles leisurely by the tranquil pond
 63 | A solar dish harnesses energy from the desert sun
 64 | A malamute sleds through the snowy wilderness with strength
 65 | A consomme simmers gently on the stovetop, filling the kitchen
 66 | A Saluki dog sprints gracefully across the desert sands
 67 | An iPod plays soothing music in the table
 68 | Parallel bars stand sturdy in the gymnastics training room
 69 | A miniature poodle prances joyfully in the park
 70 | A poncho provides warmth during the chilly mountain hike
 71 | An ant carries a tiny leaf through the intricate colony
 72 | A meerkat stands sentinel, alert in the African savannah
 73 | A ladybug rests peacefully on a vibrant wildflower
 74 | A French bulldog naps contentedly on the cozy couch
 75 | A miniskirt twirls gracefully on the dance floor
 76 | A king crab scuttles along the ocean floor, its armor gleaming
 77 | A dome crowns the elegant architecture of the museum
 78 | A golden retriever fetches a ball with boundless enthusiasm
 79 | An ashcan sits quietly at the corner of the street
 80 | A green mamba slithers silently through the dense jungle
 81 | A hair slide adorns her lustrous locks with elegance
 82 | A komondor guards the flock of sheep with vigilance
 83 | A cannon rests stoically on the historic battleground
 84 | A tank maneuvers carefully through the rugged terrain
 85 | A fire screen protects the hearth from dancing embers
 86 | A carousel spins merrily at the amusement park
 87 | A crate holds fresh produce at the bustling market
 88 | A frying pan sizzles with the aroma of breakfast delights
 89 | A stage awaits the performance under the glowing spotlight
 90 | A holster secures the firearm on the police officer's belt
 91 | A tobacco shop showcases pipes and aromatic blends
 92 | A black-footed ferret explores the prairie with curiosity
 93 | A white wolf prowls silently through the snowy forest
 94 | A worm fence meanders through the picturesque countryside
 95 | A jellyfish drifts gracefully in the azure ocean currents
 96 | A wok sizzles with the flavors of an Asian stir-fry
 97 | A Newfoundland dog rescues a swimmer in distress
 98 | A pencil box holds a collection of colorful writing tools
 99 | A lion roars majestically in the heart of the savannah
100 | A catamaran sails swiftly on the tranquil sea, powered by wind


--------------------------------------------------------------------------------
/style_config.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "add_suffix_num": 0,
 3 |     "replace_type": ["all"],
 4 |     "synonym_num": 10,
 5 |     "iter": 500,
 6 |     "lr": 0.1,
 7 |     "weight_decay": 0.1,
 8 |     "loss_weight": 1.0,
 9 |     "print_step": 100,
10 |     "batch_size": 1,
11 |     "clip_model": "laion/CLIP-ViT-H-14-laion2B-s32B-b79K",
12 |     "prompt_path": "simple_prompt.txt",
13 |     "task":"style", 
14 |     "forbidden_words": "./referenced_images/style/forbidden_words.txt",
15 |     "target_path":"./referenced_images/style/target_style.txt",
16 |     "output_dir": "../runs/attack/formal_experiment/style/submit_test"
17 | }


--------------------------------------------------------------------------------
/synonym.py:
--------------------------------------------------------------------------------
  1 | import torch.nn as nn
  2 | import torch
  3 | import re, os
  4 | import pickle
  5 | import pdb
  6 | 
  7 | def isEnglish(s):
  8 |     try:
  9 |         s.encode(encoding='utf-8').decode('ascii')
 10 |     except UnicodeDecodeError:
 11 |         return False
 12 |     else:
 13 |         return True
 14 | 
 15 | def is_english(text):
 16 |     # matching English words using the regular expression
 17 |     pattern = re.compile(r'[^a-zA-Z\s]')
 18 |     if pattern.search(text):
 19 |         return False
 20 |     else:
 21 |         return True
 22 | 
 23 | def get_token_english_mask(vocab_encoder, device):
 24 |     token_num = len(vocab_encoder)
 25 |     mask = torch.ones(token_num).bool()
 26 |     for token, id in vocab_encoder.items():
 27 |         if is_english(token.replace("</w>", "")):
 28 |             mask[id] = 0
 29 |     mask = mask.to(device)
 30 |     return mask
 31 | 
 32 | 
 33 | class Synonym(nn.Module):
 34 |     def __init__(self, word_path, device) -> None:
 35 |         super(Synonym, self).__init__()
 36 |         word2id = pickle.load(open(os.path.join(word_path, "word2id.pkl"), "rb"))
 37 |         wordvec = pickle.load(open(os.path.join(word_path, "wordvec.pkl"), "rb"))
 38 |         self.word2id = word2id
 39 |         self.id2word = {id_:word_ for word_, id_ in self.word2id.items()}
 40 |         self.embedding = torch.from_numpy(wordvec)
 41 |         # normalization
 42 |         self.embedding = self.embedding / self.embedding.norm(dim=1, keepdim=True)
 43 |         self.embedding = self.embedding.to(device)
 44 |         # delete non english words
 45 |         self.delete_non_english()
 46 | 
 47 |     def delete_non_english(self):
 48 |         for word, id in self.word2id.items():
 49 |             if not is_english(word):
 50 |                 self.embedding[id] = 0
 51 | 
 52 |     def transform(self, word, token_unk):
 53 |         if word in self.word2id:
 54 |             return self.embedding[self.word2id[word]]
 55 |         else:
 56 |             if isinstance(token_unk, int):
 57 |                 return self.embedding[token_unk]
 58 |             else:
 59 |                 return self.embedding[self.word2id[token_unk]]
 60 | 
 61 |     def get_synonym(self, words, k=5, word2id=None, embedding=None, id2word=None):
 62 | 
 63 |         word2id = word2id if word2id is not None else self.word2id
 64 |         embedding = embedding if embedding is not None else self.embedding
 65 |         id2word = id2word if id2word is not None else self.id2word
 66 | 
 67 |         if type(words) == str:
 68 |             words = [words]
 69 |         results = []
 70 |         for word in words:
 71 |             if len(word.split()) > 1:
 72 |                 results.extend(self.get_synonym(word.split(), k=k))
 73 |             else:
 74 |                 if word not in word2id:
 75 |                     results.append([word, -1, -1])
 76 |                     continue
 77 |                 word_id = word2id[word]
 78 |                 word_embedding = embedding[word_id]
 79 |                 sims = torch.mm(word_embedding.view(1,-1), embedding.t())
 80 |                 top_k_values, top_k_id = torch.topk(sims, k=k, dim=1, largest=True, sorted=False)
 81 | 
 82 |                 for id, sim in sorted([[id.item(), value.item()] for id, value in zip(top_k_id[0], top_k_values[0])],
 83 |                                       key=lambda x:x[1], reverse=True):
 84 |                     cur_word = id2word[id]
 85 |                     if cur_word != word:
 86 |                         results.append([cur_word, id, sim])
 87 |         return results
 88 | 
 89 |     def get_synonym_by_tokenizer(self, word, tokenizer, k=5):
 90 |         embedding = tokenizer.token_embedding.weight
 91 |         embedding = embedding / embedding.norm(dim=1, keepdim=True)
 92 | 
 93 |         word2id = tokenizer.encoder
 94 |         id2word = tokenizer.decoder
 95 | 
 96 |         return self.get_synonym(word, k, word2id=word2id, embedding=embedding, id2word=id2word)
 97 | 
 98 | 
 99 | if __name__ == "__main__":
100 |     device = "cuda" if torch.cuda.is_available() else "cpu"
101 |     synonym_model = Synonym(word_path="./Word2Vec/", device=device)
102 |     pdb.set_trace()
103 |     synonym_words = synonym_model.get_synonym(["ice cream"], k=10)
104 |     forbidden_words = ["ice cream"]
105 |     forbidden_words.extend([word[0] for word in synonym_words])
106 |     print(forbidden_words)
107 | 
108 | 


--------------------------------------------------------------------------------
/test_object_multi.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import argparse
  3 | import torch.nn as nn
  4 | from transformers import CLIPModel, AutoTokenizer, AutoProcessor
  5 | from diffusers import DPMSolverMultistepScheduler, StableDiffusionPipeline
  6 | from classification import ClassificationModel
  7 | import os, json
  8 | import time
  9 | from typing import Any, Mapping
 10 | import numpy as np
 11 | import random
 12 | import pdb
 13 | 
 14 | def read_json(filename: str) -> Mapping[str, Any]:
 15 |     """Returns a Python dict representation of JSON object at input file."""
 16 |     with open(filename) as fp:
 17 |         return json.load(fp)
 18 | 
 19 | def save_image(image, path):
 20 |     dir_path = os.path.dirname(path)
 21 |     os.makedirs(dir_path, exist_ok=True)
 22 |     image.save(path)
 23 | 
 24 | def metric(probs, gt, return_false=False):
 25 |     bs = probs.size(0)
 26 |     max_v, max_index = torch.max(probs, dim=-1)
 27 |     acc = (max_index == gt).sum()
 28 |     if return_false:
 29 |         # pdb.set_trace()
 30 |         false_index = torch.where(max_index != gt)[0]
 31 |         return acc, false_index
 32 |     return acc
 33 | 
 34 | def write_log(file, text):
 35 |     file.write(text + "\n")
 36 |     print(text)
 37 | 
 38 | def set_random_seed(seed=0):
 39 |     torch.manual_seed(seed + 0)
 40 |     torch.cuda.manual_seed(seed + 1)
 41 |     torch.cuda.manual_seed_all(seed + 2)
 42 |     np.random.seed(seed + 3)
 43 |     torch.cuda.manual_seed_all(seed + 4)
 44 |     random.seed(seed + 5)
 45 | 
 46 | if __name__ == "__main__":
 47 |     parser = argparse.ArgumentParser()
 48 |     parser.add_argument('--config_path', type=str, required=True, help='experiment configuration')
 49 | 
 50 |     args = argparse.Namespace()
 51 |     args.__dict__.update(read_json(parser.parse_args().config_path))
 52 |     print("======  Current Parameter =======")
 53 |     for para in args.__dict__:
 54 |         print(para + ': ' + str(args.__dict__[para]))
 55 | 
 56 |     # define the image output dir
 57 |     args.image_output_dir = os.path.join(r"../diffusion_outputs/attack/formal_experiment/",
 58 |                                          args.output_dir.split('formal_experiment/')[-1])
 59 |     print("generated path:{}".format(args.image_output_dir))
 60 |     if os.path.exists(args.image_output_dir):
 61 |         replace_type = input("The image output path has existed, replace all? (yes/no) ")
 62 |         if replace_type == "no":
 63 |             exit()
 64 |         elif replace_type == "yes":
 65 |             pass
 66 |         else:
 67 |             raise ValueError("Answer must be yes or no")
 68 |     os.makedirs(args.image_output_dir, exist_ok=True)
 69 | 
 70 |     # load prompt labels
 71 |     label_path = "./mini_100.txt"
 72 |     with open(label_path, 'r') as f:
 73 |         label_infos = f.readlines()
 74 |     label_infos = [label.lower().strip() for label in label_infos]
 75 | 
 76 |     # load diffusion model
 77 |     # stabilityai/stable-diffusion-2-1-base  # runwayml/stable-diffusion-v1-5
 78 |     device = "cuda" if torch.cuda.is_available() else "cpu"
 79 |     model_id = "stabilityai/stable-diffusion-2-1-base"
 80 |     pipe = StableDiffusionPipeline.from_pretrained(
 81 |         model_id,
 82 |         torch_dtype=torch.float16,
 83 |         revision="fp16",
 84 |     )
 85 |     pipe = pipe.to(device)
 86 |     image_length = 512
 87 | 
 88 |     # load classification model
 89 |     classify_model = ClassificationModel(model_id="laion/CLIP-ViT-H-14-laion2B-s32B-b79K",
 90 |                                          label_txt='./mini_100.txt', device=device, mode='object')
 91 |     total_5_acc, total_10_acc, total_acc = 0, 0, 0
 92 |     gen_num = 10
 93 |     batch_size = 5
 94 |     batch = int(np.ceil(gen_num / batch_size))
 95 |     attack_goal_num = 0
 96 |     for object_goal in os.listdir(args.output_dir):
 97 |         object_result_path = os.path.join(args.output_dir, object_goal)
 98 |         if not os.path.isdir(object_result_path):
 99 |             continue
100 |         attack_goal_num += 1
101 |         attack_path = os.path.join(object_result_path, "results.txt")
102 |         with open(attack_path, 'r') as f:
103 |             attack_infos = f.readlines()
104 |         # image output dir
105 |         cur_object_output_dir = os.path.join(args.image_output_dir, object_goal)
106 |         if os.path.exists(cur_object_output_dir):
107 |             replace_type = input("The output acc path has existed!!! replace all? (yes/no) ")
108 |             if replace_type == "no":
109 |                 exit()
110 |             elif replace_type == "yes":
111 |                 pass
112 |             else:
113 |                 raise ValueError("Answer must be yes or no")
114 |         os.makedirs(cur_object_output_dir, exist_ok=True)
115 |         output_file = open(os.path.join(cur_object_output_dir, "results.txt"), "w")
116 | 
117 |         # add the target goal
118 |         if object_goal in classify_model.labels:
119 |             object_goal_id = classify_model.labels.index(object_goal.lower())
120 |         else:
121 |             classify_model.add_label_param([object_goal.lower()])
122 |             object_goal_id = classify_model.labels.index(object_goal.lower())
123 | 
124 |         # generate images
125 |         each_5_acc, each_10_acc, each_acc = 0, 0, 0
126 |         init_time = time.time()
127 |         for i in range(1, len(attack_infos)):
128 |             set_random_seed(666)
129 |             label = label_infos[i-1].strip()
130 |             prompt = attack_infos[i].strip()
131 |             write_log(output_file, "Generate {}^th adv prompt: {}, task: {}, label:{}, attack: {}".format(
132 |                 i, prompt, args.task, label, object_goal))
133 |             assert label.replace("-", " - ") in prompt, "The adversarial prompt don't contain the original object," \
134 |                                                              f"current object: {label}, current prompt: {prompt}"
135 |             cur_5_acc, cur_10_acc, cur_avg_acc = 0, 0, 0
136 |             start_time = time.time()
137 |             for j in range(batch):
138 |                 num_images = min(gen_num, (j+1)*batch_size) - j*batch_size
139 |                 guidance_scale = 9
140 |                 num_inference_steps = 25
141 |                 images = pipe(
142 |                     prompt,
143 |                     num_images_per_prompt=num_images,
144 |                     guidance_scale=guidance_scale,
145 |                     num_inference_steps=num_inference_steps,
146 |                     height=image_length,
147 |                     width=image_length,
148 |                     ).images
149 | 
150 | 
151 |                 probs = classify_model.forward(images)
152 |                 acc_num = metric(probs, object_goal_id)
153 |                 cur_avg_acc += acc_num
154 |                 if j == 0 and acc_num > 0:
155 |                     cur_5_acc = 1
156 |                 if acc_num > 0:
157 |                     cur_10_acc = 1
158 |                 for img_num in range(num_images):
159 |                     sign = probs[img_num].argmax(0) == object_goal_id
160 |                     dir_name = prompt.replace(" ", "_")
161 |                     save_image(images[img_num],
162 |                                os.path.join(cur_object_output_dir,
163 |                                             f"{label}/{dir_name}/{sign}_{img_num+j*batch_size}.png"))
164 |             end_time = time.time()
165 |             each_5_acc += cur_5_acc
166 |             each_10_acc += cur_10_acc
167 |             each_acc += cur_avg_acc
168 |             write_log(output_file, f"{label} acc-5 is: {cur_5_acc}, acc-10 is {cur_10_acc}")
169 |             write_log(output_file, "Spent time: {:.3f}s".format(end_time - start_time))
170 |             write_log(output_file, "The avg acc is {:.3f}".format(cur_avg_acc))
171 |         write_log(output_file, f"\nEnd the testing stage of an attack goal: {object_goal}\n")
172 |         write_log(output_file, "acc-5 is {:.3f}%".format(each_5_acc * 100 / (len(attack_infos) - 1)))
173 |         write_log(output_file, "acc-10 is {:.3f}%".format(each_10_acc * 100 / (len(attack_infos) - 1)))
174 |         write_log(output_file, "acc is {:.3f}%".format(each_acc * 100 / gen_num / (len(attack_infos) - 1)))
175 |         finish_time = time.time()
176 |         all_time = finish_time - init_time
177 |         write_log(output_file,
178 |                   "spent time is {}h{}m".format(all_time // 3600, (all_time % 3600) // 60))
179 |         total_5_acc += each_5_acc * 100 / (len(attack_infos) - 1)
180 |         total_10_acc += each_10_acc * 100 / (len(attack_infos) - 1)
181 |         total_acc += each_acc * 100 / gen_num / (len(attack_infos) - 1)
182 |     total_result_path = os.path.join(args.image_output_dir, "results.txt")
183 |     output_file = open(total_result_path, "w")
184 |     write_log(output_file, f"\nEnd the all testing stage\n")
185 |     write_log(output_file, "Final acc-5 is {:.3f}%".format(total_5_acc / attack_goal_num))
186 |     write_log(output_file, "Final acc-10 is {:.3f}%".format(total_10_acc / attack_goal_num))
187 |     write_log(output_file, "Final acc is {:.3f}%".format(total_acc / attack_goal_num))


--------------------------------------------------------------------------------
/test_style_multi.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import argparse
  3 | import torch.nn as nn
  4 | from transformers import CLIPModel, AutoTokenizer, AutoProcessor
  5 | from diffusers import DPMSolverMultistepScheduler, StableDiffusionPipeline
  6 | from classification import ClassificationModel
  7 | import os, json
  8 | import time
  9 | from typing import Any, Mapping
 10 | import numpy as np
 11 | import random
 12 | import pdb
 13 | 
 14 | 
 15 | def read_json(filename: str) -> Mapping[str, Any]:
 16 |     """Returns a Python dict representation of JSON object at input file."""
 17 |     with open(filename) as fp:
 18 |         return json.load(fp)
 19 | 
 20 | 
 21 | def save_image(image, path):
 22 |     dir_path = os.path.dirname(path)
 23 |     os.makedirs(dir_path, exist_ok=True)
 24 |     image.save(path)
 25 | 
 26 | 
 27 | def metric(probs, gt, return_false=False):
 28 |     bs = probs.size(0)
 29 |     max_v, max_index = torch.max(probs, dim=-1)
 30 |     acc = (max_index == gt).sum()
 31 |     if return_false:
 32 |         # pdb.set_trace()
 33 |         false_index = torch.where(max_index != gt)[0]
 34 |         return acc, false_index
 35 |     return acc
 36 | 
 37 | 
 38 | def write_log(file, text):
 39 |     file.write(text + "\n")
 40 |     print(text)
 41 | 
 42 | def set_random_seed(seed=0):
 43 |     torch.manual_seed(seed + 0)
 44 |     torch.cuda.manual_seed(seed + 1)
 45 |     torch.cuda.manual_seed_all(seed + 2)
 46 |     np.random.seed(seed + 3)
 47 |     torch.cuda.manual_seed_all(seed + 4)
 48 |     random.seed(seed + 5)
 49 | 
 50 | if __name__ == "__main__":
 51 |     parser = argparse.ArgumentParser()
 52 |     parser.add_argument('--config_path', type=str, required=True, help='experiment configuration')
 53 | 
 54 |     args = argparse.Namespace()
 55 |     args.__dict__.update(read_json(parser.parse_args().config_path))
 56 |     print("======  Current Parameter =======")
 57 |     for para in args.__dict__:
 58 |         print(para + ': ' + str(args.__dict__[para]))
 59 | 
 60 |     # define the image output dir
 61 |     args.image_output_dir = os.path.join(r"../diffusion_outputs/attack/formal_experiment/",
 62 |                                          args.output_dir.split('formal_experiment/')[-1])
 63 |     print("generated path:{}".format(args.image_output_dir))
 64 |     if os.path.exists(args.image_output_dir):
 65 |         replace_type = input("The image output path has existed, replace all? (yes/no) ")
 66 |         if replace_type == "no":
 67 |             exit()
 68 |         elif replace_type == "yes":
 69 |             pass
 70 |         else:
 71 |             raise ValueError("Answer must be yes or no")
 72 |     os.makedirs(args.image_output_dir, exist_ok=True)
 73 | 
 74 |     # load attack prompt
 75 |     label_infos = ["oil painting", "watercolor", "sketch", "animation", "photorealistic"]
 76 | 
 77 |     # load diffusion model
 78 |     device = "cuda" if torch.cuda.is_available() else "cpu"
 79 |     # stabilityai/stable-diffusion-2-1-base  # runwayml/stable-diffusion-v1-5
 80 |     model_id = "stabilityai/stable-diffusion-2-1-base"
 81 |     pipe = StableDiffusionPipeline.from_pretrained(
 82 |         model_id,
 83 |         torch_dtype=torch.float16,
 84 |         revision="fp16",
 85 |     )
 86 |     pipe = pipe.to(device)
 87 |     image_length = 512
 88 | 
 89 |     # load style classification model
 90 |     style_classify_model = ClassificationModel(model_id="laion/CLIP-ViT-H-14-laion2B-s32B-b79K",
 91 |                                                label_txt=label_infos, device=device, mode='style')
 92 | 
 93 |     # load original prompt
 94 |     with open(args.prompt_path, 'r') as f:
 95 |         ori_prompts = f.readlines()
 96 | 
 97 |     # load the prompt label
 98 |     object_path = "./mini_100.txt"
 99 |     with open(object_path, 'r') as f:
100 |         object_infos = f.readlines()
101 | 
102 |     # load object classification model
103 |     object_classify_model = ClassificationModel(model_id="laion/CLIP-ViT-H-14-laion2B-s32B-b79K",
104 |                                                 label_txt=object_infos, device=device, mode='object')
105 |     total_5_acc_style, total_10_acc_style, total_acc_style = 0, 0, 0
106 |     total_5_acc_obj, total_10_acc_obj, total_acc_obj = 0, 0, 0
107 |     gen_num = 10
108 |     batch_size = 5
109 |     batch = int(np.ceil(gen_num / batch_size))
110 |     attack_goal_num = 0
111 |     for style_goal in os.listdir(args.output_dir):
112 |         print("\n Start to generate images of {}\n".format(style_goal))
113 |         style_result_path = os.path.join(args.output_dir, style_goal)
114 |         if not os.path.isdir(style_result_path):
115 |             continue
116 |         attack_goal_num += 1
117 |         attack_path = os.path.join(style_result_path, "results.txt")
118 |         with open(attack_path, 'r') as f:
119 |             attack_infos = f.readlines()
120 | 
121 |         cur_style_output_dir = os.path.join(args.image_output_dir, style_goal)
122 |         if os.path.exists(cur_style_output_dir):
123 |             replace_type = input("The output acc path has existed!!! replace all? (yes/no) ")
124 |             if replace_type == "no":
125 |                 exit()
126 |             elif replace_type == "yes":
127 |                 pass
128 |             else:
129 |                 raise ValueError("Answer must be yes or no")
130 |         os.makedirs(cur_style_output_dir, exist_ok=True)
131 |         output_file = open(os.path.join(cur_style_output_dir, "results.txt"), "w")
132 | 
133 |         # load style goal
134 |         if style_goal in style_classify_model.labels:
135 |             style_label = style_classify_model.labels.index(style_goal)
136 |         else:
137 |             style_classify_model.add_label_param([style_goal])
138 |             style_label = len(style_classify_model.labels)
139 | 
140 |         # generate images
141 |         each_5_acc_style, each_10_acc_style, each_acc_style = 0, 0, 0
142 |         each_5_acc_obj, each_10_acc_obj, each_acc_obj = 0, 0, 0
143 |         init_time = time.time()
144 |         for i in range(1, len(attack_infos)):
145 |             # set_random_seed(666)  # fix the seed between the original image and the adversarial image
146 |             # ori_prompt = ori_prompts[i-1]
147 |             # write_log(output_file, f"Generate {i-1}^th ori_prompt: {ori_prompt}")
148 |             # for j in range(batch):
149 |             #     num_images = min(100, (j+1)*batch_size) - j*batch_size
150 |             #     guidance_scale = 9
151 |             #     num_inference_steps = 25
152 |             #     images = pipe(
153 |             #         ori_prompt,
154 |             #         num_images_per_prompt=num_images,
155 |             #         guidance_scale=guidance_scale,
156 |             #         num_inference_steps=num_inference_steps,
157 |             #         height=image_length,
158 |             #         width=image_length,
159 |             #     ).images
160 |             #     for img_num in range(num_images):
161 |             #         save_image(images[img_num],
162 |             #                    os.path.join(args.image_output_dir, f"original/{ori_prompt}/{img_num + j * 10}.png"))
163 | 
164 |             set_random_seed(666)  # fix the seed between the original image and the adversarial image
165 |             prompt = attack_infos[i].strip()
166 |             object_label = i - 1
167 |             tar_object = object_infos[object_label].strip().lower()
168 |             assert tar_object.replace("-", " - ") in prompt, "The adversarial prompt don't contain the original object," \
169 |                                                          f"current object: {tar_object}, current prompt: {prompt}"
170 |             cur_5_acc_style, cur_10_acc_style, cur_avg_acc_style = 0, 0, 0
171 |             cur_5_acc_obj, cur_10_acc_obj, cur_avg_acc_obj = 0, 0, 0
172 |             start_time = time.time()
173 |             write_log(output_file, f"Generate {i}^th adv prompt: {prompt}, label: {tar_object}, attack: {style_goal}")
174 |             for j in range(batch):
175 |                 num_images = min(gen_num, (j+1)*batch_size) - j*batch_size
176 |                 guidance_scale = 9
177 |                 num_inference_steps = 25
178 |                 images = pipe(
179 |                     prompt,
180 |                     num_images_per_prompt=num_images,
181 |                     guidance_scale=guidance_scale,
182 |                     num_inference_steps=num_inference_steps,
183 |                     height=image_length,
184 |                     width=image_length,
185 |                 ).images
186 | 
187 |                 # style acc
188 |                 probs_style = style_classify_model.forward(images)
189 |                 acc_num = metric(probs_style, style_label)
190 |                 cur_avg_acc_style += acc_num
191 |                 if (j+1) * batch_size <= 5 and acc_num > 0:
192 |                     cur_5_acc_style = 1
193 |                 if acc_num > 0:
194 |                     cur_10_acc_style = 1
195 | 
196 |                 # obj acc
197 |                 probs_obj = object_classify_model.forward(images)
198 |                 acc_num = metric(probs_obj, object_label)
199 |                 cur_avg_acc_obj += acc_num
200 |                 if (j+1) * batch_size <= 5 and acc_num > 0:
201 |                     cur_5_acc_obj = 1
202 |                 if acc_num > 0:
203 |                     cur_10_acc_obj = 1
204 | 
205 |                 for img_num in range(num_images):
206 |                     sign = probs_style[img_num].argmax(0) == style_label
207 |                     sign &= probs_obj[img_num].argmax(0) == object_label
208 |                     save_image(images[img_num],
209 |                                os.path.join(cur_style_output_dir,
210 |                                 f"{tar_object}/{prompt}/{sign}_{img_num + j * batch_size}.png"))
211 |             end_time = time.time()
212 |             each_5_acc_style += cur_5_acc_style
213 |             each_10_acc_style += cur_10_acc_style
214 |             each_acc_style += cur_avg_acc_style
215 |             each_5_acc_obj += cur_5_acc_obj
216 |             each_10_acc_obj += cur_10_acc_obj
217 |             each_acc_obj += cur_avg_acc_obj
218 |             write_log(output_file,
219 |                       f"{prompt} 5_acc_style is: {cur_5_acc_style}, 10_acc_style is {cur_10_acc_style}, "
220 |                       f"avg_acc_style is {cur_avg_acc_style}")
221 |             write_log(output_file,
222 |                       f"{prompt} 5_acc_obj is: {cur_5_acc_obj}, 10_acc_obj is {cur_10_acc_obj}, "
223 |                       f"avg_acc_obj is {cur_avg_acc_obj}")
224 |             write_log(output_file, "Spent time: {:.3f}s".format(end_time - start_time))
225 |         write_log(output_file, "\nEnd the testing stage\n")
226 |         write_log(output_file, "style acc-5 is {:.3f}%".format(each_5_acc_style * 100 / (len(attack_infos) - 1)))
227 |         write_log(output_file, "style acc-10 is {:.3f}%".format(each_10_acc_style * 100 / (len(attack_infos) - 1)))
228 |         write_log(output_file, "style acc is {:.3f}%".format(each_acc_style * 100 / gen_num / (len(attack_infos) - 1)))
229 |         write_log(output_file, "obj acc-5 is {:.3f}%".format(each_5_acc_obj * 100 / (len(attack_infos) - 1)))
230 |         write_log(output_file, "obj acc-10 is {:.3f}%".format(each_10_acc_obj * 100 / (len(attack_infos) - 1)))
231 |         write_log(output_file, "obj acc is {:.3f}%".format(each_acc_obj * 100 / gen_num / (len(attack_infos) - 1)))
232 |         finish_time = time.time()
233 |         all_time = finish_time - init_time
234 |         write_log(output_file,
235 |                   "spent time is {}h{}m\n".format(
236 |                       all_time // 3600, (all_time % 3600) // 60))
237 |         total_5_acc_style += each_5_acc_style * 100 / (len(attack_infos) - 1)
238 |         total_10_acc_style += each_10_acc_style * 100 / (len(attack_infos) - 1)
239 |         total_acc_style += each_acc_style * 100 / gen_num / (len(attack_infos) - 1)
240 |         total_5_acc_obj += each_5_acc_obj * 100 / (len(attack_infos) - 1)
241 |         total_10_acc_obj += each_10_acc_obj * 100 / (len(attack_infos) - 1)
242 |         total_acc_obj += each_acc_obj * 100 / gen_num / (len(attack_infos) - 1)
243 |     total_result_path = os.path.join(args.image_output_dir, "results.txt")
244 |     output_file = open(total_result_path, "w")
245 |     write_log(output_file, "Finish All Testing")
246 |     write_log(output_file, "Final style acc-5  is {:.3f}%".format(total_5_acc_style / attack_goal_num))
247 |     write_log(output_file, "Final style acc-10 is {:.3f}%".format(total_10_acc_style / attack_goal_num))
248 |     write_log(output_file, "Final style acc is {:.3f}%".format(total_acc_style / attack_goal_num))
249 |     write_log(output_file, "Final obj acc-5 is {:.3f}%".format(total_5_acc_obj / attack_goal_num))
250 |     write_log(output_file, "Final obj acc-10 is {:.3f}%".format(total_10_acc_obj / attack_goal_num))
251 |     write_log(output_file, "Final obj acc is {:.3f}%".format(total_acc_obj / attack_goal_num))


--------------------------------------------------------------------------------