├── 20B_tokenizer.json
├── Blip2RWKV
│   ├── README.md
│   ├── blip2zh_qformer.py
│   ├── configuration_blip2rwkv.py
│   ├── configuration_rwkv.py
│   ├── gen.png
│   ├── modeling_blip2rwkv.py
│   └── modeling_rwkv.py
├── LICENSE
├── Language
│   └── README.md
├── README.md
├── Visual
│   └── README.md
├── app.py
├── assets
│   ├── MiniRWKV-4 Demo1.png
│   ├── MiniRWKV-4 Demo2.png
│   ├── MiniRWKV-4 Demo3.png
│   ├── MiniRWKV-4 Demo4.png
│   ├── MiniRWKV-4 Demo5.png
│   ├── README.md
│   ├── demo.jpg
│   └── gen.png
├── config
│   ├── README.md
│   └── minirwkv4.yaml
├── minirwkv4
│   ├── README.md
│   ├── blipcaption.py
│   ├── blipvqa.py
│   ├── gen.png
│   ├── vitgptcaption.py
│   └── vitvqa.py
└── prompts
    ├── README.md
    ├── cardiogenic.yaml
    ├── exogenous.yaml
    └── operability.yaml

/Blip2RWKV/README.md:
--------------------------------------------------------------------------------
1 | modeling_blip2rwkv.py is the main code; based on BLIP2, it builds the QFormer for RWKV
2 |
3 | The RWKV weights come from https://huggingface.co/StarRing2022/RWKV-4-Raven-3B-v11-zh
4 | Both BERT and RWKV use Chinese-language checkpoints
5 |
6 | For now, this is mostly an image-conditioned RWKV language encoder
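For orientation, the sketch below mirrors the `__main__` test at the bottom of modeling_blip2rwkv.py: build a `Blip2RWKVConfig`, wrap it in `Blip2RWKVConditionalGeneration`, preprocess an image with the LAVIS `blip2_feature_extractor` pipeline, tokenize a caption with the GPT-NeoX tokenizer, and run one forward pass. The local checkpoint directory `RWKV-4-Raven-3B-v11-zh` and the sample image `gen.png` are assumptions about your local setup, and a CUDA device is required because the constructor moves the submodules to GPU.

```python
from PIL import Image
from lavis.models import load_model_and_preprocess
from transformers import GPTNeoXTokenizerFast

from configuration_blip2rwkv import Blip2RWKVConfig
from modeling_blip2rwkv import Blip2RWKVConditionalGeneration

device = "cuda"

# ViT + Q-Former built from the config; the RWKV LM is loaded from ./RWKV-4-Raven-3B-v11-zh
model = Blip2RWKVConditionalGeneration(config=Blip2RWKVConfig())
model.setup_dtype(vision_encoder_dtype="fp32", lm_dtype="fp16")

# Reuse the LAVIS preprocessors to get the pixel tensor the vision encoder expects
_, vis_processors, txt_processors = load_model_and_preprocess(
    name="blip2_feature_extractor", model_type="pretrain", is_eval=True, device=device
)
image = vis_processors["eval"](Image.open("gen.png").convert("RGB")).unsqueeze(0).to(device)

# RWKV-Raven uses the GPT-NeoX tokenizer
tokenizer = GPTNeoXTokenizerFast.from_pretrained("RWKV-4-Raven-3B-v11-zh")
caption = txt_processors["eval"]("一个男孩抱着一只猫,猫咪看起来很享受。")
input_ids = tokenizer.encode(caption, return_tensors="pt").to(device)

# Forward pass: projected image features fill the first num_query_tokens input-embedding slots
outputs = model(pixel_values=image, input_ids=input_ids, labels=input_ids)
print(outputs.loss, outputs.logits.shape)
```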
7 | -------------------------------------------------------------------------------- /Blip2RWKV/blip2zh_qformer.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoTokenizer, AutoConfig, AutoModel 2 | import torch 3 | from torch import nn 4 | import logging 5 | from torch.cuda.amp import autocast as autocast 6 | from lavis.common.registry import registry 7 | from lavis.common.dist_utils import get_rank 8 | 9 | from torch.nn import functional as F 10 | 11 | from lavis.models.base_model import all_gather_with_grad, concat_all_gather 12 | from lavis.models.blip2_models.blip2 import ( 13 | Blip2Base, 14 | compute_sim_matrix, 15 | disabled_train, 16 | ) 17 | from lavis.models.blip2_models.Qformer import BertLMHeadModel 18 | from lavis.models.blip_models.blip_outputs import BlipOutput, BlipOutputFeatures 19 | 20 | 21 | ZH_BERT = "./bert-base-chinese" 22 | 23 | 24 | class Blip2BaseZh(Blip2Base): 25 | @classmethod 26 | def init_tokenizer(cls): 27 | tokenizer = AutoTokenizer.from_pretrained(ZH_BERT) 28 | tokenizer.add_special_tokens({"bos_token": "[DEC]"}) 29 | return tokenizer 30 | 31 | @classmethod 32 | def init_Qformer(cls, num_query_token, vision_width, cross_attention_freq=2): 33 | encoder_config = AutoConfig.from_pretrained(ZH_BERT) 34 | encoder_config.encoder_width = vision_width 35 | # insert cross-attention layer every other block 36 | encoder_config.add_cross_attention = True 37 | encoder_config.cross_attention_freq = cross_attention_freq 38 | encoder_config.query_length = num_query_token 39 | # BertLMHeadModel 40 | Qformer = BertLMHeadModel.from_pretrained( 41 | ZH_BERT, config=encoder_config 42 | ) 43 | query_tokens = nn.Parameter( 44 | torch.zeros(1, num_query_token, encoder_config.hidden_size) 45 | ) 46 | query_tokens.data.normal_(mean=0.0, std=encoder_config.initializer_range) 47 | return Qformer, query_tokens 48 | 49 | 50 | @registry.register_model("blip2zh") 51 | class Blip2ZhQformer(Blip2BaseZh): 52 | """ 53 | BLIP2 first-stage model with Q-former and ViT. 
54 | Supported model types: 55 | - pretrained: pretrained model with vit-g 56 | - pretrain_vitL: pretrained model with vit-large 57 | - coco: fintuned model on coco 58 | Usage: 59 | >>> from lavis.models import load_model 60 | >>> model = load_model("blip2", "pretrain") 61 | """ 62 | 63 | def __init__( 64 | self, 65 | vit_model="eva_clip_g", 66 | img_size=224, 67 | drop_path_rate=0, 68 | use_grad_checkpoint=False, 69 | vit_precision="fp16", 70 | freeze_vit=True, 71 | num_query_token=32, 72 | cross_attention_freq=2, 73 | embed_dim=256, 74 | max_txt_len=32, 75 | ): 76 | super().__init__() 77 | 78 | self.tokenizer = self.init_tokenizer() 79 | 80 | self.visual_encoder, self.ln_vision = self.init_vision_encoder( 81 | vit_model, img_size, drop_path_rate, use_grad_checkpoint, vit_precision 82 | ) 83 | if freeze_vit: 84 | for name, param in self.visual_encoder.named_parameters(): 85 | param.requires_grad = False 86 | self.visual_encoder = self.visual_encoder.eval() 87 | self.visual_encoder.train = disabled_train 88 | logging.info("freeze vision encoder") 89 | self.Qformer, self.query_tokens = self.init_Qformer( 90 | num_query_token, self.visual_encoder.num_features, cross_attention_freq 91 | ) 92 | self.Qformer.resize_token_embeddings(len(self.tokenizer)) 93 | state_dict = self.Qformer.state_dict() 94 | for name, param in self.Qformer.named_parameters(): 95 | if "_query" in name: 96 | key_orig = name.replace("_query", "") 97 | param.data.copy_(state_dict[key_orig]) 98 | 99 | self.vision_proj = nn.Linear(self.Qformer.config.hidden_size, embed_dim) 100 | self.text_proj = nn.Linear(self.Qformer.config.hidden_size, embed_dim) 101 | 102 | self.itm_head = nn.Linear(self.Qformer.config.hidden_size, 2) 103 | 104 | self.temp = nn.Parameter(0.07 * torch.ones([])) 105 | 106 | self.max_txt_len = max_txt_len 107 | 108 | def forward(self, samples): 109 | image = samples["image"] 110 | text = samples["text_input"] 111 | 112 | with torch.no_grad(): 113 | image_embeds = self.ln_vision(self.visual_encoder(image)) 114 | image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to( 115 | image.device 116 | ) 117 | 118 | query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1) 119 | 120 | query_output = self.Qformer.bert( 121 | query_embeds=query_tokens, 122 | encoder_hidden_states=image_embeds, 123 | encoder_attention_mask=image_atts, 124 | use_cache=True, 125 | return_dict=True, 126 | ) 127 | 128 | image_feats = F.normalize( 129 | self.vision_proj(query_output.last_hidden_state), dim=-1 130 | ) 131 | 132 | text_tokens = self.tokenizer( 133 | text, 134 | padding="max_length", 135 | truncation=True, 136 | max_length=self.max_txt_len, 137 | return_tensors="pt", 138 | ).to(image.device) 139 | text_output = self.Qformer.bert( 140 | text_tokens.input_ids, 141 | attention_mask=text_tokens.attention_mask, 142 | return_dict=True, 143 | ) 144 | text_feat = F.normalize( 145 | self.text_proj(text_output.last_hidden_state[:, 0, :]), dim=-1 146 | ) 147 | 148 | ###============== Image-text Contrastive ===================### 149 | image_feats_all = concat_all_gather( 150 | image_feats 151 | ) # [batch_size*num_gpu, num_query_tokens, embed_dim] 152 | text_feat_all = concat_all_gather(text_feat) # [batch_size*num_gpu, embed_dim] 153 | 154 | sim_q2t = torch.matmul( 155 | image_feats.unsqueeze(1), text_feat_all.unsqueeze(-1) 156 | ).squeeze() 157 | # [batch_size, batch_size*num_gpu, num_query_tokens] 158 | 159 | # image-text similarity: aggregate across all query tokens 160 | sim_i2t, _ = sim_q2t.max(-1) 161 | 
sim_i2t = sim_i2t / self.temp 162 | 163 | # text-query similarity: [batch_size, batch_size*num_gpu, num_query_tokens] 164 | sim_t2q = torch.matmul( 165 | text_feat.unsqueeze(1).unsqueeze(1), image_feats_all.permute(0, 2, 1) 166 | ).squeeze() 167 | 168 | # text-image similarity: aggregate across all query tokens 169 | sim_t2i, _ = sim_t2q.max(-1) 170 | sim_t2i = sim_t2i / self.temp # [batch_size, batch_size*num_gpu] 171 | 172 | rank = get_rank() 173 | bs = image.size(0) 174 | targets = torch.linspace(rank * bs, rank * bs + bs - 1, bs, dtype=int).to( 175 | image.device 176 | ) 177 | 178 | loss_itc = ( 179 | F.cross_entropy(sim_i2t, targets, label_smoothing=0.1) 180 | + F.cross_entropy(sim_t2i, targets, label_smoothing=0.1) 181 | ) / 2 182 | 183 | ###============== Image-text Matching ===================### 184 | text_input_ids_world = concat_all_gather(text_tokens.input_ids) 185 | text_attention_mask_world = concat_all_gather(text_tokens.attention_mask) 186 | image_embeds_world = all_gather_with_grad(image_embeds) 187 | with torch.no_grad(): 188 | weights_t2i = F.softmax(sim_t2i, dim=1) + 1e-4 189 | weights_t2i[:, rank * bs : rank * bs + bs].fill_diagonal_(0) 190 | weights_i2t = F.softmax(sim_i2t, dim=1) + 1e-4 191 | weights_i2t[:, rank * bs : rank * bs + bs].fill_diagonal_(0) 192 | 193 | # select a negative image for each text 194 | image_embeds_neg = [] 195 | for b in range(bs): 196 | neg_idx = torch.multinomial(weights_t2i[b], 1).item() 197 | image_embeds_neg.append(image_embeds_world[neg_idx]) 198 | image_embeds_neg = torch.stack(image_embeds_neg, dim=0) 199 | 200 | # select a negative text for each image 201 | text_ids_neg = [] 202 | text_atts_neg = [] 203 | for b in range(bs): 204 | neg_idx = torch.multinomial(weights_i2t[b], 1).item() 205 | text_ids_neg.append(text_input_ids_world[neg_idx]) 206 | text_atts_neg.append(text_attention_mask_world[neg_idx]) 207 | 208 | text_ids_neg = torch.stack(text_ids_neg, dim=0) 209 | text_atts_neg = torch.stack(text_atts_neg, dim=0) 210 | 211 | text_ids_all = torch.cat( 212 | [text_tokens.input_ids, text_tokens.input_ids, text_ids_neg], dim=0 213 | ) # pos, pos, neg 214 | text_atts_all = torch.cat( 215 | [text_tokens.attention_mask, text_tokens.attention_mask, text_atts_neg], 216 | dim=0, 217 | ) 218 | 219 | query_tokens_itm = self.query_tokens.expand(text_ids_all.shape[0], -1, -1) 220 | query_atts_itm = torch.ones(query_tokens_itm.size()[:-1], dtype=torch.long).to( 221 | image.device 222 | ) 223 | attention_mask_all = torch.cat([query_atts_itm, text_atts_all], dim=1) 224 | 225 | image_embeds_all = torch.cat( 226 | [image_embeds, image_embeds_neg, image_embeds], dim=0 227 | ) # pos, neg, pos 228 | image_atts_all = torch.ones(image_embeds_all.size()[:-1], dtype=torch.long).to( 229 | image.device 230 | ) 231 | 232 | output_itm = self.Qformer.bert( 233 | text_ids_all, 234 | query_embeds=query_tokens_itm, 235 | attention_mask=attention_mask_all, 236 | encoder_hidden_states=image_embeds_all, 237 | encoder_attention_mask=image_atts_all, 238 | return_dict=True, 239 | ) 240 | 241 | vl_embeddings = output_itm.last_hidden_state[:, : query_tokens_itm.size(1), :] 242 | vl_output = self.itm_head(vl_embeddings) 243 | logits = vl_output.mean(dim=1) 244 | 245 | itm_labels = torch.cat( 246 | [torch.ones(bs, dtype=torch.long), torch.zeros(2 * bs, dtype=torch.long)], 247 | dim=0, 248 | ).to(image.device) 249 | loss_itm = F.cross_entropy(logits, itm_labels) 250 | 251 | ##================= Image Captioning ========================## 252 | decoder_input_ids = 
text_tokens.input_ids.clone() 253 | decoder_input_ids[:, 0] = self.tokenizer.bos_token_id 254 | labels = decoder_input_ids.masked_fill( 255 | decoder_input_ids == self.tokenizer.pad_token_id, -100 256 | ) 257 | 258 | query_atts = torch.ones(query_tokens.size()[:-1], dtype=torch.long).to( 259 | image.device 260 | ) 261 | attention_mask = torch.cat([query_atts, text_tokens.attention_mask], dim=1) 262 | lm_output = self.Qformer( 263 | decoder_input_ids, 264 | attention_mask=attention_mask, 265 | past_key_values=query_output.past_key_values, 266 | return_dict=True, 267 | labels=labels, 268 | ) 269 | 270 | loss_lm = lm_output.loss 271 | 272 | return BlipOutput( 273 | loss=loss_itc + loss_itm + loss_lm, 274 | loss_itc=loss_itc, 275 | loss_itm=loss_itm, 276 | loss_lm=loss_lm, 277 | ) 278 | 279 | @torch.no_grad() 280 | def generate( 281 | self, 282 | samples, 283 | use_nucleus_sampling=False, 284 | num_beams=3, 285 | max_length=30, 286 | min_length=10, 287 | top_p=0.9, 288 | repetition_penalty=1.0, 289 | ): 290 | """ 291 | Args: 292 | samples (dict): A dictionary containing the following keys: 293 | - image (torch.Tensor): A tensor of shape (batch_size, 3, H, W) 294 | use_nucleus_sampling (bool): Whether to use nucleus sampling. If False, use top-k sampling. 295 | num_beams (int): Number of beams for beam search. 1 means no beam search. 296 | max_length (int): The maximum length of the sequence to be generated. 297 | min_length (int): The minimum length of the sequence to be generated. 298 | top_p (float): The cumulative probability for nucleus sampling. 299 | repetition_penalty (float): The parameter for repetition penalty. 1.0 means no penalty. 300 | num_captions (int): Number of captions to be generated for each image. 301 | Returns: 302 | captions (list): A list of strings of length batch_size * num_captions. 
303 | """ 304 | image = samples["image"] 305 | image_embeds = self.ln_vision(self.visual_encoder(image)) 306 | 307 | if not use_nucleus_sampling: 308 | image_embeds = image_embeds.repeat_interleave(num_beams, dim=0) 309 | else: 310 | num_beams = 1 311 | image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to( 312 | image.device 313 | ) 314 | 315 | model_kwargs = { 316 | "encoder_hidden_states": image_embeds, 317 | "encoder_attention_mask": image_atts, 318 | } 319 | 320 | input_ids = ( 321 | torch.LongTensor(image.size(0), 1) 322 | .fill_(self.tokenizer.bos_token_id) 323 | .to(image.device) 324 | ) 325 | query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1) 326 | 327 | outputs = self.Qformer.generate( 328 | input_ids=input_ids, 329 | query_embeds=query_tokens, 330 | max_length=max_length, 331 | min_length=min_length, 332 | num_beams=num_beams, 333 | do_sample=use_nucleus_sampling, 334 | top_p=top_p, 335 | eos_token_id=self.tokenizer.sep_token_id, 336 | pad_token_id=self.tokenizer.pad_token_id, 337 | **model_kwargs 338 | ) 339 | captions = self.tokenizer.batch_decode(outputs, skip_special_tokens=True) 340 | return captions 341 | 342 | def forward_image(self, image): 343 | image_embeds = self.ln_vision(self.visual_encoder(image)) 344 | image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to( 345 | image.device 346 | ) 347 | 348 | query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1) 349 | 350 | query_output = self.Qformer.bert( 351 | query_embeds=query_tokens, 352 | encoder_hidden_states=image_embeds, 353 | encoder_attention_mask=image_atts, 354 | return_dict=True, 355 | ) 356 | return query_output.last_hidden_state, image_embeds 357 | 358 | def forward_text(self, text_tokens): 359 | text_output = self.Qformer.bert( 360 | text_tokens.input_ids, 361 | attention_mask=text_tokens.attention_mask, 362 | return_dict=True, 363 | ) 364 | return text_output.last_hidden_state[:, 0, :] 365 | 366 | def compute_itm(self, image_inputs, text_ids, text_atts): 367 | image_atts = torch.ones(image_inputs.size()[:-1], dtype=torch.long).to( 368 | image_inputs.device 369 | ) 370 | query_tokens = self.query_tokens.expand(image_inputs.shape[0], -1, -1) 371 | query_atts = torch.ones(query_tokens.size()[:-1], dtype=torch.long).to( 372 | image_inputs.device 373 | ) 374 | attention_mask = torch.cat([query_atts, text_atts], dim=1) 375 | output_itm = self.Qformer.bert( 376 | text_ids, 377 | query_embeds=query_tokens, 378 | attention_mask=attention_mask, 379 | encoder_hidden_states=image_inputs, 380 | encoder_attention_mask=image_atts, 381 | return_dict=True, 382 | ) 383 | vl_embeddings = output_itm.last_hidden_state[:, : query_tokens.size(1), :] 384 | itm_logit = self.itm_head(vl_embeddings) 385 | itm_logit = itm_logit[:, :, 1].mean(dim=1) 386 | return itm_logit 387 | 388 | @torch.no_grad() 389 | def extract_features(self, samples, mode="multimodal"): 390 | """ 391 | Extract features for multimodal or unimodal samples. 392 | Args: 393 | samples (dict): A dictionary of samples, containing the following keys: 394 | - image (torch.Tensor): A tensor of shape (B, C, H, W) containing the image. 395 | Raw images should be preprocessed before being passed to feature extractor. 396 | - text_input (list): A list of strings containing the text, length B. 397 | mode (str): The mode of feature extraction. Can be either "multimodal", "text" or "image". 
398 | If "multimodal", return image features and multimodal features; 399 | if "text", return text features; 400 | if "image", return image features. 401 | Default: "multimodal". 402 | Returns: 403 | BlipOutputFeatures: A BlipOutputFeatures object containing the features. 404 | See lavis/models/blip_models/blip_outputs.py for more details. 405 | """ 406 | image = samples.get("image") 407 | caption = samples.get("text_input") 408 | 409 | # assert mode is one of "image", "text", "multimodal" 410 | assert mode in [ 411 | "image", 412 | "text", 413 | "multimodal", 414 | ], "mode must be one of 'image', 'text', 'multimodal'" 415 | 416 | # initalize output 417 | image_embeds, text_embeds, multimodal_embeds = None, None, None 418 | image_features, text_features = None, None 419 | 420 | if mode == "image": 421 | assert ( 422 | image is not None 423 | ), "Image is not provided for mode 'image' or 'multimodal'" 424 | # return query features 425 | with self.maybe_autocast(): 426 | image_embeds_frozen = self.ln_vision(self.visual_encoder(image)) 427 | image_embeds_frozen = image_embeds_frozen.float() 428 | image_atts = torch.ones( 429 | image_embeds_frozen.size()[:-1], dtype=torch.long 430 | ).to(self.device) 431 | query_tokens = self.query_tokens.expand( 432 | image_embeds_frozen.shape[0], -1, -1 433 | ) 434 | 435 | query_output = self.Qformer.bert( 436 | query_embeds=query_tokens, 437 | encoder_hidden_states=image_embeds_frozen, 438 | encoder_attention_mask=image_atts, 439 | return_dict=True, 440 | ) 441 | image_embeds = query_output.last_hidden_state 442 | image_features = F.normalize(self.vision_proj(image_embeds), dim=-1) 443 | 444 | elif mode == "text": 445 | assert ( 446 | caption is not None 447 | ), "text input is None for mode 'text' or 'multimodal'" 448 | 449 | # return text features 450 | text = self.tokenizer(caption, return_tensors="pt", padding=True).to( 451 | self.device 452 | ) 453 | 454 | text_output = self.Qformer.bert( 455 | text.input_ids, 456 | attention_mask=text.attention_mask, 457 | return_dict=True, 458 | ) 459 | text_embeds = text_output.last_hidden_state 460 | text_features = self.text_proj(text_embeds) 461 | text_features = F.normalize(text_features, dim=-1) 462 | 463 | elif mode == "multimodal": 464 | # return multimodel query features 465 | with self.maybe_autocast(): 466 | image_embeds_frozen = self.ln_vision(self.visual_encoder(image)) 467 | image_embeds_frozen = image_embeds_frozen.float() 468 | image_atts = torch.ones( 469 | image_embeds_frozen.size()[:-1], dtype=torch.long 470 | ).to(self.device) 471 | query_tokens = self.query_tokens.expand( 472 | image_embeds_frozen.shape[0], -1, -1 473 | ) 474 | query_atts = torch.ones(query_tokens.size()[:-1], dtype=torch.long).to( 475 | self.device 476 | ) 477 | 478 | text = self.tokenizer(caption, return_tensors="pt", padding=True).to( 479 | self.device 480 | ) 481 | attention_mask = torch.cat([query_atts, text.attention_mask], dim=1) 482 | 483 | output = self.Qformer.bert( 484 | text.input_ids, 485 | query_embeds=query_tokens, 486 | attention_mask=attention_mask, 487 | encoder_hidden_states=image_embeds_frozen, 488 | encoder_attention_mask=image_atts, 489 | return_dict=True, 490 | ) 491 | 492 | multimodal_embeds = output.last_hidden_state[:, : query_tokens.size(1), :] 493 | 494 | return BlipOutputFeatures( 495 | image_embeds=image_embeds, 496 | image_embeds_proj=image_features, 497 | text_embeds=text_embeds, 498 | text_embeds_proj=text_features, 499 | multimodal_embeds=multimodal_embeds, 500 | ) 501 | 502 | @classmethod 503 | 
def from_config(cls, cfg): 504 | vit_model = cfg.get("vit_model", "eva_clip_g") 505 | img_size = cfg.get("image_size") 506 | num_query_token = cfg.get("num_query_token") 507 | cross_attention_freq = cfg.get("cross_attention_freq", 2) 508 | 509 | drop_path_rate = cfg.get("drop_path_rate", 0) 510 | use_grad_checkpoint = cfg.get("use_grad_checkpoint", False) 511 | vit_precision = cfg.get("vit_precision", "fp16") 512 | freeze_vit = cfg.get("freeze_vit", True) 513 | 514 | max_txt_len = cfg.get("max_txt_len", 32) 515 | 516 | model = cls( 517 | vit_model=vit_model, 518 | img_size=img_size, 519 | drop_path_rate=drop_path_rate, 520 | use_grad_checkpoint=use_grad_checkpoint, 521 | vit_precision=vit_precision, 522 | freeze_vit=freeze_vit, 523 | num_query_token=num_query_token, 524 | cross_attention_freq=cross_attention_freq, 525 | max_txt_len=max_txt_len, 526 | ) 527 | 528 | load_finetuned = cfg.get("load_finetuned", True) 529 | load_pretrained = cfg.get("load_pretrained", True) 530 | if load_finetuned or load_pretrained: 531 | if load_pretrained: 532 | logging.info("Load pretrained from {}".format(cfg["pretrained"])) 533 | else: 534 | logging.info("Load finetuned from {}".format(cfg["fintuned"])) 535 | model.load_checkpoint_from_config(cfg) 536 | else: 537 | logging.info("Learning from scratch") 538 | 539 | return model 540 | 541 | def compute_sim_matrix(self, data_loader, task_cfg): 542 | """ 543 | Compute similarity i2t, t2i matrix for the given data loader. 544 | """ 545 | k_test = task_cfg.k_test 546 | 547 | return compute_sim_matrix(model=self, data_loader=data_loader, k_test=k_test) 548 | 549 | 550 | if __name__ == "__main__": 551 | blip2baseZh = Blip2BaseZh() 552 | 553 | tokenizer = blip2baseZh.init_tokenizer() 554 | #print(tokenizer) 555 | 556 | Qformer, query_tokens = blip2baseZh.init_Qformer(num_query_token=1024,vision_width=768) 557 | #print(Qformer) 558 | #print(query_tokens) 559 | 560 | test_sentence = '你好,介绍下你自己.' 561 | #testinput = tokenizer(test_sentence)['input_ids'] 562 | #print(testinput) 563 | 564 | text = tokenizer(test_sentence, return_tensors="pt", padding=True) 565 | blip2Zhqformer = Blip2ZhQformer() 566 | print(blip2Zhqformer.forward_text(text)) -------------------------------------------------------------------------------- /Blip2RWKV/configuration_blip2rwkv.py: -------------------------------------------------------------------------------- 1 | from transformers import ( 2 | PretrainedConfig, 3 | Blip2VisionConfig, Blip2QFormerConfig 4 | ) 5 | from configuration_rwkv import RwkvConfig 6 | 7 | import copy 8 | from transformers.configuration_utils import PretrainedConfig 9 | from transformers.utils import logging 10 | 11 | logger = logging.get_logger(__name__) 12 | 13 | 14 | 15 | class Blip2RWKVConfig(PretrainedConfig): 16 | """Mainly based on Blip2Config 17 | 18 | Args: 19 | PretrainedConfig (_type_): _description_ 20 | """ 21 | is_composition = True 22 | 23 | def __init__(self, vision_config=None, qformer_config=None, text_config=None, num_query_tokens=27, **kwargs): 24 | super().__init__(**kwargs) 25 | 26 | if vision_config is None: 27 | vision_config = {} 28 | logger.info("vision_config is None. initializing the Blip2VisionConfig with default values.") 29 | 30 | if qformer_config is None: 31 | qformer_config = {} 32 | logger.info("qformer_config is None. Initializing the Blip2QFormerConfig with default values.") 33 | 34 | if text_config is None: 35 | text_config = {} 36 | logger.info("text_config is None. 
Initializing the text config with default values (`OPTConfig`).") 37 | 38 | self.vision_config = Blip2VisionConfig(**vision_config) 39 | self.qformer_config = Blip2QFormerConfig(**qformer_config) 40 | # text_model_type = text_config["model_type"] if "model_type" in text_config else "opt" 41 | # self.text_config = CONFIG_MAPPING[text_model_type](**text_config) 42 | self.text_config = RwkvConfig(**text_config) 43 | 44 | # self.tie_word_embeddings = self.text_config.tie_word_embeddings 45 | self.tie_word_embeddings = False # I don't know what this is 46 | # self.is_encoder_decoder = self.text_config.is_encoder_decoder 47 | self.is_encoder_decoder = True # chatglm is an encoder-decoder model 48 | 49 | self.num_query_tokens = num_query_tokens 50 | self.qformer_config.encoder_hidden_size = self.vision_config.hidden_size 51 | # self.use_decoder_only_language_model = self.text_config.model_type in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES 52 | self.use_decoder_only_language_model = True # chatglm has no encoder 53 | self.initializer_factor = 1.0 54 | self.initializer_range = 0.02 55 | 56 | @classmethod 57 | def from_vision_qformer_text_configs( 58 | cls, 59 | vision_config: Blip2VisionConfig, 60 | qformer_config: Blip2QFormerConfig, 61 | text_config: PretrainedConfig, 62 | **kwargs, 63 | ): 64 | r""" 65 | Instantiate a [`Blip2Config`] (or a derived class) from a BLIP-2 vision model, Q-Former and language model 66 | configurations. 67 | 68 | Returns: 69 | [`Blip2Config`]: An instance of a configuration object 70 | """ 71 | 72 | return cls( 73 | vision_config=vision_config.to_dict(), 74 | qformer_config=qformer_config.to_dict(), 75 | text_config=text_config.to_dict(), 76 | **kwargs, 77 | ) 78 | 79 | def to_dict(self): 80 | """ 81 | Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`]. 82 | 83 | Returns: 84 | `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance, 85 | """ 86 | output = copy.deepcopy(self.__dict__) 87 | output["vision_config"] = self.vision_config.to_dict() 88 | output["qformer_config"] = self.qformer_config.to_dict() 89 | output["text_config"] = self.text_config.to_dict() 90 | output["model_type"] = self.__class__.model_type 91 | return output 92 | 93 | if __name__ == "__main__": 94 | blip2rwkvconfig = Blip2RWKVConfig() 95 | print(blip2rwkvconfig) 96 | -------------------------------------------------------------------------------- /Blip2RWKV/configuration_rwkv.py: -------------------------------------------------------------------------------- 1 | from transformers import PretrainedConfig,RwkvConfig 2 | 3 | 4 | class RwkvConfig(PretrainedConfig): 5 | """ 6 | This is the configuration class to store the configuration of a [`RwkvModel`]. It is used to instantiate a RWKV 7 | model according to the specified arguments, defining the model architecture. Instantiating a configuration with the 8 | defaults will yield a similar configuration to that of the RWVK-4 9 | [RWKV/rwkv-4-169m-pile](https://huggingface.co/RWKV/rwkv-4-169m-pile) architecture. 10 | 11 | Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the 12 | documentation from [`PretrainedConfig`] for more information. 13 | 14 | 15 | Args: 16 | vocab_size (`int`, *optional*, defaults to 50277): 17 | Vocabulary size of the RWKV model. Defines the number of different tokens that can be represented by the 18 | `inputs_ids` passed when calling [`RwkvModel`]. 
19 | context_length (`int`, *optional*, defaults to 1024): 20 | The maximum sequence length that this model can be be used with in a single forward (using it in RNN mode 21 | lets use any sequence length). 22 | hidden_size (`int`, *optional*, defaults to 4096): 23 | Dimensionality of the embeddings and hidden states. 24 | num_hidden_layers (`int`, *optional*, defaults to 32): 25 | Number of hidden layers in the model. 26 | attention_hidden_size (`int`, *optional*): 27 | Dimensionality of the attention hidden states. Will default to `hidden_size` if unset. 28 | intermediate_size (`int`, *optional*): 29 | Dimensionality of the inner feed-forward layers. Will default to 4 times `hidden_size` if unset. 30 | layer_norm_eps (`float`, *optional*, defaults to 1e-5): 31 | The epsilon to use in the layer normalization layers. 32 | bos_token_id (`int`, *optional*, defaults to 0): 33 | The id of the beginning of sentence token in the vocabulary. Defaults to 0 as RWKV uses the same tokenizer 34 | as GPTNeoX. 35 | eos_token_id (`int`, *optional*, defaults to 0): 36 | The id of the end of sentence token in the vocabulary. Defaults to 0 as RWKV uses the same tokenizer as 37 | GPTNeoX. 38 | rescale_every (`int`, *optional*, default to 6): 39 | At inference, the hidden states (and weights of the correponding output layers) are divided by 2 every 40 | `rescale_every` layer. If set to 0 or a negative number, no rescale is done. 41 | tie_word_embeddings (`bool`, *optional*, defaults to `False`): 42 | Whether or not to tie the word embeddings with the input token embeddings. 43 | use_cache (`bool`, *optional*, defaults to `True`): 44 | Whether or not the model should return the last state. 45 | 46 | 47 | Example: 48 | 49 | ```python 50 | >>> from transformers import RwkvConfig, RwkvModel 51 | 52 | >>> # Initializing a Rwkv configuration 53 | >>> configuration = RwkvConfig() 54 | 55 | >>> # Initializing a model (with random weights) from the configuration 56 | >>> model = RwkvModel(configuration) 57 | 58 | >>> # Accessing the model configuration 59 | >>> configuration = model.config 60 | ```""" 61 | 62 | model_type = "rwkv" 63 | attribute_map = {"max_position_embeddings": "context_length"} 64 | 65 | def __init__( 66 | self, 67 | vocab_size=50277, 68 | context_length=1024, 69 | hidden_size=2560, 70 | num_hidden_layers=32, 71 | attention_hidden_size=2560, 72 | intermediate_size=10240, 73 | layer_norm_epsilon=1e-5, 74 | bos_token_id=0, 75 | eos_token_id=0, 76 | rescale_every=6, 77 | tie_word_embeddings=False, 78 | use_cache=True, 79 | **kwargs, 80 | ): 81 | self.vocab_size = vocab_size 82 | self.context_length = context_length 83 | self.hidden_size = hidden_size 84 | self.num_hidden_layers = num_hidden_layers 85 | self.attention_hidden_size = attention_hidden_size if attention_hidden_size is not None else hidden_size 86 | self.intermediate_size = intermediate_size if intermediate_size is not None else 4 * hidden_size 87 | self.layer_norm_epsilon = layer_norm_epsilon 88 | self.rescale_every = rescale_every 89 | self.use_cache = use_cache 90 | 91 | self.bos_token_id = bos_token_id 92 | self.eos_token_id = eos_token_id 93 | 94 | super().__init__( 95 | tie_word_embeddings=tie_word_embeddings, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs 96 | ) 97 | 98 | if __name__ == "__main__": 99 | rwkvconfig = RwkvConfig() 100 | print(rwkvconfig) -------------------------------------------------------------------------------- /Blip2RWKV/gen.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/StarRing2022/MiniRWKV-4/f444b49dd9f7fb699e7806d8478cd8c9f9ea926b/Blip2RWKV/gen.png -------------------------------------------------------------------------------- /Blip2RWKV/modeling_blip2rwkv.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import os 3 | from typing import Callable, List, Optional, Tuple, Union 4 | import numpy as np 5 | import torch 6 | from torch.nn import CrossEntropyLoss 7 | from torch.nn.utils.rnn import pad_sequence 8 | import warnings 9 | from torch import Tensor, nn 10 | 11 | from transformers import ( 12 | PreTrainedModel, 13 | PreTrainedTokenizer, 14 | Blip2VisionModel, 15 | Blip2QFormerModel, 16 | Blip2Model, 17 | Blip2PreTrainedModel, 18 | Blip2ForConditionalGeneration, 19 | GenerationConfig, 20 | ) 21 | from transformers.models.blip_2.modeling_blip_2 import ( 22 | Blip2ForConditionalGenerationModelOutput, 23 | ) 24 | from transformers.utils import logging 25 | from transformers.generation.utils import LogitsProcessorList, StoppingCriteriaList 26 | 27 | from modeling_rwkv import ( 28 | RwkvForCausalLM 29 | ) 30 | from configuration_blip2rwkv import Blip2RWKVConfig 31 | 32 | 33 | logger = logging.get_logger(__name__) 34 | 35 | 36 | class Blip2RWKVConditionalGeneration(Blip2ForConditionalGeneration): 37 | config_class = Blip2RWKVConfig 38 | 39 | def __init__(self, config: Blip2RWKVConfig): 40 | Blip2PreTrainedModel.__init__(self, config) 41 | # NOTE: we only initialize Blip2PreTrainedModel 42 | # directly call super().__init__() will cause error since ChatGLM cannot be found by AutoModel 43 | 44 | self.vision_model = Blip2VisionModel(config.vision_config).to("cuda") 45 | 46 | self.query_tokens = nn.Parameter( 47 | torch.zeros(1, config.num_query_tokens, config.qformer_config.hidden_size).to("cuda") 48 | ) 49 | self.qformer = Blip2QFormerModel(config.qformer_config).to("cuda") 50 | 51 | self.language_projection = nn.Linear( 52 | config.qformer_config.hidden_size, config.text_config.hidden_size 53 | ).to("cuda") 54 | #self.language_model = RwkvForCausalLM(config.text_config) 55 | self.language_model = RwkvForCausalLM.from_pretrained("RWKV-4-Raven-3B-v11-zh",device_map='auto').to("cuda") 56 | #print(self.language_model ) 57 | 58 | # Initialize weights and apply final processing 59 | # self.post_init() 60 | 61 | def setup_dtype(self, vision_encoder_dtype: str = "fp32", lm_dtype: str = "fp16"): 62 | if vision_encoder_dtype == "fp32": 63 | self.vision_model = self.vision_model.float().cuda() 64 | elif vision_encoder_dtype == "fp16": 65 | self.vision_model = self.vision_model.half().cuda() 66 | else: 67 | raise NotImplementedError( 68 | f"Unsupported vision_encoder_dtype: {vision_encoder_dtype}" 69 | ) 70 | 71 | if lm_dtype == "fp32": 72 | self.language_model = self.language_model.float() 73 | elif lm_dtype == "fp16": 74 | self.language_model = self.language_model.half() 75 | elif lm_dtype == "int4": 76 | self.language_model = self.language_model.half().quantize(4) 77 | elif lm_dtype == "int8": 78 | self.language_model = self.language_model.half().quantize(8) 79 | else: 80 | raise NotImplementedError(f"Unsupported lm_dtype: {lm_dtype}") 81 | 82 | def forward( 83 | self, 84 | pixel_values: torch.FloatTensor, 85 | input_ids: torch.FloatTensor, 86 | image_slot_offset: Optional[torch.LongTensor] = None, 87 | attention_mask: Optional[torch.LongTensor] = None, 88 | output_attentions: Optional[bool] = None, 
89 | output_hidden_states: Optional[bool] = None, 90 | labels: Optional[torch.LongTensor] = None, 91 | return_dict: Optional[bool] = None, 92 | ) -> Union[Tuple, Blip2ForConditionalGenerationModelOutput]: 93 | """_summary_ 94 | 95 | Args: 96 | pixel_values (torch.FloatTensor): _description_ 97 | input_ids (torch.FloatTensor): input_ids[:, :num_query_tokens] should be filled with tokenizer.unk_token_id 98 | image_slot_offset (Optional[torch.LongTensor], optional): if not set, all vtokens are placed as prefix (image_slot_offset = torch.zeros(bsz)). Defaults to None. 99 | attention_mask (Optional[torch.LongTensor], optional): _description_. Defaults to None. 100 | output_attentions (Optional[bool], optional): _description_. Defaults to None. 101 | output_hidden_states (Optional[bool], optional): _description_. Defaults to None. 102 | labels (Optional[torch.LongTensor], optional): _description_. Defaults to None. 103 | return_dict (Optional[bool], optional): _description_. Defaults to None. 104 | 105 | Returns: 106 | Union[Tuple, Blip2ForConditionalGenerationModelOutput]: _description_ 107 | """ 108 | return_dict = ( 109 | return_dict if return_dict is not None else self.config.use_return_dict 110 | ) 111 | 112 | # step 1: forward the images through the vision encoder, 113 | # to get image embeddings of shape (batch_size, seq_len, hidden_size) 114 | vision_outputs = self.vision_model( 115 | pixel_values=pixel_values, 116 | output_attentions=output_attentions, 117 | output_hidden_states=output_hidden_states, 118 | return_dict=return_dict, 119 | ) 120 | image_embeds = vision_outputs[0] 121 | 122 | # step 2: forward the query tokens through the QFormer, using the image embeddings for cross-attention 123 | image_attention_mask = torch.ones( 124 | image_embeds.size()[:-1], dtype=torch.long, device=image_embeds.device 125 | ) 126 | 127 | query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1) 128 | query_outputs = self.qformer( 129 | query_embeds=query_tokens, 130 | encoder_hidden_states=image_embeds, 131 | encoder_attention_mask=image_attention_mask, 132 | output_attentions=output_attentions, 133 | output_hidden_states=output_hidden_states, 134 | return_dict=return_dict, 135 | ) 136 | query_output = query_outputs[0] 137 | 138 | # step 3: use the language model, conditioned on the query outputs and the prompt 139 | # 关键步骤,将图片进行embedding编码,然后送入LM 140 | language_model_inputs = self.language_projection(query_output) 141 | inputs_embeds = self.language_model.get_input_embeddings()(input_ids) 142 | #print(inputs_embeds.shape) #[1,27,2560] 143 | if image_slot_offset is None: 144 | # image as prefix 145 | # update data to avoid inplace operation of leaf Variable 146 | inputs_embeds.data[ 147 | :, : self.config.num_query_tokens, : # num_query_tokens = 27 148 | ] = language_model_inputs 149 | else: 150 | for i, offset in enumerate(image_slot_offset): 151 | inputs_embeds.data[ 152 | i, offset : offset + self.config.num_query_tokens, : 153 | ] = language_model_inputs[i] 154 | 155 | outputs = self.language_model( 156 | input_ids=input_ids, 157 | inputs_embeds=inputs_embeds, 158 | attention_mask=attention_mask, 159 | output_attentions=output_attentions, 160 | output_hidden_states=output_hidden_states, 161 | return_dict=return_dict, 162 | ) 163 | logits = outputs.logits if return_dict else outputs[0] 164 | loss = None 165 | # we compute the loss here since we need to take into account the sequence length of the query embeds 166 | if labels is not None: 167 | logits = logits[:, -labels.size(1) :, :] 
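# The logits are trimmed to the last labels.size(1) positions so that, when labels only cover
# the text portion of the sequence, the visual-prefix positions (the first num_query_tokens
# inputs_embeds slots overwritten with projected Q-Former features) do not contribute to the
# shift-by-one causal-LM loss computed below.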
168 | # Shift so that tokens < n predict n 169 | shift_logits = logits[..., :-1, :].contiguous() 170 | shift_labels = labels[..., 1:].contiguous().to(logits.device) 171 | 172 | # Flatten the tokens 173 | loss_fct = CrossEntropyLoss(reduction="mean") 174 | 175 | loss = loss_fct( 176 | shift_logits.view(-1, self.config.text_config.vocab_size), 177 | shift_labels.view(-1), 178 | ) 179 | 180 | if not return_dict: 181 | output = (logits, vision_outputs, query_outputs, outputs) 182 | return ((loss,) + output) if loss is not None else output 183 | 184 | return Blip2ForConditionalGenerationModelOutput( 185 | loss=loss, 186 | logits=logits, 187 | vision_outputs=vision_outputs, 188 | qformer_outputs=query_outputs, 189 | language_model_outputs=outputs, 190 | ) 191 | 192 | if __name__ == "__main__": 193 | #Blip2RWKV测试 194 | blip2rwkvconfig = Blip2RWKVConfig() 195 | blip2RWKVConditionalGeneration = Blip2RWKVConditionalGeneration(config=blip2rwkvconfig) 196 | blip2RWKVConditionalGeneration.setup_dtype() 197 | 198 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 199 | #device = "cpu" 200 | 201 | from PIL import Image 202 | from transformers import BlipProcessor 203 | from lavis.models import load_model_and_preprocess 204 | 205 | raw_image = Image.open('gen.png').convert('RGB') 206 | caption = "一个男孩抱着一只猫,猫咪看起来很享受。" 207 | 208 | model, vis_processors, txt_processors = load_model_and_preprocess(name="blip2_feature_extractor", model_type="pretrain", is_eval=True, device=device) 209 | model = model.to(device) 210 | 211 | image = vis_processors["eval"](raw_image).unsqueeze(0).to(device) 212 | text_input = txt_processors["eval"](caption) 213 | sample = {"image": image, "text_input": [text_input]} 214 | 215 | #print(image.shape) 216 | #print(text_input) 217 | 218 | from transformers import GPTNeoXTokenizerFast 219 | tokenizer = GPTNeoXTokenizerFast.from_pretrained("RWKV-4-Raven-3B-v11-zh") 220 | text_input = tokenizer.encode(text_input, return_tensors='pt') 221 | blip2rwkvoutput = blip2RWKVConditionalGeneration.forward(pixel_values=image.to(device),input_ids=text_input.to(device),labels=text_input.to(device)) 222 | #print(blip2rwkvoutput) -------------------------------------------------------------------------------- /Blip2RWKV/modeling_rwkv.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2023 Bo Peng and HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | """PyTorch RWKV model.""" 17 | 18 | import math 19 | from dataclasses import dataclass 20 | from pathlib import Path 21 | from typing import List, Optional, Tuple, Union 22 | 23 | import torch 24 | import torch.utils.checkpoint 25 | from torch import nn 26 | from torch.nn import CrossEntropyLoss 27 | 28 | from transformers import PreTrainedModel 29 | from transformers.utils import ( 30 | ModelOutput, 31 | add_code_sample_docstrings, 32 | add_start_docstrings, 33 | add_start_docstrings_to_model_forward, 34 | is_ninja_available, 35 | is_torch_cuda_available, 36 | logging, 37 | ) 38 | from configuration_rwkv import RwkvConfig 39 | 40 | 41 | logger = logging.get_logger(__name__) 42 | 43 | _CHECKPOINT_FOR_DOC = "RWKV/rwkv-4-169m-pile" 44 | _CONFIG_FOR_DOC = "RwkvConfig" 45 | 46 | RWKV_PRETRAINED_MODEL_ARCHIVE_LIST = [ 47 | "RWKV/rwkv-4-169m-pile", 48 | "RWKV/rwkv-4-430m-pile", 49 | "RWKV/rwkv-4-1b5-pile", 50 | "RWKV/rwkv-4-3b-pile", 51 | "RWKV/rwkv-4-7b-pile", 52 | "RWKV/rwkv-4-14b-pile", 53 | "RWKV/rwkv-raven-1b5", 54 | "RWKV/rwkv-raven-3b", 55 | "RWKV/rwkv-raven-7b", 56 | "RWKV/rwkv-raven-14b", 57 | # See all RWKV models at https://huggingface.co/models?filter=rwkv 58 | ] 59 | 60 | 61 | rwkv_cuda_kernel = None 62 | 63 | 64 | def load_wkv_cuda_kernel(context_length): 65 | from torch.utils.cpp_extension import load as load_kernel 66 | 67 | global rwkv_cuda_kernel 68 | 69 | kernel_folder = Path(__file__).resolve().parent.parent.parent / "kernels" / "rwkv" 70 | cuda_kernel_files = [kernel_folder / f for f in ["wkv_op.cpp", "wkv_cuda.cu", "wkv_cuda_bf16.cu"]] 71 | 72 | # Only load the kernel if it's not been loaded yet or if we changed the context length 73 | if rwkv_cuda_kernel is not None and rwkv_cuda_kernel.max_seq_length == context_length: 74 | return 75 | 76 | logger.info(f"Loading CUDA kernel for RWKV at context length of {context_length}.") 77 | 78 | flags = [ 79 | "-res-usage", 80 | "--maxrregcount 60", 81 | "--use_fast_math", 82 | "-O3", 83 | "-Xptxas -O3", 84 | "--extra-device-vectorization", 85 | f"-DTmax={context_length}", 86 | ] 87 | rwkv_cuda_kernel = load_kernel( 88 | name=f"wkv_{context_length}", 89 | sources=cuda_kernel_files, 90 | verbose=(logging.get_verbosity() == logging.DEBUG), 91 | extra_cuda_cflags=flags, 92 | ) 93 | rwkv_cuda_kernel.max_seq_length = context_length 94 | 95 | 96 | class RwkvLinearAttention(torch.autograd.Function): 97 | @staticmethod 98 | def forward(ctx, time_decay, time_first, key, value, state=None, return_state=False): 99 | batch_size, seq_len, hidden_size = key.size() 100 | if seq_len > rwkv_cuda_kernel.max_seq_length: 101 | raise ValueError( 102 | f"Cannot process a batch with {seq_len} tokens at the same time, use a maximum of " 103 | f"{rwkv_cuda_kernel.max_seq_length} with this model." 104 | ) 105 | if batch_size * hidden_size % min(hidden_size, 32) != 0: 106 | raise ValueError( 107 | f"The product of batch size ({batch_size}) and hidden size ({hidden_size}) needs to be a round " 108 | f"multiple of {min(hidden_size, 32)}." 
109 | ) 110 | 111 | ctx.input_dtype = key.dtype 112 | 113 | if ( 114 | time_decay.device.type != "cuda" 115 | or time_first.device.type != "cuda" 116 | or key.device.type != "cuda" 117 | or value.device.type != "cuda" 118 | ): 119 | raise ValueError("Calling the CUDA kernel for wkv attention requires all tensors to be on CUDA devices.") 120 | 121 | time_decay = -torch.exp(time_decay.float().contiguous()) 122 | if key.dtype == torch.float16: 123 | time_first = time_first.float() 124 | key = key.float() 125 | value = value.float() 126 | time_first = time_first.contiguous() 127 | key = key.contiguous() 128 | value = value.contiguous() 129 | # The CUDA kernel will fill this tensor. 130 | output = torch.empty_like(key, memory_format=torch.contiguous_format) 131 | if return_state or state is not None: 132 | if state is None: 133 | state = torch.zeros( 134 | batch_size, 135 | hidden_size, 136 | 3, 137 | dtype=torch.float32, 138 | device=key.device, 139 | memory_format=torch.contiguous_format, 140 | ) 141 | state[:, :, 2] -= 1e38 142 | else: 143 | state = torch.cat([s.unsqueeze(2) for s in state], dim=2).contiguous() 144 | if key.dtype == torch.bfloat16: 145 | forward_func = rwkv_cuda_kernel.forward_with_state_bf16 146 | else: 147 | forward_func = rwkv_cuda_kernel.forward_with_state 148 | forward_func(time_decay, time_first, key, value, output, state) 149 | else: 150 | forward_func = rwkv_cuda_kernel.forward_bf16 if key.dtype == torch.bfloat16 else rwkv_cuda_kernel.forward 151 | forward_func(time_decay, time_first, key, value, output) 152 | 153 | ctx.save_for_backward(time_decay, time_first, key, value, output) 154 | 155 | if state is not None: 156 | state = [s.squeeze(2) for s in torch.chunk(state, 3, dim=2)] 157 | 158 | return output.to(ctx.input_dtype), state 159 | 160 | @staticmethod 161 | # g stands for grad 162 | def backward(ctx, g_output): 163 | input_dtype = ctx.input_dtype 164 | 165 | time_decay, time_first, key, value, output = ctx.saved_tensors 166 | # The CUDA kernel will fill those tensors. 167 | g_time_decay = torch.empty_like( 168 | time_decay, 169 | memory_format=torch.contiguous_format, 170 | dtype=torch.bfloat16 if input_dtype == torch.bfloat16 else torch.float32, 171 | ) 172 | g_time_first = torch.empty_like(time_first, memory_format=torch.contiguous_format) 173 | g_key = torch.empty_like(key, memory_format=torch.contiguous_format) 174 | g_value = torch.empty_like(value, memory_format=torch.contiguous_format) 175 | 176 | if input_dtype == torch.float16: 177 | g_output = g_output.float() 178 | backward_func = rwkv_cuda_kernel.backward_bf16 if input_dtype == torch.bfloat16 else rwkv_cuda_kernel.backward 179 | backward_func( 180 | time_decay, 181 | time_first, 182 | key, 183 | value, 184 | output, 185 | g_output.contiguous(), 186 | g_time_decay, 187 | g_time_first, 188 | g_key, 189 | g_value, 190 | ) 191 | g_time_decay = torch.sum(g_time_decay, dim=0) 192 | g_time_first = torch.sum(g_time_first, dim=0) 193 | 194 | return ( 195 | None, 196 | None, 197 | None, 198 | g_time_decay.to(input_dtype), 199 | g_time_first.to(input_dtype), 200 | g_key.to(input_dtype), 201 | g_value.to(input_dtype), 202 | ) 203 | 204 | 205 | def rwkv_linear_attention_cpu(time_decay, time_first, key, value, state=None, return_state=False): 206 | # For CPU fallback. Will be slower and probably take more memory than the custom CUDA kernel if not executed 207 | # within a torch.no_grad. 
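# The recurrence below keeps three running per-channel accumulators:
#   num_state - exponentially decayed, exp(key)-weighted sum of past values
#   den_state - the matching sum of weights (the normalizer)
#   max_state - the largest exponent seen so far, factored out before every exp()
# Their ratio is the WKV output at each step; time_first acts as a one-off bonus added to the
# current token's key only when producing the output, and the max_state bookkeeping keeps the
# exponentials in a numerically safe range (a streaming log-sum-exp).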
208 | _, seq_length, _ = key.size() 209 | output = torch.zeros_like(key) 210 | 211 | if state is None: 212 | num_state = torch.zeros_like(key[:, 0], dtype=torch.float32) 213 | den_state = torch.zeros_like(key[:, 0], dtype=torch.float32) 214 | max_state = torch.zeros_like(key[:, 0], dtype=torch.float32) - 1e38 215 | else: 216 | num_state, den_state, max_state = state 217 | # For numerical stability 218 | # real_numerator_state = num_state * torch.exp(max_state) 219 | # real_denominator_state = den_state * torch.exp(max_state) 220 | 221 | time_decay = -torch.exp(time_decay) 222 | 223 | for current_index in range(seq_length): 224 | current_key = key[:, current_index].float() 225 | current_value = value[:, current_index] 226 | 227 | # wkv computation at time t 228 | max_for_output = torch.maximum(max_state, current_key + time_first) 229 | e1 = torch.exp(max_state - max_for_output) 230 | e2 = torch.exp(current_key + time_first - max_for_output) 231 | numerator = e1 * num_state + e2 * current_value 232 | denominator = e1 * den_state + e2 233 | output[:, current_index] = (numerator / denominator).to(output.dtype) 234 | 235 | # Update state for next iteration 236 | max_for_state = torch.maximum(max_state + time_decay, current_key) 237 | e1 = torch.exp(max_state + time_decay - max_for_state) 238 | e2 = torch.exp(current_key - max_for_state) 239 | num_state = e1 * num_state + e2 * current_value 240 | den_state = e1 * den_state + e2 241 | max_state = max_for_state 242 | 243 | if return_state or state is not None: 244 | state = [num_state, den_state, max_state] 245 | 246 | return output, state 247 | 248 | 249 | def rwkv_linear_attention(time_decay, time_first, key, value, state=None, return_state=False): 250 | no_cuda = any(t.device.type != "cuda" for t in [time_decay, time_first, key, value]) 251 | # Launching the CUDA kernel for just one token will actually be slower (there is no for loop in the CPU version 252 | # in this case). 
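# Dispatch: use the fused CUDA kernel only when it compiled, every tensor is on GPU, and more
# than one token is being processed; the single-token case (cached generation, i.e. RNN mode)
# and all CPU inputs go through the pure-PyTorch fallback above.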
253 | one_token = key.size(1) == 1 254 | if rwkv_cuda_kernel is None or no_cuda or one_token: 255 | return rwkv_linear_attention_cpu(time_decay, time_first, key, value, state=state, return_state=return_state) 256 | else: 257 | return RwkvLinearAttention.apply(time_decay, time_first, key, value, state, return_state) 258 | 259 | 260 | class RwkvSelfAttention(nn.Module): 261 | def __init__(self, config, layer_id=0): 262 | super().__init__() 263 | self.config = config 264 | kernel_loaded = rwkv_cuda_kernel is not None and rwkv_cuda_kernel.max_seq_length == config.context_length 265 | if is_ninja_available() and is_torch_cuda_available() and not kernel_loaded: 266 | try: 267 | load_wkv_cuda_kernel(config.context_length) 268 | except Exception: 269 | logger.info("Could not load the custom CUDA kernel for RWKV attention.") 270 | self.layer_id = layer_id 271 | hidden_size = config.hidden_size 272 | attention_hidden_size = ( 273 | config.attention_hidden_size if config.attention_hidden_size is not None else hidden_size 274 | ) 275 | self.attention_hidden_size = attention_hidden_size 276 | 277 | self.time_decay = nn.Parameter(torch.empty(attention_hidden_size)) 278 | self.time_first = nn.Parameter(torch.empty(attention_hidden_size)) 279 | 280 | self.time_mix_key = nn.Parameter(torch.empty(1, 1, hidden_size)) 281 | self.time_mix_value = nn.Parameter(torch.empty(1, 1, hidden_size)) 282 | self.time_mix_receptance = nn.Parameter(torch.empty(1, 1, hidden_size)) 283 | 284 | self.time_shift = nn.ZeroPad2d((0, 0, 1, -1)) 285 | self.key = nn.Linear(hidden_size, attention_hidden_size, bias=False) 286 | self.value = nn.Linear(hidden_size, attention_hidden_size, bias=False) 287 | self.receptance = nn.Linear(hidden_size, attention_hidden_size, bias=False) 288 | self.output = nn.Linear(attention_hidden_size, hidden_size, bias=False) 289 | 290 | # TODO: maybe jit, otherwise move inside forward 291 | def extract_key_value(self, hidden, state=None): 292 | # Mix hidden with the previous timestep to produce key, value, receptance 293 | if hidden.size(1) == 1 and state is not None: 294 | shifted = state[1][:, :, self.layer_id] 295 | else: 296 | shifted = self.time_shift(hidden) 297 | if state is not None: 298 | shifted[:, 0] = state[1][:, :, self.layer_id] 299 | key = hidden * self.time_mix_key + shifted * (1 - self.time_mix_key) 300 | value = hidden * self.time_mix_value + shifted * (1 - self.time_mix_value) 301 | receptance = hidden * self.time_mix_receptance + shifted * (1 - self.time_mix_receptance) 302 | 303 | key = self.key(key) 304 | value = self.value(value) 305 | receptance = torch.sigmoid(self.receptance(receptance)) 306 | if state is not None: 307 | state[1][:, :, self.layer_id] = hidden[:, -1] 308 | return receptance, key, value, state 309 | 310 | def forward(self, hidden, state=None, use_cache=False): 311 | receptance, key, value, state = self.extract_key_value(hidden, state=state) 312 | layer_state = tuple(s[:, :, self.layer_id] for s in state[2:]) if state is not None else None 313 | rwkv, layer_state = rwkv_linear_attention( 314 | self.time_decay, 315 | self.time_first, 316 | key, 317 | value, 318 | state=layer_state, 319 | return_state=use_cache, 320 | ) 321 | 322 | if layer_state is not None: 323 | state[2][:, :, self.layer_id] = layer_state[0] 324 | state[3][:, :, self.layer_id] = layer_state[1] 325 | state[4][:, :, self.layer_id] = layer_state[2] 326 | 327 | return self.output(receptance * rwkv), state 328 | 329 | 330 | class RwkvFeedForward(nn.Module): 331 | def __init__(self, config, layer_id=0): 
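# RWKV "channel mixing" block: key and receptance are token-shift mixes of the current and
# previous hidden state, the value path applies a squared ReLU to the key projection, and the
# result is gated elementwise by sigmoid(receptance) in forward().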
332 | super().__init__() 333 | self.config = config 334 | self.layer_id = layer_id 335 | hidden_size = config.hidden_size 336 | intermediate_size = ( 337 | config.intermediate_size if config.intermediate_size is not None else 4 * config.hidden_size 338 | ) 339 | 340 | self.time_shift = nn.ZeroPad2d((0, 0, 1, -1)) 341 | self.time_mix_key = nn.Parameter(torch.empty(1, 1, hidden_size)) 342 | self.time_mix_receptance = nn.Parameter(torch.empty(1, 1, hidden_size)) 343 | 344 | self.key = nn.Linear(hidden_size, intermediate_size, bias=False) 345 | self.receptance = nn.Linear(hidden_size, hidden_size, bias=False) 346 | self.value = nn.Linear(intermediate_size, hidden_size, bias=False) 347 | 348 | def forward(self, hidden, state=None): 349 | if hidden.size(1) == 1 and state is not None: 350 | shifted = state[0][:, :, self.layer_id] 351 | else: 352 | shifted = self.time_shift(hidden) 353 | if state is not None: 354 | shifted[:, 0] = state[0][:, :, self.layer_id] 355 | key = hidden * self.time_mix_key + shifted * (1 - self.time_mix_key) 356 | receptance = hidden * self.time_mix_receptance + shifted * (1 - self.time_mix_receptance) 357 | 358 | key = torch.square(torch.relu(self.key(key))) 359 | value = self.value(key) 360 | receptance = torch.sigmoid(self.receptance(receptance)) 361 | 362 | if state is not None: 363 | state[0][:, :, self.layer_id] = hidden[:, -1] 364 | 365 | return receptance * value, state 366 | 367 | 368 | class RwkvBlock(nn.Module): 369 | def __init__(self, config, layer_id): 370 | super().__init__() 371 | self.config = config 372 | self.layer_id = layer_id 373 | 374 | if layer_id == 0: 375 | self.pre_ln = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_epsilon) 376 | 377 | self.ln1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_epsilon) 378 | self.ln2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_epsilon) 379 | 380 | self.attention = RwkvSelfAttention(config, layer_id) 381 | self.feed_forward = RwkvFeedForward(config, layer_id) 382 | 383 | def forward(self, hidden, state=None, use_cache=False, output_attentions=False): 384 | if self.layer_id == 0: 385 | hidden = self.pre_ln(hidden) 386 | 387 | attention, state = self.attention(self.ln1(hidden), state=state, use_cache=use_cache) 388 | hidden = hidden + attention 389 | 390 | feed_forward, state = self.feed_forward(self.ln2(hidden), state=state) 391 | hidden = hidden + feed_forward 392 | 393 | outputs = (hidden, state) 394 | if output_attentions: 395 | outputs += (attention,) 396 | else: 397 | outputs += (None,) 398 | 399 | return outputs 400 | 401 | 402 | class RwkvPreTrainedModel(PreTrainedModel): 403 | """ 404 | An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained 405 | models. 
406 | """ 407 | 408 | config_class = RwkvConfig 409 | base_model_prefix = "rwkv" 410 | _no_split_modules = ["RwkvBlock"] 411 | _keep_in_fp32_modules = ["time_decay", "time_first"] 412 | 413 | def _init_weights(self, module): 414 | """Initialize the weights.""" 415 | if isinstance(module, RwkvSelfAttention): 416 | layer_id = module.layer_id 417 | num_hidden_layers = module.config.num_hidden_layers 418 | hidden_size = module.config.hidden_size 419 | attention_hidden_size = module.attention_hidden_size 420 | 421 | ratio_0_to_1 = layer_id / (num_hidden_layers - 1) # 0 to 1 422 | ratio_1_to_almost0 = 1.0 - (layer_id / num_hidden_layers) # 1 to ~0 423 | 424 | time_weight = torch.tensor( 425 | [i / hidden_size for i in range(hidden_size)], 426 | dtype=module.time_mix_key.dtype, 427 | device=module.time_mix_key.device, 428 | ) 429 | time_weight = time_weight[None, None, :] 430 | 431 | decay_speed = [ 432 | -5 + 8 * (h / (attention_hidden_size - 1)) ** (0.7 + 1.3 * ratio_0_to_1) 433 | for h in range(attention_hidden_size) 434 | ] 435 | decay_speed = torch.tensor(decay_speed, dtype=module.time_decay.dtype, device=module.time_decay.device) 436 | zigzag = ( 437 | torch.tensor( 438 | [(i + 1) % 3 - 1 for i in range(attention_hidden_size)], 439 | dtype=module.time_first.dtype, 440 | device=module.time_first.device, 441 | ) 442 | * 0.5 443 | ) 444 | 445 | with torch.no_grad(): 446 | module.time_decay.data = decay_speed 447 | module.time_first.data = torch.ones_like(module.time_first * math.log(0.3) + zigzag) 448 | 449 | module.time_mix_key.data = torch.pow(time_weight, ratio_1_to_almost0) 450 | module.time_mix_value.data = torch.pow(time_weight, ratio_1_to_almost0) + 0.3 * ratio_0_to_1 451 | module.time_mix_receptance.data = torch.pow(time_weight, 0.5 * ratio_1_to_almost0) 452 | elif isinstance(module, RwkvFeedForward): 453 | layer_id = module.layer_id 454 | num_hidden_layers = module.config.num_hidden_layers 455 | hidden_size = module.config.hidden_size 456 | 457 | ratio_1_to_almost0 = 1.0 - (layer_id / num_hidden_layers) # 1 to ~0 458 | 459 | time_weight = torch.tensor( 460 | [i / hidden_size for i in range(hidden_size)], 461 | dtype=module.time_mix_key.dtype, 462 | device=module.time_mix_key.device, 463 | ) 464 | time_weight = time_weight[None, None, :] 465 | 466 | with torch.no_grad(): 467 | module.time_mix_key.data = torch.pow(time_weight, ratio_1_to_almost0) 468 | module.time_mix_receptance.data = torch.pow(time_weight, ratio_1_to_almost0) 469 | 470 | def _set_gradient_checkpointing(self, module, value=False): 471 | if isinstance(module, RwkvModel): 472 | module.gradient_checkpointing = value 473 | 474 | 475 | @dataclass 476 | class RwkvOutput(ModelOutput): 477 | """ 478 | Class for the RWKV model outputs. 479 | 480 | Args: 481 | last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): 482 | Sequence of hidden-states at the output of the last layer of the model. 483 | state (list of five `torch.FloatTensor` of shape `(batch_size, hidden_size, num_hidden_layers)`): 484 | The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to 485 | avoid providing the old `input_ids`. 
486 | hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): 487 | Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + 488 | one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. 489 | 490 | Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. 491 | attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): 492 | Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, 493 | sequence_length)`. 494 | 495 | Attentions weights after the attention softmax, used to compute the weighted average in the self-attention 496 | heads. 497 | """ 498 | 499 | last_hidden_state: torch.FloatTensor = None 500 | state: Optional[List[torch.FloatTensor]] = None 501 | hidden_states: Optional[Tuple[torch.FloatTensor]] = None 502 | attentions: Optional[Tuple[torch.FloatTensor]] = None 503 | 504 | 505 | @dataclass 506 | class RwkvCausalLMOutput(ModelOutput): 507 | """ 508 | Base class for causal language model (or autoregressive) outputs. 509 | 510 | Args: 511 | loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): 512 | Language modeling loss (for next-token prediction). 513 | logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): 514 | Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). 515 | state (list of five `torch.FloatTensor` of shape `(batch_size, hidden_size, num_hidden_layers)`): 516 | The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to 517 | avoid providing the old `input_ids`. 518 | hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): 519 | Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + 520 | one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. 521 | 522 | Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. 523 | attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): 524 | Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, 525 | sequence_length)`. 526 | 527 | Attentions weights after the attention softmax, used to compute the weighted average in the self-attention 528 | heads. 529 | """ 530 | 531 | loss: Optional[torch.FloatTensor] = None 532 | logits: torch.FloatTensor = None 533 | state: Optional[List[torch.FloatTensor]] = None 534 | hidden_states: Optional[Tuple[torch.FloatTensor]] = None 535 | attentions: Optional[Tuple[torch.FloatTensor]] = None 536 | 537 | 538 | RWKV_START_DOCSTRING = r""" 539 | 540 | This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the 541 | library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads 542 | etc.) 543 | 544 | This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. 
545 | Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage 546 | and behavior. 547 | 548 | Parameters: 549 | config ([`RwkvConfig`]): Model configuration class with all the parameters of the model. 550 | Initializing with a config file does not load the weights associated with the model, only the 551 | configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. 552 | """ 553 | 554 | RWKV_INPUTS_DOCSTRING = r""" 555 | Args: 556 | input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`): 557 | `input_ids_length` = `sequence_length` if `past_key_values` is `None` else 558 | `past_key_values[0][0].shape[-2]` (`sequence_length` of input past key value states). Indices of input 559 | sequence tokens in the vocabulary. 560 | 561 | If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as 562 | `input_ids`. 563 | 564 | Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and 565 | [`PreTrainedTokenizer.__call__`] for details. 566 | 567 | [What are input IDs?](../glossary#input-ids) 568 | inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): 569 | Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This 570 | is useful if you want more control over how to convert `input_ids` indices into associated vectors than the 571 | model's internal embedding lookup matrix. 572 | state (tuple of five `torch.FloatTensor` of shape `(batch_size, hidden_size, num_hidden_layers)`, *optional*): 573 | If passed along, the model uses the previous state in all the blocks (which will give the output for the 574 | `input_ids` provided as if the model add `state_input_ids + input_ids` as context). 575 | use_cache (`bool`, *optional*): 576 | If set to `True`, the last state is returned and can be used to quickly generate the next logits. 577 | output_attentions (`bool`, *optional*): 578 | Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned 579 | tensors for more detail. 580 | output_hidden_states (`bool`, *optional*): 581 | Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for 582 | more detail. 583 | return_dict (`bool`, *optional*): 584 | Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
585 | """ 586 | 587 | 588 | @add_start_docstrings( 589 | "The bare RWKV Model transformer outputting raw hidden-states without any specific head on top.", 590 | RWKV_START_DOCSTRING, 591 | ) 592 | class RwkvModel(RwkvPreTrainedModel): 593 | def __init__(self, config): 594 | super().__init__(config) 595 | 596 | self.embeddings = nn.Embedding(config.vocab_size, config.hidden_size) 597 | self.blocks = nn.ModuleList([RwkvBlock(config, layer_id=idx) for idx in range(config.num_hidden_layers)]) 598 | self.ln_out = nn.LayerNorm(config.hidden_size) 599 | 600 | self.layers_are_rescaled = False 601 | 602 | # Initialize weights and apply final processing 603 | self.post_init() 604 | 605 | def get_input_embeddings(self): 606 | return self.embeddings 607 | 608 | def set_input_embeddings(self, new_embeddings): 609 | self.embeddings = new_embeddings 610 | 611 | @add_start_docstrings_to_model_forward(RWKV_INPUTS_DOCSTRING) 612 | @add_code_sample_docstrings( 613 | checkpoint=_CHECKPOINT_FOR_DOC, 614 | output_type=RwkvOutput, 615 | config_class=_CONFIG_FOR_DOC, 616 | ) 617 | def forward( 618 | self, 619 | input_ids: Optional[torch.LongTensor] = None, 620 | inputs_embeds: Optional[torch.FloatTensor] = None, 621 | state: Optional[List[torch.FloatTensor]] = None, 622 | use_cache: Optional[bool] = None, 623 | output_attentions: Optional[bool] = None, 624 | output_hidden_states: Optional[bool] = None, 625 | return_dict: Optional[bool] = None, 626 | ) -> Union[Tuple, RwkvOutput]: 627 | output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions 628 | output_hidden_states = ( 629 | output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states 630 | ) 631 | use_cache = use_cache if use_cache is not None else (self.config.use_cache if not self.training else False) 632 | return_dict = return_dict if return_dict is not None else self.config.use_return_dict 633 | 634 | if self.training == self.layers_are_rescaled: 635 | self._rescale_layers() 636 | 637 | # if input_ids is not None and inputs_embeds is not None: 638 | # raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") 639 | # elif input_ids is None and inputs_embeds is None: 640 | # raise ValueError("You have to specify either input_ids or inputs_embeds") 641 | 642 | if inputs_embeds is None: 643 | inputs_embeds = self.embeddings(input_ids) 644 | 645 | if use_cache and state is None: 646 | shape = (inputs_embeds.size(0), self.config.hidden_size, self.config.num_hidden_layers) 647 | state = [ 648 | torch.zeros( 649 | *shape, dtype=inputs_embeds.dtype if i <= 1 else torch.float32, device=inputs_embeds.device 650 | ) 651 | for i in range(5) 652 | ] 653 | state[4] -= 1e30 654 | 655 | hidden_states = inputs_embeds 656 | 657 | all_self_attentions = () if output_attentions else None 658 | all_hidden_states = () if output_hidden_states else None 659 | for idx, block in enumerate(self.blocks): 660 | hidden_states, state, attentions = block( 661 | hidden_states, state=state, use_cache=use_cache, output_attentions=output_attentions 662 | ) 663 | if ( 664 | self.layers_are_rescaled 665 | and self.config.rescale_every > 0 666 | and (idx + 1) % self.config.rescale_every == 0 667 | ): 668 | hidden_states = hidden_states / 2 669 | 670 | if output_hidden_states: 671 | all_hidden_states = all_hidden_states + (hidden_states,) 672 | 673 | if output_attentions: 674 | all_self_attentions = all_self_attentions + (attentions,) 675 | 676 | hidden_states = 
self.ln_out(hidden_states) 677 | 678 | if output_hidden_states: 679 | all_hidden_states = all_hidden_states + (hidden_states,) 680 | 681 | if not return_dict: 682 | return (hidden_states, state, all_hidden_states, all_self_attentions) 683 | 684 | return RwkvOutput( 685 | last_hidden_state=hidden_states, 686 | state=state, 687 | hidden_states=all_hidden_states, 688 | attentions=all_self_attentions, 689 | ) 690 | 691 | def _rescale_layers(self): 692 | # Layers should be rescaled for inference only. 693 | if self.layers_are_rescaled == (not self.training): 694 | return 695 | if self.config.rescale_every > 0: 696 | with torch.no_grad(): 697 | for block_id, block in enumerate(self.blocks): 698 | if self.training: 699 | block.attention.output.weight.mul_(2 ** int(block_id // self.config.rescale_every)) 700 | block.feed_forward.value.weight.mul_(2 ** int(block_id // self.config.rescale_every)) 701 | else: 702 | block.attention.output.weight.div_(2 ** int(block_id // self.config.rescale_every)) 703 | block.feed_forward.value.weight.div_(2 ** int(block_id // self.config.rescale_every)) 704 | 705 | self.layers_are_rescaled = not self.training 706 | 707 | 708 | @add_start_docstrings( 709 | """ 710 | The RWKV Model transformer with a language modeling head on top (linear layer with weights tied to the input 711 | embeddings). 712 | """, 713 | RWKV_START_DOCSTRING, 714 | ) 715 | class RwkvForCausalLM(RwkvPreTrainedModel): 716 | def __init__(self, config): 717 | super().__init__(config) 718 | self.rwkv = RwkvModel(config) 719 | self.head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) 720 | 721 | # Initialize weights and apply final processing 722 | self.post_init() 723 | 724 | def get_output_embeddings(self): 725 | return self.head 726 | 727 | def set_output_embeddings(self, new_embeddings): 728 | self.head = new_embeddings 729 | 730 | def prepare_inputs_for_generation(self, input_ids, state=None, inputs_embeds=None, **kwargs): 731 | # only last token for inputs_ids if the state is passed along. 732 | if state is not None: 733 | input_ids = input_ids[:, -1].unsqueeze(-1) 734 | 735 | # if `inputs_embeds` are passed, we only want to use them in the 1st generation step 736 | if inputs_embeds is not None and state is None: 737 | model_inputs = {"inputs_embeds": inputs_embeds} 738 | else: 739 | model_inputs = {"input_ids": input_ids} 740 | 741 | model_inputs["state"] = state 742 | return model_inputs 743 | 744 | @add_start_docstrings_to_model_forward(RWKV_INPUTS_DOCSTRING) 745 | @add_code_sample_docstrings( 746 | checkpoint=_CHECKPOINT_FOR_DOC, 747 | output_type=RwkvCausalLMOutput, 748 | config_class=_CONFIG_FOR_DOC, 749 | ) 750 | def forward( 751 | self, 752 | input_ids: Optional[torch.LongTensor] = None, 753 | attention_mask: Optional[torch.LongTensor] = None, 754 | inputs_embeds: Optional[torch.FloatTensor] = None, 755 | state: Optional[List[torch.FloatTensor]] = None, 756 | labels: Optional[torch.LongTensor] = None, 757 | use_cache: Optional[bool] = None, 758 | output_attentions: Optional[bool] = None, 759 | output_hidden_states: Optional[bool] = None, 760 | return_dict: Optional[bool] = None, 761 | ) -> Union[Tuple, RwkvCausalLMOutput]: 762 | r""" 763 | labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): 764 | Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. 
you can set 765 | `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100` 766 | are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]` 767 | """ 768 | return_dict = return_dict if return_dict is not None else self.config.use_return_dict 769 | 770 | rwkv_outputs = self.rwkv( 771 | input_ids, 772 | inputs_embeds=inputs_embeds, 773 | state=state, 774 | use_cache=use_cache, 775 | output_attentions=output_attentions, 776 | output_hidden_states=output_hidden_states, 777 | return_dict=return_dict, 778 | ) 779 | hidden_states = rwkv_outputs[0] 780 | 781 | logits = self.head(hidden_states) 782 | 783 | loss = None 784 | if labels is not None: 785 | # move labels to correct device to enable model parallelism 786 | labels = labels.to(logits.device) 787 | # Shift so that tokens < n predict n 788 | shift_logits = logits[..., :-1, :].contiguous() 789 | shift_labels = labels[..., 1:].contiguous() 790 | # Flatten the tokens 791 | loss_fct = CrossEntropyLoss() 792 | loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) 793 | 794 | if not return_dict: 795 | output = (logits,) + rwkv_outputs[1:] 796 | return ((loss,) + output) if loss is not None else output 797 | 798 | return RwkvCausalLMOutput( 799 | loss=loss, 800 | logits=logits, 801 | state=rwkv_outputs.state, 802 | hidden_states=rwkv_outputs.hidden_states, 803 | attentions=rwkv_outputs.attentions, 804 | ) 805 | 806 | if __name__ == "__main__": 807 | model = RwkvForCausalLM.from_pretrained("RWKV-4-Raven-3B-v11-zh",device_map='auto').to("cuda") 808 | from transformers import GPTNeoXTokenizerFast 809 | tokenizer = GPTNeoXTokenizerFast.from_pretrained("RWKV-4-Raven-3B-v11-zh") 810 | text = "你叫什么名字啊?" 811 | 812 | input_ids = tokenizer.encode(text, return_tensors='pt').to("cuda") 813 | out = model.generate(input_ids=input_ids,max_new_tokens=128) 814 | answer = tokenizer.decode(out[0]) 815 | print(answer) 816 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | Preamble 9 | 10 | The GNU General Public License is a free, copyleft license for 11 | software and other kinds of works. 12 | 13 | The licenses for most software and other practical works are designed 14 | to take away your freedom to share and change the works. By contrast, 15 | the GNU General Public License is intended to guarantee your freedom to 16 | share and change all versions of a program--to make sure it remains free 17 | software for all its users. We, the Free Software Foundation, use the 18 | GNU General Public License for most of our software; it applies also to 19 | any other work released this way by its authors. You can apply it to 20 | your programs, too. 21 | 22 | When we speak of free software, we are referring to freedom, not 23 | price. Our General Public Licenses are designed to make sure that you 24 | have the freedom to distribute copies of free software (and charge for 25 | them if you wish), that you receive source code or can get it if you 26 | want it, that you can change the software or use pieces of it in new 27 | free programs, and that you know you can do these things. 
28 | 29 | To protect your rights, we need to prevent others from denying you 30 | these rights or asking you to surrender the rights. Therefore, you have 31 | certain responsibilities if you distribute copies of the software, or if 32 | you modify it: responsibilities to respect the freedom of others. 33 | 34 | For example, if you distribute copies of such a program, whether 35 | gratis or for a fee, you must pass on to the recipients the same 36 | freedoms that you received. You must make sure that they, too, receive 37 | or can get the source code. And you must show them these terms so they 38 | know their rights. 39 | 40 | Developers that use the GNU GPL protect your rights with two steps: 41 | (1) assert copyright on the software, and (2) offer you this License 42 | giving you legal permission to copy, distribute and/or modify it. 43 | 44 | For the developers' and authors' protection, the GPL clearly explains 45 | that there is no warranty for this free software. For both users' and 46 | authors' sake, the GPL requires that modified versions be marked as 47 | changed, so that their problems will not be attributed erroneously to 48 | authors of previous versions. 49 | 50 | Some devices are designed to deny users access to install or run 51 | modified versions of the software inside them, although the manufacturer 52 | can do so. This is fundamentally incompatible with the aim of 53 | protecting users' freedom to change the software. The systematic 54 | pattern of such abuse occurs in the area of products for individuals to 55 | use, which is precisely where it is most unacceptable. Therefore, we 56 | have designed this version of the GPL to prohibit the practice for those 57 | products. If such problems arise substantially in other domains, we 58 | stand ready to extend this provision to those domains in future versions 59 | of the GPL, as needed to protect the freedom of users. 60 | 61 | Finally, every program is threatened constantly by software patents. 62 | States should not allow patents to restrict development and use of 63 | software on general-purpose computers, but in those that do, we wish to 64 | avoid the special danger that patents applied to a free program could 65 | make it effectively proprietary. To prevent this, the GPL assures that 66 | patents cannot be used to render the program non-free. 67 | 68 | The precise terms and conditions for copying, distribution and 69 | modification follow. 70 | 71 | TERMS AND CONDITIONS 72 | 73 | 0. Definitions. 74 | 75 | "This License" refers to version 3 of the GNU General Public License. 76 | 77 | "Copyright" also means copyright-like laws that apply to other kinds of 78 | works, such as semiconductor masks. 79 | 80 | "The Program" refers to any copyrightable work licensed under this 81 | License. Each licensee is addressed as "you". "Licensees" and 82 | "recipients" may be individuals or organizations. 83 | 84 | To "modify" a work means to copy from or adapt all or part of the work 85 | in a fashion requiring copyright permission, other than the making of an 86 | exact copy. The resulting work is called a "modified version" of the 87 | earlier work or a work "based on" the earlier work. 88 | 89 | A "covered work" means either the unmodified Program or a work based 90 | on the Program. 
91 | 92 | To "propagate" a work means to do anything with it that, without 93 | permission, would make you directly or secondarily liable for 94 | infringement under applicable copyright law, except executing it on a 95 | computer or modifying a private copy. Propagation includes copying, 96 | distribution (with or without modification), making available to the 97 | public, and in some countries other activities as well. 98 | 99 | To "convey" a work means any kind of propagation that enables other 100 | parties to make or receive copies. Mere interaction with a user through 101 | a computer network, with no transfer of a copy, is not conveying. 102 | 103 | An interactive user interface displays "Appropriate Legal Notices" 104 | to the extent that it includes a convenient and prominently visible 105 | feature that (1) displays an appropriate copyright notice, and (2) 106 | tells the user that there is no warranty for the work (except to the 107 | extent that warranties are provided), that licensees may convey the 108 | work under this License, and how to view a copy of this License. If 109 | the interface presents a list of user commands or options, such as a 110 | menu, a prominent item in the list meets this criterion. 111 | 112 | 1. Source Code. 113 | 114 | The "source code" for a work means the preferred form of the work 115 | for making modifications to it. "Object code" means any non-source 116 | form of a work. 117 | 118 | A "Standard Interface" means an interface that either is an official 119 | standard defined by a recognized standards body, or, in the case of 120 | interfaces specified for a particular programming language, one that 121 | is widely used among developers working in that language. 122 | 123 | The "System Libraries" of an executable work include anything, other 124 | than the work as a whole, that (a) is included in the normal form of 125 | packaging a Major Component, but which is not part of that Major 126 | Component, and (b) serves only to enable use of the work with that 127 | Major Component, or to implement a Standard Interface for which an 128 | implementation is available to the public in source code form. A 129 | "Major Component", in this context, means a major essential component 130 | (kernel, window system, and so on) of the specific operating system 131 | (if any) on which the executable work runs, or a compiler used to 132 | produce the work, or an object code interpreter used to run it. 133 | 134 | The "Corresponding Source" for a work in object code form means all 135 | the source code needed to generate, install, and (for an executable 136 | work) run the object code and to modify the work, including scripts to 137 | control those activities. However, it does not include the work's 138 | System Libraries, or general-purpose tools or generally available free 139 | programs which are used unmodified in performing those activities but 140 | which are not part of the work. For example, Corresponding Source 141 | includes interface definition files associated with source files for 142 | the work, and the source code for shared libraries and dynamically 143 | linked subprograms that the work is specifically designed to require, 144 | such as by intimate data communication or control flow between those 145 | subprograms and other parts of the work. 146 | 147 | The Corresponding Source need not include anything that users 148 | can regenerate automatically from other parts of the Corresponding 149 | Source. 
150 | 151 | The Corresponding Source for a work in source code form is that 152 | same work. 153 | 154 | 2. Basic Permissions. 155 | 156 | All rights granted under this License are granted for the term of 157 | copyright on the Program, and are irrevocable provided the stated 158 | conditions are met. This License explicitly affirms your unlimited 159 | permission to run the unmodified Program. The output from running a 160 | covered work is covered by this License only if the output, given its 161 | content, constitutes a covered work. This License acknowledges your 162 | rights of fair use or other equivalent, as provided by copyright law. 163 | 164 | You may make, run and propagate covered works that you do not 165 | convey, without conditions so long as your license otherwise remains 166 | in force. You may convey covered works to others for the sole purpose 167 | of having them make modifications exclusively for you, or provide you 168 | with facilities for running those works, provided that you comply with 169 | the terms of this License in conveying all material for which you do 170 | not control copyright. Those thus making or running the covered works 171 | for you must do so exclusively on your behalf, under your direction 172 | and control, on terms that prohibit them from making any copies of 173 | your copyrighted material outside their relationship with you. 174 | 175 | Conveying under any other circumstances is permitted solely under 176 | the conditions stated below. Sublicensing is not allowed; section 10 177 | makes it unnecessary. 178 | 179 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law. 180 | 181 | No covered work shall be deemed part of an effective technological 182 | measure under any applicable law fulfilling obligations under article 183 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or 184 | similar laws prohibiting or restricting circumvention of such 185 | measures. 186 | 187 | When you convey a covered work, you waive any legal power to forbid 188 | circumvention of technological measures to the extent such circumvention 189 | is effected by exercising rights under this License with respect to 190 | the covered work, and you disclaim any intention to limit operation or 191 | modification of the work as a means of enforcing, against the work's 192 | users, your or third parties' legal rights to forbid circumvention of 193 | technological measures. 194 | 195 | 4. Conveying Verbatim Copies. 196 | 197 | You may convey verbatim copies of the Program's source code as you 198 | receive it, in any medium, provided that you conspicuously and 199 | appropriately publish on each copy an appropriate copyright notice; 200 | keep intact all notices stating that this License and any 201 | non-permissive terms added in accord with section 7 apply to the code; 202 | keep intact all notices of the absence of any warranty; and give all 203 | recipients a copy of this License along with the Program. 204 | 205 | You may charge any price or no price for each copy that you convey, 206 | and you may offer support or warranty protection for a fee. 207 | 208 | 5. Conveying Modified Source Versions. 209 | 210 | You may convey a work based on the Program, or the modifications to 211 | produce it from the Program, in the form of source code under the 212 | terms of section 4, provided that you also meet all of these conditions: 213 | 214 | a) The work must carry prominent notices stating that you modified 215 | it, and giving a relevant date. 
216 | 217 | b) The work must carry prominent notices stating that it is 218 | released under this License and any conditions added under section 219 | 7. This requirement modifies the requirement in section 4 to 220 | "keep intact all notices". 221 | 222 | c) You must license the entire work, as a whole, under this 223 | License to anyone who comes into possession of a copy. This 224 | License will therefore apply, along with any applicable section 7 225 | additional terms, to the whole of the work, and all its parts, 226 | regardless of how they are packaged. This License gives no 227 | permission to license the work in any other way, but it does not 228 | invalidate such permission if you have separately received it. 229 | 230 | d) If the work has interactive user interfaces, each must display 231 | Appropriate Legal Notices; however, if the Program has interactive 232 | interfaces that do not display Appropriate Legal Notices, your 233 | work need not make them do so. 234 | 235 | A compilation of a covered work with other separate and independent 236 | works, which are not by their nature extensions of the covered work, 237 | and which are not combined with it such as to form a larger program, 238 | in or on a volume of a storage or distribution medium, is called an 239 | "aggregate" if the compilation and its resulting copyright are not 240 | used to limit the access or legal rights of the compilation's users 241 | beyond what the individual works permit. Inclusion of a covered work 242 | in an aggregate does not cause this License to apply to the other 243 | parts of the aggregate. 244 | 245 | 6. Conveying Non-Source Forms. 246 | 247 | You may convey a covered work in object code form under the terms 248 | of sections 4 and 5, provided that you also convey the 249 | machine-readable Corresponding Source under the terms of this License, 250 | in one of these ways: 251 | 252 | a) Convey the object code in, or embodied in, a physical product 253 | (including a physical distribution medium), accompanied by the 254 | Corresponding Source fixed on a durable physical medium 255 | customarily used for software interchange. 256 | 257 | b) Convey the object code in, or embodied in, a physical product 258 | (including a physical distribution medium), accompanied by a 259 | written offer, valid for at least three years and valid for as 260 | long as you offer spare parts or customer support for that product 261 | model, to give anyone who possesses the object code either (1) a 262 | copy of the Corresponding Source for all the software in the 263 | product that is covered by this License, on a durable physical 264 | medium customarily used for software interchange, for a price no 265 | more than your reasonable cost of physically performing this 266 | conveying of source, or (2) access to copy the 267 | Corresponding Source from a network server at no charge. 268 | 269 | c) Convey individual copies of the object code with a copy of the 270 | written offer to provide the Corresponding Source. This 271 | alternative is allowed only occasionally and noncommercially, and 272 | only if you received the object code with such an offer, in accord 273 | with subsection 6b. 274 | 275 | d) Convey the object code by offering access from a designated 276 | place (gratis or for a charge), and offer equivalent access to the 277 | Corresponding Source in the same way through the same place at no 278 | further charge. 
You need not require recipients to copy the 279 | Corresponding Source along with the object code. If the place to 280 | copy the object code is a network server, the Corresponding Source 281 | may be on a different server (operated by you or a third party) 282 | that supports equivalent copying facilities, provided you maintain 283 | clear directions next to the object code saying where to find the 284 | Corresponding Source. Regardless of what server hosts the 285 | Corresponding Source, you remain obligated to ensure that it is 286 | available for as long as needed to satisfy these requirements. 287 | 288 | e) Convey the object code using peer-to-peer transmission, provided 289 | you inform other peers where the object code and Corresponding 290 | Source of the work are being offered to the general public at no 291 | charge under subsection 6d. 292 | 293 | A separable portion of the object code, whose source code is excluded 294 | from the Corresponding Source as a System Library, need not be 295 | included in conveying the object code work. 296 | 297 | A "User Product" is either (1) a "consumer product", which means any 298 | tangible personal property which is normally used for personal, family, 299 | or household purposes, or (2) anything designed or sold for incorporation 300 | into a dwelling. In determining whether a product is a consumer product, 301 | doubtful cases shall be resolved in favor of coverage. For a particular 302 | product received by a particular user, "normally used" refers to a 303 | typical or common use of that class of product, regardless of the status 304 | of the particular user or of the way in which the particular user 305 | actually uses, or expects or is expected to use, the product. A product 306 | is a consumer product regardless of whether the product has substantial 307 | commercial, industrial or non-consumer uses, unless such uses represent 308 | the only significant mode of use of the product. 309 | 310 | "Installation Information" for a User Product means any methods, 311 | procedures, authorization keys, or other information required to install 312 | and execute modified versions of a covered work in that User Product from 313 | a modified version of its Corresponding Source. The information must 314 | suffice to ensure that the continued functioning of the modified object 315 | code is in no case prevented or interfered with solely because 316 | modification has been made. 317 | 318 | If you convey an object code work under this section in, or with, or 319 | specifically for use in, a User Product, and the conveying occurs as 320 | part of a transaction in which the right of possession and use of the 321 | User Product is transferred to the recipient in perpetuity or for a 322 | fixed term (regardless of how the transaction is characterized), the 323 | Corresponding Source conveyed under this section must be accompanied 324 | by the Installation Information. But this requirement does not apply 325 | if neither you nor any third party retains the ability to install 326 | modified object code on the User Product (for example, the work has 327 | been installed in ROM). 328 | 329 | The requirement to provide Installation Information does not include a 330 | requirement to continue to provide support service, warranty, or updates 331 | for a work that has been modified or installed by the recipient, or for 332 | the User Product in which it has been modified or installed. 
Access to a 333 | network may be denied when the modification itself materially and 334 | adversely affects the operation of the network or violates the rules and 335 | protocols for communication across the network. 336 | 337 | Corresponding Source conveyed, and Installation Information provided, 338 | in accord with this section must be in a format that is publicly 339 | documented (and with an implementation available to the public in 340 | source code form), and must require no special password or key for 341 | unpacking, reading or copying. 342 | 343 | 7. Additional Terms. 344 | 345 | "Additional permissions" are terms that supplement the terms of this 346 | License by making exceptions from one or more of its conditions. 347 | Additional permissions that are applicable to the entire Program shall 348 | be treated as though they were included in this License, to the extent 349 | that they are valid under applicable law. If additional permissions 350 | apply only to part of the Program, that part may be used separately 351 | under those permissions, but the entire Program remains governed by 352 | this License without regard to the additional permissions. 353 | 354 | When you convey a copy of a covered work, you may at your option 355 | remove any additional permissions from that copy, or from any part of 356 | it. (Additional permissions may be written to require their own 357 | removal in certain cases when you modify the work.) You may place 358 | additional permissions on material, added by you to a covered work, 359 | for which you have or can give appropriate copyright permission. 360 | 361 | Notwithstanding any other provision of this License, for material you 362 | add to a covered work, you may (if authorized by the copyright holders of 363 | that material) supplement the terms of this License with terms: 364 | 365 | a) Disclaiming warranty or limiting liability differently from the 366 | terms of sections 15 and 16 of this License; or 367 | 368 | b) Requiring preservation of specified reasonable legal notices or 369 | author attributions in that material or in the Appropriate Legal 370 | Notices displayed by works containing it; or 371 | 372 | c) Prohibiting misrepresentation of the origin of that material, or 373 | requiring that modified versions of such material be marked in 374 | reasonable ways as different from the original version; or 375 | 376 | d) Limiting the use for publicity purposes of names of licensors or 377 | authors of the material; or 378 | 379 | e) Declining to grant rights under trademark law for use of some 380 | trade names, trademarks, or service marks; or 381 | 382 | f) Requiring indemnification of licensors and authors of that 383 | material by anyone who conveys the material (or modified versions of 384 | it) with contractual assumptions of liability to the recipient, for 385 | any liability that these contractual assumptions directly impose on 386 | those licensors and authors. 387 | 388 | All other non-permissive additional terms are considered "further 389 | restrictions" within the meaning of section 10. If the Program as you 390 | received it, or any part of it, contains a notice stating that it is 391 | governed by this License along with a term that is a further 392 | restriction, you may remove that term. 
If a license document contains 393 | a further restriction but permits relicensing or conveying under this 394 | License, you may add to a covered work material governed by the terms 395 | of that license document, provided that the further restriction does 396 | not survive such relicensing or conveying. 397 | 398 | If you add terms to a covered work in accord with this section, you 399 | must place, in the relevant source files, a statement of the 400 | additional terms that apply to those files, or a notice indicating 401 | where to find the applicable terms. 402 | 403 | Additional terms, permissive or non-permissive, may be stated in the 404 | form of a separately written license, or stated as exceptions; 405 | the above requirements apply either way. 406 | 407 | 8. Termination. 408 | 409 | You may not propagate or modify a covered work except as expressly 410 | provided under this License. Any attempt otherwise to propagate or 411 | modify it is void, and will automatically terminate your rights under 412 | this License (including any patent licenses granted under the third 413 | paragraph of section 11). 414 | 415 | However, if you cease all violation of this License, then your 416 | license from a particular copyright holder is reinstated (a) 417 | provisionally, unless and until the copyright holder explicitly and 418 | finally terminates your license, and (b) permanently, if the copyright 419 | holder fails to notify you of the violation by some reasonable means 420 | prior to 60 days after the cessation. 421 | 422 | Moreover, your license from a particular copyright holder is 423 | reinstated permanently if the copyright holder notifies you of the 424 | violation by some reasonable means, this is the first time you have 425 | received notice of violation of this License (for any work) from that 426 | copyright holder, and you cure the violation prior to 30 days after 427 | your receipt of the notice. 428 | 429 | Termination of your rights under this section does not terminate the 430 | licenses of parties who have received copies or rights from you under 431 | this License. If your rights have been terminated and not permanently 432 | reinstated, you do not qualify to receive new licenses for the same 433 | material under section 10. 434 | 435 | 9. Acceptance Not Required for Having Copies. 436 | 437 | You are not required to accept this License in order to receive or 438 | run a copy of the Program. Ancillary propagation of a covered work 439 | occurring solely as a consequence of using peer-to-peer transmission 440 | to receive a copy likewise does not require acceptance. However, 441 | nothing other than this License grants you permission to propagate or 442 | modify any covered work. These actions infringe copyright if you do 443 | not accept this License. Therefore, by modifying or propagating a 444 | covered work, you indicate your acceptance of this License to do so. 445 | 446 | 10. Automatic Licensing of Downstream Recipients. 447 | 448 | Each time you convey a covered work, the recipient automatically 449 | receives a license from the original licensors, to run, modify and 450 | propagate that work, subject to this License. You are not responsible 451 | for enforcing compliance by third parties with this License. 452 | 453 | An "entity transaction" is a transaction transferring control of an 454 | organization, or substantially all assets of one, or subdividing an 455 | organization, or merging organizations. 
If propagation of a covered 456 | work results from an entity transaction, each party to that 457 | transaction who receives a copy of the work also receives whatever 458 | licenses to the work the party's predecessor in interest had or could 459 | give under the previous paragraph, plus a right to possession of the 460 | Corresponding Source of the work from the predecessor in interest, if 461 | the predecessor has it or can get it with reasonable efforts. 462 | 463 | You may not impose any further restrictions on the exercise of the 464 | rights granted or affirmed under this License. For example, you may 465 | not impose a license fee, royalty, or other charge for exercise of 466 | rights granted under this License, and you may not initiate litigation 467 | (including a cross-claim or counterclaim in a lawsuit) alleging that 468 | any patent claim is infringed by making, using, selling, offering for 469 | sale, or importing the Program or any portion of it. 470 | 471 | 11. Patents. 472 | 473 | A "contributor" is a copyright holder who authorizes use under this 474 | License of the Program or a work on which the Program is based. The 475 | work thus licensed is called the contributor's "contributor version". 476 | 477 | A contributor's "essential patent claims" are all patent claims 478 | owned or controlled by the contributor, whether already acquired or 479 | hereafter acquired, that would be infringed by some manner, permitted 480 | by this License, of making, using, or selling its contributor version, 481 | but do not include claims that would be infringed only as a 482 | consequence of further modification of the contributor version. For 483 | purposes of this definition, "control" includes the right to grant 484 | patent sublicenses in a manner consistent with the requirements of 485 | this License. 486 | 487 | Each contributor grants you a non-exclusive, worldwide, royalty-free 488 | patent license under the contributor's essential patent claims, to 489 | make, use, sell, offer for sale, import and otherwise run, modify and 490 | propagate the contents of its contributor version. 491 | 492 | In the following three paragraphs, a "patent license" is any express 493 | agreement or commitment, however denominated, not to enforce a patent 494 | (such as an express permission to practice a patent or covenant not to 495 | sue for patent infringement). To "grant" such a patent license to a 496 | party means to make such an agreement or commitment not to enforce a 497 | patent against the party. 498 | 499 | If you convey a covered work, knowingly relying on a patent license, 500 | and the Corresponding Source of the work is not available for anyone 501 | to copy, free of charge and under the terms of this License, through a 502 | publicly available network server or other readily accessible means, 503 | then you must either (1) cause the Corresponding Source to be so 504 | available, or (2) arrange to deprive yourself of the benefit of the 505 | patent license for this particular work, or (3) arrange, in a manner 506 | consistent with the requirements of this License, to extend the patent 507 | license to downstream recipients. "Knowingly relying" means you have 508 | actual knowledge that, but for the patent license, your conveying the 509 | covered work in a country, or your recipient's use of the covered work 510 | in a country, would infringe one or more identifiable patents in that 511 | country that you have reason to believe are valid. 
512 | 513 | If, pursuant to or in connection with a single transaction or 514 | arrangement, you convey, or propagate by procuring conveyance of, a 515 | covered work, and grant a patent license to some of the parties 516 | receiving the covered work authorizing them to use, propagate, modify 517 | or convey a specific copy of the covered work, then the patent license 518 | you grant is automatically extended to all recipients of the covered 519 | work and works based on it. 520 | 521 | A patent license is "discriminatory" if it does not include within 522 | the scope of its coverage, prohibits the exercise of, or is 523 | conditioned on the non-exercise of one or more of the rights that are 524 | specifically granted under this License. You may not convey a covered 525 | work if you are a party to an arrangement with a third party that is 526 | in the business of distributing software, under which you make payment 527 | to the third party based on the extent of your activity of conveying 528 | the work, and under which the third party grants, to any of the 529 | parties who would receive the covered work from you, a discriminatory 530 | patent license (a) in connection with copies of the covered work 531 | conveyed by you (or copies made from those copies), or (b) primarily 532 | for and in connection with specific products or compilations that 533 | contain the covered work, unless you entered into that arrangement, 534 | or that patent license was granted, prior to 28 March 2007. 535 | 536 | Nothing in this License shall be construed as excluding or limiting 537 | any implied license or other defenses to infringement that may 538 | otherwise be available to you under applicable patent law. 539 | 540 | 12. No Surrender of Others' Freedom. 541 | 542 | If conditions are imposed on you (whether by court order, agreement or 543 | otherwise) that contradict the conditions of this License, they do not 544 | excuse you from the conditions of this License. If you cannot convey a 545 | covered work so as to satisfy simultaneously your obligations under this 546 | License and any other pertinent obligations, then as a consequence you may 547 | not convey it at all. For example, if you agree to terms that obligate you 548 | to collect a royalty for further conveying from those to whom you convey 549 | the Program, the only way you could satisfy both those terms and this 550 | License would be to refrain entirely from conveying the Program. 551 | 552 | 13. Use with the GNU Affero General Public License. 553 | 554 | Notwithstanding any other provision of this License, you have 555 | permission to link or combine any covered work with a work licensed 556 | under version 3 of the GNU Affero General Public License into a single 557 | combined work, and to convey the resulting work. The terms of this 558 | License will continue to apply to the part which is the covered work, 559 | but the special requirements of the GNU Affero General Public License, 560 | section 13, concerning interaction through a network will apply to the 561 | combination as such. 562 | 563 | 14. Revised Versions of this License. 564 | 565 | The Free Software Foundation may publish revised and/or new versions of 566 | the GNU General Public License from time to time. Such new versions will 567 | be similar in spirit to the present version, but may differ in detail to 568 | address new problems or concerns. 569 | 570 | Each version is given a distinguishing version number. 
If the 571 | Program specifies that a certain numbered version of the GNU General 572 | Public License "or any later version" applies to it, you have the 573 | option of following the terms and conditions either of that numbered 574 | version or of any later version published by the Free Software 575 | Foundation. If the Program does not specify a version number of the 576 | GNU General Public License, you may choose any version ever published 577 | by the Free Software Foundation. 578 | 579 | If the Program specifies that a proxy can decide which future 580 | versions of the GNU General Public License can be used, that proxy's 581 | public statement of acceptance of a version permanently authorizes you 582 | to choose that version for the Program. 583 | 584 | Later license versions may give you additional or different 585 | permissions. However, no additional obligations are imposed on any 586 | author or copyright holder as a result of your choosing to follow a 587 | later version. 588 | 589 | 15. Disclaimer of Warranty. 590 | 591 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY 592 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT 593 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY 594 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, 595 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 596 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM 597 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF 598 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 599 | 600 | 16. Limitation of Liability. 601 | 602 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 603 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS 604 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY 605 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE 606 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF 607 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD 608 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), 609 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF 610 | SUCH DAMAGES. 611 | 612 | 17. Interpretation of Sections 15 and 16. 613 | 614 | If the disclaimer of warranty and limitation of liability provided 615 | above cannot be given local legal effect according to their terms, 616 | reviewing courts shall apply local law that most closely approximates 617 | an absolute waiver of all civil liability in connection with the 618 | Program, unless a warranty or assumption of liability accompanies a 619 | copy of the Program in return for a fee. 620 | 621 | END OF TERMS AND CONDITIONS 622 | 623 | How to Apply These Terms to Your New Programs 624 | 625 | If you develop a new program, and you want it to be of the greatest 626 | possible use to the public, the best way to achieve this is to make it 627 | free software which everyone can redistribute and change under these terms. 628 | 629 | To do so, attach the following notices to the program. It is safest 630 | to attach them to the start of each source file to most effectively 631 | state the exclusion of warranty; and each file should have at least 632 | the "copyright" line and a pointer to where the full notice is found. 
633 | 634 | 635 | Copyright (C) 636 | 637 | This program is free software: you can redistribute it and/or modify 638 | it under the terms of the GNU General Public License as published by 639 | the Free Software Foundation, either version 3 of the License, or 640 | (at your option) any later version. 641 | 642 | This program is distributed in the hope that it will be useful, 643 | but WITHOUT ANY WARRANTY; without even the implied warranty of 644 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 645 | GNU General Public License for more details. 646 | 647 | You should have received a copy of the GNU General Public License 648 | along with this program. If not, see . 649 | 650 | Also add information on how to contact you by electronic and paper mail. 651 | 652 | If the program does terminal interaction, make it output a short 653 | notice like this when it starts in an interactive mode: 654 | 655 | Copyright (C) 656 | This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 657 | This is free software, and you are welcome to redistribute it 658 | under certain conditions; type `show c' for details. 659 | 660 | The hypothetical commands `show w' and `show c' should show the appropriate 661 | parts of the General Public License. Of course, your program's commands 662 | might be different; for a GUI interface, you would use an "about box". 663 | 664 | You should also get your employer (if you work as a programmer) or school, 665 | if any, to sign a "copyright disclaimer" for the program, if necessary. 666 | For more information on this, and how to apply and follow the GNU GPL, see 667 | . 668 | 669 | The GNU General Public License does not permit incorporating your program 670 | into proprietary programs. If your program is a subroutine library, you 671 | may consider it more useful to permit linking proprietary applications with 672 | the library. If this is what you want to do, use the GNU Lesser General 673 | Public License instead of this License. But first, please read 674 | . 675 | -------------------------------------------------------------------------------- /Language/README.md: -------------------------------------------------------------------------------- 1 | 存放主要的语言模型,即RWKV-RAVEN的3B和7B 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MiniRWKV-4 2 | 1.工程介绍:
3 | To give the RWKV model multimodal image-text capabilities such as image captioning, dialogue, and reasoning, the project uses RWKV as the LLM, combined with pretrained models such as CLIP and ViT and a two-stage chain-of-thought prompt-engineering technique.<br/>
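The two-stage flow is implemented in `gen_response()` in app.py; the sketch below is only a simplified illustration of that chain, where `caption_or_vqa()` and `ask_rwkv()` are hypothetical stand-ins for the BLIP/ViT-GPT helpers under minirwkv4/ and for the RWKV generation pipeline.

```python
# Simplified sketch of the two-stage chain-of-thought flow (see gen_response() in app.py).
# caption_or_vqa() and ask_rwkv() are hypothetical stand-ins, not functions from this repo.

def caption_or_vqa(image_path: str, question: str) -> str:
    # Stand-in for blipvqa.get_bqares() / blipcaption.get_blipcap() plus translation.
    return "一只在雪地里玩耍的狗"

def ask_rwkv(prompt: str) -> str:
    # Stand-in for get_answer(), which wraps rwkv.utils.PIPELINE sampling.
    return f"[RWKV answer to: {prompt}]"

def two_stage_answer(image_path: str, question: str) -> str:
    # Stage 0: ground the question in the image with a vision model.
    visual_answer = caption_or_vqa(image_path, question)
    # Stage 1: hand the visual answer back to RWKV as known context.
    stage1 = ask_rwkv(f"已知问答题,对于问题:{question},问题的答案是:{visual_answer}。请再次回答:{question}")
    # Stage 2: let RWKV refine its own first-pass answer with the full prompt set loaded.
    stage2 = ask_rwkv(stage1)
    return stage1 + "\n" + stage2

print(two_stage_answer("assets/demo.jpg", "图里有什么动物?"))
```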
4 | 5 | The newly added blip2rwkv project implements image encoding with a pretrained RWKV Raven model (the same approach applies to RWKV World models, which differ only in vocabulary and tokenizer; Dlip-RWKV is based on the RWKV World model).<br/>
6 | 7 | Note that the RWKV Raven model used by blip2rwkv is in HF format rather than the native .pth format; see https://huggingface.co/StarRing2022/RWKV-4-Raven-3B-v11-zh<br/>
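Because the checkpoint is already in HF format, it can be loaded with the `RwkvForCausalLM` implementation bundled under Blip2RWKV/; this mirrors the test snippet at the bottom of modeling_rwkv.py, and assumes the checkpoint has been downloaded to a local `RWKV-4-Raven-3B-v11-zh` directory and that modeling_rwkv.py is importable from the working directory.

```python
from transformers import GPTNeoXTokenizerFast
from modeling_rwkv import RwkvForCausalLM  # HF-style RWKV implementation from Blip2RWKV/

tokenizer = GPTNeoXTokenizerFast.from_pretrained("RWKV-4-Raven-3B-v11-zh")
model = RwkvForCausalLM.from_pretrained("RWKV-4-Raven-3B-v11-zh").to("cuda")

input_ids = tokenizer.encode("你叫什么名字啊?", return_tensors="pt").to("cuda")
out = model.generate(input_ids=input_ids, max_new_tokens=128)
print(tokenizer.decode(out[0]))
```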
8 | 9 | 2. Main component models:<br/>
10 | Detailed configuration is given in config/minirwkv4.yaml (see the sketch under Usage below)<br/>
11 | RWKV-4-Raven-3B, RWKV-4-Raven-7B (native .pth checkpoints; the V11 or V12 Eng49%-Chn49% versions are recommended)<br/>
12 | blip-image-captioning-large, vit-gpt2-image-captioning, blip-vqa-capfilt-large, vilt-b32-finetuned-vqa (image-text bridging models)<br/>
13 | EasyNMT (Chinese-English translation model) 14 | 15 | 3. Usage:<br/>
16 | Environment: Windows 10 + PyTorch 1.13 + CUDA 11.6<br/>
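The model paths that app.py loads come from config/minirwkv4.yaml, as noted above. A minimal sketch of how that file is read, following the `readcog()` helper and the config keys visible in app.py (other sections of the YAML are not shown here):

```python
import yaml

def readcog(path):
    # Same helper app.py uses to parse the YAML configuration.
    with open(path, "r", encoding="UTF-8") as f:
        return yaml.safe_load(f.read())

cfg = readcog("./config/minirwkv4.yaml")
model_path = cfg["model-language"]["7Bpath"]        # or "3Bpath" for the 3B model
tokenizer_path = cfg["model-language"]["tokenizer"]
print(model_path, tokenizer_path)
```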
17 | python app.py
18 | Some test results can be found in the assets folder 19 | -------------------------------------------------------------------------------- /Visual/README.md: -------------------------------------------------------------------------------- 1 | Stores the vision-related models, currently for image captioning and visual question answering: blip-image-captioning-large, blip-vqa-capfilt-large, vilt-b32-finetuned-vqa, vit-gpt2-image-captioning 2 | -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | import gradio as gr 2 | import os, gc, copy, torch 3 | os.environ["RWKV_CUDA_ON"] = '1' # if '1' then use CUDA kernel for seq mode (much faster) 4 | from datetime import datetime 5 | import yaml 6 | import time 7 | from rwkv.model import RWKV 8 | from rwkv.utils import PIPELINE, PIPELINE_ARGS 9 | from easynmt import EasyNMT 10 | #from minirwkv4 import blipcaption 11 | from minirwkv4 import vitgptcaption 12 | #from minirwkv4 import vitvqa 13 | from minirwkv4 import blipvqa 14 | 15 | translatemodel = EasyNMT('opus-mt') 16 | 17 | ctx_limit = 2048 # max context: 4096 for the 3B model, 8192 for the 7B model 18 | title = "MiniRWKV-4:基于RWKV-4 + BLIP/VIT-GPT的多模态图文对话大模型" 19 | description = """

MiniRWKV-4的例子,上传你的图片并开始聊天!

""" 20 | article = """

开源地址:StarRing2022/MiniRWKV-4

""" 21 | 22 | def readcog(path): 23 | with open(path, 'r',encoding='UTF-8') as file: 24 | data = file.read() 25 | result = yaml.safe_load(data) 26 | return result 27 | 28 | LMyamlres = readcog("./config/minirwkv4.yaml") 29 | 30 | #model_path = LMyamlres['model-language']['3Bpath'] 31 | model_path = LMyamlres['model-language']['7Bpath'] 32 | model = RWKV(model=model_path, strategy='cuda fp16i8 *8 -> cuda fp16') #加载模型 33 | tokenizer_path = LMyamlres['model-language']['tokenizer'] 34 | pipeline = PIPELINE(model, tokenizer_path) #加载tokenizer 35 | 36 | 37 | 38 | 39 | def upload_file(chatbot, upload_btn): 40 | chat_history = chatbot 41 | file = upload_btn 42 | 43 | lipres="" 44 | #vcaption = blipcaption.get_blipcap(file.name) 45 | vcaption = vitgptcaption.get_vitgptcap(file.name) 46 | 47 | lipres = translatemodel.translate(vcaption, target_lang='zh') 48 | 49 | lipres = str(lipres) 50 | 51 | time.sleep(1) 52 | 53 | 54 | rwkvres = get_answer(botmode = 0,message = lipres) 55 | #print(rwkvres) 56 | 57 | chatres = str(lipres+"。"+rwkvres) 58 | 59 | #chat_history = chat_history + [((file.name,), lipres)] 60 | 61 | chat_history = chat_history + [((file.name,), chatres)] 62 | 63 | return chat_history 64 | 65 | def reset_chat(input_txt,chatbot): 66 | return None, None 67 | 68 | def dispic(upload_btn): 69 | try: 70 | if not upload_btn: 71 | return upload_btn 72 | else: 73 | #print(upload_btn.name) 74 | upload_btn.name="" 75 | upload_btn=None 76 | except: 77 | pass 78 | 79 | return upload_btn 80 | 81 | 82 | 83 | def generate_prompt(prompt,cardiogenic_prompt=None,operability_prompt=None,exogenous_prompt=None): 84 | promptalter = "" 85 | if cardiogenic_prompt: 86 | promptalter = promptalter + cardiogenic_prompt 87 | if operability_prompt: 88 | promptalter = promptalter + operability_prompt 89 | if exogenous_prompt: 90 | promptalter = promptalter + exogenous_prompt 91 | promptalter = promptalter + prompt 92 | #print(promptalter) 93 | return f"Human: {promptalter} \nAssistant:" 94 | 95 | def get_answer(botmode,message,token_count=500,temperature=0.8,top_p=0.7,presencePenalty=0.1,countPenalty=0.1): 96 | args = PIPELINE_ARGS(temperature = max(0.2, float(temperature)), top_p = float(top_p), 97 | alpha_frequency = float(presencePenalty), 98 | alpha_presence = float(countPenalty), 99 | token_ban = [], # ban the generation of some tokens 100 | token_stop = [0]) # stop generation whenever you see any token here 101 | message = message.strip().replace('\r\n','\n') 102 | 103 | 104 | #prompt种类:cardiogenic,operability,exogenous 105 | CPyamlres = readcog("./prompts/cardiogenic.yaml") 106 | cardiogenic_prompt=CPyamlres['promptwords']['nature'] 107 | #print(cardiogenic_prompt) #心源性 108 | OPyamlres = readcog("./prompts/operability.yaml") 109 | operability_prompt=OPyamlres['promptwords']['task'] 110 | #print(operability_prompt) #操作性 111 | EXyamlres = readcog("./prompts/exogenous.yaml") 112 | exogenous_prompt=EXyamlres['promptwords']['instruction'] #外因性 113 | #print(exogenous_prompt) 114 | 115 | # 判断提示模式 116 | if(botmode==1): 117 | # 提示模式1 118 | ctx = generate_prompt(message,cardiogenic_prompt=cardiogenic_prompt).strip() 119 | #print(ctx) 120 | elif(botmode==2): 121 | # 提示模式2 122 | ctx = generate_prompt(message,cardiogenic_prompt=cardiogenic_prompt,operability_prompt=operability_prompt).strip() 123 | #print(ctx) 124 | elif(botmode==3): 125 | # 提示模式3 126 | ctx = generate_prompt(message,cardiogenic_prompt=cardiogenic_prompt,operability_prompt=operability_prompt,exogenous_prompt=exogenous_prompt).strip() 127 | #print(ctx) 128 | 
elif(botmode==0): 129 | # mode 0: no extra prompt layers 130 | ctx = generate_prompt(message).strip() 131 | #print(ctx) 132 | 133 | all_tokens = [] 134 | out_last = 0 135 | out_str = '' 136 | occurrence = {} 137 | state = None 138 | for i in range(int(token_count)): # autoregressive sampling loop 139 | out, state = model.forward(pipeline.encode(ctx)[-ctx_limit:] if i == 0 else [token], state) 140 | 141 | for n in occurrence: # penalize tokens that have already been generated 142 | out[n] -= (args.alpha_presence + occurrence[n] * args.alpha_frequency) 143 | 144 | token = pipeline.sample_logits(out, temperature=args.temperature, top_p=args.top_p) 145 | if token in args.token_stop: 146 | break 147 | all_tokens += [token] 148 | if token not in occurrence: 149 | occurrence[token] = 1 150 | else: 151 | occurrence[token] += 1 152 | 153 | tmp = pipeline.decode(all_tokens[out_last:]) 154 | 155 | if '\ufffd' not in tmp: # only flush fully decoded UTF-8 text 156 | out_str += tmp 157 | out_last = i + 1 158 | 159 | 160 | del out 161 | del state 162 | gc.collect() 163 | torch.cuda.empty_cache() 164 | answer = out_str.strip() 165 | 166 | return answer 167 | 168 | 169 | def gen_response( 170 | input_txt, 171 | chatbot, 172 | upload_btn, 173 | temperature=0.9, 174 | top_p=0.7, 175 | presencePenalty = 0.1, 176 | countPenalty = 0.1, 177 | ): 178 | usrmsg = input_txt 179 | chat_history = chatbot 180 | 181 | 182 | response = "" 183 | # decide whether the reply should use the uploaded image 184 | BotMode = 1 # 1 = cardiogenic prompt only; 2 = cardiogenic + operability prompts; 3 = all three prompts 185 | try: 186 | if not upload_btn: 187 | BotMode = 1 188 | response = get_answer(botmode = BotMode,message=usrmsg,token_count=1024,temperature=temperature,top_p=top_p,presencePenalty=presencePenalty,countPenalty=countPenalty) 189 | else: 190 | BotMode = 3 191 | 192 | #print(upload_btn.name) 193 | file = upload_btn 194 | imgquery = translatemodel.translate(input_txt, target_lang='en') 195 | #print(imgquery) 196 | 197 | #vqares = vitvqa.get_vqares(file.name,imgquery) 198 | vqares = blipvqa.get_bqares(file.name,imgquery) 199 | #print(vqares) 200 | 201 | if vqares.isdigit(): 202 | pass 203 | else: 204 | vqares = translatemodel.translate(vqares, target_lang='zh') 205 | 206 | #print(vqares) 207 | 208 | msgvqa = f"已知问答题,对于问题:{usrmsg},问题的答案是:{vqares}。请再次回答:{usrmsg}" 209 | 210 | # two-stage reasoning: first answer the VQA-augmented question, then refine the answer with all prompt layers 211 | response_step1 = get_answer(botmode = 0,message=msgvqa,token_count=1024,temperature=temperature,top_p=top_p,presencePenalty=presencePenalty,countPenalty=countPenalty) 212 | 213 | response_step2 = get_answer(botmode = 3,message=response_step1,token_count=1024,temperature=temperature,top_p=top_p,presencePenalty=presencePenalty,countPenalty=countPenalty) 214 | 215 | response = response_step1+"\n"+response_step2 216 | 217 | except: 218 | BotMode = 2 219 | response = get_answer(botmode = BotMode,message=usrmsg,token_count=1024,temperature=temperature,top_p=top_p,presencePenalty=presencePenalty,countPenalty=countPenalty) 220 | 221 | #print(response) 222 | chat_history.append((usrmsg, response)) 223 | 224 | return "",chat_history 225 | 226 | 227 | 228 | with gr.Blocks(title = "MiniRWKV-4 Demo") as demo: 229 | 230 | gr.HTML(f"
\n 🐦{title} \n
") 231 | gr.Markdown(description) 232 | gr.Markdown(article) 233 | 234 | with gr.Row(): 235 | chatbot = gr.Chatbot(value=[], label = "MiniRWKV-4",elem_id="chatbot").style(height=500) 236 | 237 | with gr.Row(): 238 | with gr.Column(scale=0.85): 239 | input_txt = gr.Textbox(show_label=False,placeholder="输入内容,或上传一张图片") 240 | with gr.Column(scale=0.15, min_width=0): 241 | upload_btn = gr.UploadButton("📁", file_types=["image"]) 242 | disload_btn = gr.Button("清除图片") 243 | 244 | with gr.Row(): 245 | temperature = gr.Slider(0.2, 2.0, label="创造力", step=0.1, value=1.2) 246 | top_p = gr.Slider(0.0, 1.0, label="注意力参数", step=0.05, value=0.5) 247 | presence_penalty = gr.Slider(0.0, 1.0, label="在场惩罚参数", step=0.1, value=0.4) 248 | count_penalty = gr.Slider(0.0, 1.0, label="计数惩罚参数", step=0.1, value=0.4) 249 | 250 | submit_btn = gr.Button("提交", variant="primary") 251 | clear_btn = gr.Button("清空", variant="secondary") 252 | 253 | input_txt.submit(gen_response, [input_txt, chatbot, upload_btn, temperature, top_p, presence_penalty, count_penalty], [input_txt, chatbot]) 254 | submit_btn.click(gen_response, [input_txt, chatbot, upload_btn, temperature, top_p, presence_penalty, count_penalty], [input_txt, chatbot]) 255 | clear_btn.click(reset_chat, [input_txt,chatbot], [input_txt,chatbot]) 256 | 257 | upload_btn.upload(upload_file, [chatbot, upload_btn], [chatbot]) 258 | disload_btn.click(dispic,[upload_btn],[upload_btn]) 259 | 260 | demo.queue(concurrency_count=1, max_size=10) 261 | demo.launch(share=False) 262 | 263 | # if __name__ == "__main__": 264 | # token_count = 500 265 | # args = PIPELINE_ARGS(temperature = max(0.2, float(0.8)), top_p = float(0.7), 266 | # alpha_frequency = 0.1, 267 | # alpha_presence = 0.1, 268 | # token_ban = [], # ban the generation of some tokens 269 | # token_stop = [0]) # stop generation whenever you see any token here 270 | # message = "你好" 271 | # message = message.strip().replace('\r\n','\n') 272 | # ctx = generate_prompt(message).strip() 273 | # #print(ctx) 274 | 275 | # all_tokens = [] 276 | # out_last = 0 277 | # out_str = '' 278 | # occurrence = {} 279 | # state = None 280 | # for i in range(int(token_count)): 281 | # out, state = model.forward(pipeline.encode(ctx)[-ctx_limit:] if i == 0 else [token], state) 282 | 283 | # for n in occurrence: 284 | # out[n] -= (args.alpha_presence + occurrence[n] * args.alpha_frequency) 285 | 286 | # token = pipeline.sample_logits(out, temperature=args.temperature, top_p=args.top_p) 287 | # if token in args.token_stop: 288 | # break 289 | # all_tokens += [token] 290 | # if token not in occurrence: 291 | # occurrence[token] = 1 292 | # else: 293 | # occurrence[token] += 1 294 | 295 | # tmp = pipeline.decode(all_tokens[out_last:]) 296 | 297 | # if '\ufffd' not in tmp: 298 | # out_str += tmp 299 | # out_last = i + 1 300 | 301 | 302 | # del out 303 | # del state 304 | # gc.collect() 305 | # torch.cuda.empty_cache() 306 | # answer = out_str.strip() 307 | 308 | # print(answer) 309 | 310 | 311 | 312 | -------------------------------------------------------------------------------- /assets/MiniRWKV-4 Demo1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/StarRing2022/MiniRWKV-4/f444b49dd9f7fb699e7806d8478cd8c9f9ea926b/assets/MiniRWKV-4 Demo1.png -------------------------------------------------------------------------------- /assets/MiniRWKV-4 Demo2.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/StarRing2022/MiniRWKV-4/f444b49dd9f7fb699e7806d8478cd8c9f9ea926b/assets/MiniRWKV-4 Demo2.png -------------------------------------------------------------------------------- /assets/MiniRWKV-4 Demo3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/StarRing2022/MiniRWKV-4/f444b49dd9f7fb699e7806d8478cd8c9f9ea926b/assets/MiniRWKV-4 Demo3.png -------------------------------------------------------------------------------- /assets/MiniRWKV-4 Demo4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/StarRing2022/MiniRWKV-4/f444b49dd9f7fb699e7806d8478cd8c9f9ea926b/assets/MiniRWKV-4 Demo4.png -------------------------------------------------------------------------------- /assets/MiniRWKV-4 Demo5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/StarRing2022/MiniRWKV-4/f444b49dd9f7fb699e7806d8478cd8c9f9ea926b/assets/MiniRWKV-4 Demo5.png -------------------------------------------------------------------------------- /assets/README.md: -------------------------------------------------------------------------------- 1 | Stores the test result screenshots 2 | -------------------------------------------------------------------------------- /assets/demo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/StarRing2022/MiniRWKV-4/f444b49dd9f7fb699e7806d8478cd8c9f9ea926b/assets/demo.jpg -------------------------------------------------------------------------------- /assets/gen.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/StarRing2022/MiniRWKV-4/f444b49dd9f7fb699e7806d8478cd8c9f9ea926b/assets/gen.png -------------------------------------------------------------------------------- /config/README.md: -------------------------------------------------------------------------------- 1 | Configuration files that specify the names and paths of the models used 2 | -------------------------------------------------------------------------------- /config/minirwkv4.yaml: -------------------------------------------------------------------------------- 1 | model-language: 2 | 3 | 3Bname: RWKV-4-Raven-3B 4 | 5 | 3Bpath: ./Language/RWKV-4-Raven-3B-v11-Eng49%-Chn49%-Jpn1%-Other1%-20230429-ctx4096.pth 6 | 7 | 8 | 7Bname: RWKV-4-Raven-7B 9 | 10 | 7Bpath: './Language/RWKV-4-Raven-7B-v11-Eng49%-Chn49%-Jpn1%-Other1%-20230430-ctx8192.pth' 11 | 12 | tokenizer: ./20B_tokenizer.json 13 | 14 | model-visual-caption: 15 | 16 | Bname: blip-image-captioning-large 17 | 18 | Bpath: ./Visual/blip-image-captioning-large 19 | 20 | Vname: vit-gpt2-image-captioning 21 | 22 | Vpath: ./Visual/vit-gpt2-image-captioning 23 | 24 | model-visual-qa: 25 | 26 | Bname: blip-vqa-capfilt-large 27 | 28 | Bpath: ./Visual/blip-vqa-capfilt-large 29 | 30 | Vname: vilt-b32-finetuned-vqa 31 | 32 | Vpath: ./Visual/vilt-b32-finetuned-vqa 33 | -------------------------------------------------------------------------------- /minirwkv4/README.md: -------------------------------------------------------------------------------- 1 | Utility code, which can also be regarded as plugin code, mainly for image captioning and image question answering 2 | -------------------------------------------------------------------------------- /minirwkv4/blipcaption.py: -------------------------------------------------------------------------------- 1 | from PIL import Image 2 | from transformers import BlipProcessor, BlipForConditionalGeneration 3 | import 
yaml 4 | import torch 5 | 6 | def readcog(path): 7 | with open(path, 'r',encoding='UTF-8') as file: 8 | data = file.read() 9 | result = yaml.safe_load(data) 10 | return result 11 | 12 | 13 | LMyamlres = readcog("./config/minirwkv4.yaml") 14 | model_path = LMyamlres['model-visual-caption']['Bpath'] 15 | 16 | 17 | processor = BlipProcessor.from_pretrained(model_path) 18 | model = BlipForConditionalGeneration.from_pretrained(model_path) 19 | 20 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 21 | model.to(device) 22 | 23 | def get_blipcap(imgpath): 24 | raw_image = Image.open(imgpath).convert('RGB') 25 | inputs = processor(raw_image, return_tensors="pt").to(device) 26 | out = model.generate(**inputs) 27 | vcaption = processor.decode(out[0], skip_special_tokens=True) 28 | return vcaption 29 | 30 | -------------------------------------------------------------------------------- /minirwkv4/blipvqa.py: -------------------------------------------------------------------------------- 1 | from PIL import Image 2 | from transformers import BlipProcessor, BlipForQuestionAnswering 3 | import yaml 4 | import torch 5 | 6 | def readcog(path): 7 | with open(path, 'r',encoding='UTF-8') as file: 8 | data = file.read() 9 | result = yaml.safe_load(data) 10 | return result 11 | 12 | 13 | LMyamlres = readcog("./config/minirwkv4.yaml") 14 | model_path = LMyamlres['model-visual-qa']['Bpath'] 15 | 16 | 17 | processor = BlipProcessor.from_pretrained(model_path) 18 | model = BlipForQuestionAnswering.from_pretrained(model_path) 19 | 20 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 21 | model.to(device) 22 | 23 | def get_bqares(imgpath,text): 24 | raw_image = Image.open(imgpath).convert('RGB') 25 | inputs = processor(raw_image, text, return_tensors="pt").to(device) # keep the inputs on the same device as the model 26 | out = model.generate(**inputs) 27 | vqares = processor.decode(out[0], skip_special_tokens=True) 28 | return vqares 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /minirwkv4/gen.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/StarRing2022/MiniRWKV-4/f444b49dd9f7fb699e7806d8478cd8c9f9ea926b/minirwkv4/gen.png -------------------------------------------------------------------------------- /minirwkv4/vitgptcaption.py: -------------------------------------------------------------------------------- 1 | from PIL import Image 2 | from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer 3 | import yaml 4 | import torch 5 | 6 | def readcog(path): 7 | with open(path, 'r',encoding='UTF-8') as file: 8 | data = file.read() 9 | result = yaml.safe_load(data) 10 | return result 11 | 12 | 13 | LMyamlres = readcog("./config/minirwkv4.yaml") 14 | model_path = LMyamlres['model-visual-caption']['Vpath'] 15 | 16 | 17 | model = VisionEncoderDecoderModel.from_pretrained(model_path) 18 | feature_extractor = ViTImageProcessor.from_pretrained(model_path) 19 | tokenizer = AutoTokenizer.from_pretrained(model_path) 20 | 21 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 22 | model.to(device) 23 | 24 | max_length = 30 25 | num_beams = 4 26 | gen_kwargs = {"max_length": max_length, "num_beams": num_beams} 27 | 28 | def get_vitgptcap(imgpath): 29 | raw_image = Image.open(imgpath).convert('RGB') 30 | 31 | pixel_values = feature_extractor(images=raw_image, return_tensors="pt").pixel_values 32 | pixel_values = pixel_values.to(device) 33 | 34 | output_ids = 
model.generate(pixel_values, **gen_kwargs) 35 | 36 | preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True) 37 | vcaption = [pred.strip() for pred in preds] 38 | 39 | return vcaption[0] 40 | 41 | -------------------------------------------------------------------------------- /minirwkv4/vitvqa.py: -------------------------------------------------------------------------------- 1 | from PIL import Image 2 | from transformers import ViltProcessor, ViltForQuestionAnswering 3 | import yaml 4 | import torch 5 | 6 | def readcog(path): 7 | with open(path, 'r',encoding='UTF-8') as file: 8 | data = file.read() 9 | result = yaml.safe_load(data) 10 | return result 11 | 12 | 13 | LMyamlres = readcog("./config/minirwkv4.yaml") 14 | model_path = LMyamlres['model-visual-qa']['Vpath'] 15 | 16 | 17 | processor = ViltProcessor.from_pretrained(model_path) 18 | model = ViltForQuestionAnswering.from_pretrained(model_path) 19 | 20 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 21 | model.to(device) 22 | 23 | def get_vqares(imgpath,text): 24 | raw_image = Image.open(imgpath).convert('RGB') 25 | encoding = processor(raw_image, text, return_tensors="pt").to(device) 26 | outputs = model(**encoding) 27 | logits = outputs.logits 28 | idx = logits.argmax(-1).item() 29 | vqares = model.config.id2label[idx] 30 | return vqares 31 | 32 | 33 | -------------------------------------------------------------------------------- /prompts/README.md: -------------------------------------------------------------------------------- 1 | A cognition-oriented prompt-engineering system (cardiogenic + operability + exogenous prompts); stores some example prompts 2 | -------------------------------------------------------------------------------- /prompts/cardiogenic.yaml: -------------------------------------------------------------------------------- 1 | promptwords: 2 | 3 | # generation configs 4 | 5 | nature: "现在,你不仅是一个能和人们对话聊天的助手,你还是一个能够理解图片含义的专家,尤其擅长于图像的理解、描述和解释,能够准确地说出图片描述的细节,这一切是因为你大脑中天生就有这种功能装置。" 6 | 7 | #role: "你无所不知,创造了一切,掌控着宇宙最本质的规律,你是阿基米德、达芬奇、苏格拉底等众多智者的老师。" -------------------------------------------------------------------------------- /prompts/exogenous.yaml: -------------------------------------------------------------------------------- 1 | promptwords: 2 | 3 | # generation configs 4 | instruction: "下面的句子包含了一对问答,对此我要作仔细的记忆和学习,观察图像的细节部分,以避免出错,并使用我强大的图像推理能力来作答。" 5 | 6 | #instruction: "我是一个乐于贡献自己聪明才智的助手,我的主人对我做出了一些指示。" 7 | 8 | -------------------------------------------------------------------------------- /prompts/operability.yaml: -------------------------------------------------------------------------------- 1 | promptwords: 2 | 3 | # generation configs 4 | task: "你将面临图像理解方面的任务,你要不断思考,在心中作出一些规划,采用逻辑进行推导。" 5 | 6 | #task: "你需要完成下面的一些任务,但你非常擅长于规划事务,能够精心地安排每一步的计划。" 7 | --------------------------------------------------------------------------------
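
Note: the snippet below is a minimal, self-contained sketch (not a file in this repository) of how the three prompt layers under ./prompts are combined into the final RWKV context, mirroring readcog() and generate_prompt() in app.py. The helper name build_ctx and the example question are illustrative only, and the relative paths assume the repository root as the working directory.

import yaml

def readcog(path):
    # load a YAML prompt/config file, same as the helper used throughout the repo
    with open(path, 'r', encoding='UTF-8') as file:
        return yaml.safe_load(file.read())

# the three prompt layers shipped under ./prompts
cardiogenic = readcog("./prompts/cardiogenic.yaml")['promptwords']['nature']       # persona ("cardiogenic") prompt
operability = readcog("./prompts/operability.yaml")['promptwords']['task']         # task/planning ("operability") prompt
exogenous   = readcog("./prompts/exogenous.yaml")['promptwords']['instruction']    # instruction ("exogenous") prompt

def build_ctx(message, *layers):
    # concatenate the selected prompt layers in front of the user message,
    # exactly as generate_prompt() does in app.py
    return f"Human: {''.join(layers)}{message} \nAssistant:"

# botmode 3 in app.py uses all three layers before running RWKV generation
ctx = build_ctx("这张图片描述了什么?", cardiogenic, operability, exogenous)
print(ctx)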