├── moondream
    ├── __init__.py
    ├── __pycache__
    │   ├── __init__.cpython-311.pyc
    │   ├── text_model.cpython-311.pyc
    │   └── vision_encoder.cpython-311.pyc
    ├── phi
    │   └── __pycache__
    │   │   ├── modeling_phi.cpython-311.pyc
    │   │   └── configuration_phi.cpython-311.pyc
    ├── util.py
    ├── moondream.py
    ├── configuration_moondream.py
    ├── vision_encoder.py
    └── modeling_phi.py
├── requirements.txt
├── README.md
└── vision.py


/moondream/__init__.py:
--------------------------------------------------------------------------------
1 | from .util import detect_device
2 | from .moondream import Moondream
3 | 


--------------------------------------------------------------------------------
/moondream/__pycache__/__init__.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Doriandarko/Moondream2-streamlit/HEAD/moondream/__pycache__/__init__.cpython-311.pyc


--------------------------------------------------------------------------------
/moondream/__pycache__/text_model.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Doriandarko/Moondream2-streamlit/HEAD/moondream/__pycache__/text_model.cpython-311.pyc


--------------------------------------------------------------------------------
/moondream/__pycache__/vision_encoder.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Doriandarko/Moondream2-streamlit/HEAD/moondream/__pycache__/vision_encoder.cpython-311.pyc


--------------------------------------------------------------------------------
/moondream/phi/__pycache__/modeling_phi.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Doriandarko/Moondream2-streamlit/HEAD/moondream/phi/__pycache__/modeling_phi.cpython-311.pyc


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | accelerate==0.25.0
2 | huggingface-hub==0.20.1
3 | Pillow==10.1.0
4 | torch==2.1.2
5 | torchvision==0.16.2
6 | transformers==4.36.2
7 | einops==0.7.0
8 | streamlit


--------------------------------------------------------------------------------
/moondream/phi/__pycache__/configuration_phi.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Doriandarko/Moondream2-streamlit/HEAD/moondream/phi/__pycache__/configuration_phi.cpython-311.pyc


--------------------------------------------------------------------------------
/moondream/util.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | 
 3 | 
 4 | def detect_device():
 5 |     """
 6 |     Detects the appropriate device to run on, and return the device and dtype.
 7 |     """
 8 |     if torch.cuda.is_available():
 9 |         return torch.device("cuda"), torch.float16
10 |     elif torch.backends.mps.is_available():
11 |         return torch.device("mps"), torch.float16
12 |     else:
13 |         return torch.device("cpu"), torch.float32
14 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Moondream2 Vision Model Streamlit App
 2 | 
 3 | This is a Streamlit app that uses the Moondream2 Vision Model to generate text based on an uploaded image and a user-provided prompt.
 4 | 
 5 | ## Features
 6 | 
 7 | - Upload an image in PNG or JPEG format.
 8 | - Enter a prompt to guide the text generation.
 9 | - Generate text based on the uploaded image and prompt.
10 | 
11 | ## How to Run
12 | 
13 | 1. Install the required Python packages:
14 | 
15 | ```bash
16 | pip install -r requirements.txt
17 | ```
18 | 
19 | 
20 | 2. Run the Streamlit app:
21 | 
22 | ```bash
23 | streamlit run vision.py
24 | ```
25 | 
26 | 3. Open the app in your web browser at `http://localhost:8501`.
27 | 
28 | ## Usage
29 | 
30 | 1. Upload an image using the file uploader.
31 | 2. Enter a prompt in the text input field.
32 | 3. Click the "Generate" button to generate text based on the image and prompt.
33 | 
34 | ## About the Model
35 | 
36 | The Moondream2 Vision Model is a small but powerful vision model that outperforms models twice its size. It was created by [@vikhyatk](https://twitter.com/vikhyatk).
37 | 
38 | ## License
39 | 
40 | This project is open source under the MIT license.
41 | 


--------------------------------------------------------------------------------
/vision.py:
--------------------------------------------------------------------------------
 1 | import streamlit as st
 2 | from PIL import Image
 3 | from transformers import AutoModelForCausalLM, AutoTokenizer
 4 | import re
 5 | import time
 6 | 
 7 | @st.cache_resource
 8 | def load_model():
 9 |     # Load the model and tokenizer
10 |     model_id = "vikhyatk/moondream2"
11 |     revision = "2024-03-05"
12 |     model = AutoModelForCausalLM.from_pretrained(
13 |         model_id, trust_remote_code=True, revision=revision
14 |     )
15 |     tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)
16 |     return model, tokenizer
17 | 
# Obtain the model/tokenizer pair used by every request below.
model, tokenizer = load_model()

# Page header.
st.title("🌝 Moondream2 Vision Model")
st.write("An enhanced vision model that outperforms its predecessor.")
st.markdown("Model created by [@vikhyatk](https://twitter.com/vikhyatk). App by [@skirano](https://twitter.com/skirano)")

# Seed the session-state slots on the first run only, so that values stored
# by an earlier rerun are not overwritten.
for slot, initial in (("uploaded_image", None), ("prompt", "")):
    if slot not in st.session_state:
        st.session_state[slot] = initial

# Image picker; a new upload replaces the remembered one, while an empty
# uploader leaves the remembered image untouched.
uploaded_image = st.file_uploader("Upload an image", type=["png", "jpg", "jpeg"])
if uploaded_image is not None:
    st.session_state["uploaded_image"] = uploaded_image

# Preview of whichever image is currently remembered.
if st.session_state["uploaded_image"]:
    image = Image.open(st.session_state["uploaded_image"])
    st.image(image, caption="Uploaded Image.", use_column_width=True)

# Question box, pre-filled with the remembered prompt and written back
# after every run.
prompt = st.text_input("Question", value=st.session_state["prompt"])
st.session_state["prompt"] = prompt
46 | # Function to generate text from the image and prompt
def generate_text(image, prompt):
    """Answer `prompt` about `image` and render the answer on the page.

    Args:
        image: Image handed to ``model.encode_image``.
        prompt: Question passed to ``model.answer_question``.

    The answer is written as markdown into a placeholder that is reserved
    before the model runs; nothing is returned.
    """
    # Reserve the output slot first so its position on the page is fixed.
    output_slot = st.empty()

    image_embedding = model.encode_image(image)
    answer = model.answer_question(image_embedding, prompt, tokenizer)

    output_slot.markdown(answer)
59 | 
60 | # Button to trigger text generation
if st.button("Generate"):
    stored_image = st.session_state["uploaded_image"]
    question = st.session_state["prompt"]

    if stored_image is None or not question:
        # Both an image and a non-empty question are required.
        st.warning("Please upload an image and enter a prompt.")
    else:
        # Re-open the remembered upload and run the model on it.
        generate_text(Image.open(stored_image), question)


--------------------------------------------------------------------------------
/moondream/moondream.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | from .vision_encoder import VisionEncoder
  3 | from .configuration_moondream import MoondreamConfig
  4 | from transformers import PreTrainedModel
  5 | import re
  6 | 
  7 | from .modeling_phi import PhiForCausalLM
  8 | from .configuration_moondream import PhiConfig
  9 | 
 10 | class Moondream(PreTrainedModel):
 11 |     config_class = MoondreamConfig
 12 | 
 13 |     def __init__(self, config):
 14 |         super().__init__(config)
 15 |         self.vision_encoder = VisionEncoder()
 16 | 
 17 |         if type(config.phi_config) == dict:
 18 |             phi_config = PhiConfig(**config.phi_config)
 19 |         else:
 20 |             phi_config = config.phi_config
 21 |         self.text_model = PhiForCausalLM(phi_config)
 22 | 
 23 |     @property
 24 |     def device(self):
 25 |         return self.text_model.device
 26 | 
 27 |     def encode_image(self, image):
 28 |         return self.vision_encoder(image)
 29 | 
 30 |     def input_embeds(self, prompt, image_embeds, tokenizer):
 31 |         def _tokenize(txt):
 32 |             return tokenizer(
 33 |                 txt, return_tensors="pt", add_special_tokens=False
 34 |             ).input_ids.to(self.device)
 35 | 
 36 |         text_emb = self.text_model.get_input_embeddings()
 37 | 
 38 |         # Add BOS token
 39 |         embeds = []
 40 |         embeds.append(
 41 |             text_emb((torch.tensor([[tokenizer.bos_token_id]], device=self.device)))
 42 |         )
 43 | 
 44 |         if "<image>" not in prompt:
 45 |             embeds.append(text_emb(_tokenize(prompt)))
 46 |         else:
 47 |             assert prompt.count("<image>") == 1
 48 |             before, after = prompt.split("<image>")
 49 |             embeds.append(text_emb(_tokenize(f"{before}<image>")))
 50 |             embeds.append(image_embeds.to(self.device))
 51 |             embeds.append(text_emb(_tokenize(f"</image>{after}")))
 52 | 
 53 |         return torch.cat(embeds, dim=1)
 54 | 
 55 |     def generate(
 56 |         self,
 57 |         image_embeds,
 58 |         prompt,
 59 |         tokenizer,
 60 |         eos_text="<END>",
 61 |         max_new_tokens=128,
 62 |         **kwargs,
 63 |     ):
 64 |         eos_tokens = tokenizer(eos_text, add_special_tokens=False)[0].ids
 65 | 
 66 |         generate_config = {
 67 |             "eos_token_id": eos_tokens,
 68 |             "bos_token_id": tokenizer.bos_token_id,
 69 |             "pad_token_id": tokenizer.eos_token_id,
 70 |             "max_new_tokens": max_new_tokens,
 71 |             **kwargs,
 72 |         }
 73 | 
 74 |         with torch.no_grad():
 75 |             inputs_embeds = self.input_embeds(prompt, image_embeds, tokenizer)
 76 |             output_ids = self.text_model.generate(
 77 |                 inputs_embeds=inputs_embeds, **generate_config
 78 |             )
 79 | 
 80 |         return tokenizer.batch_decode(output_ids, skip_special_tokens=True)
 81 | 
 82 |     def answer_question(
 83 |         self,
 84 |         image_embeds,
 85 |         question,
 86 |         tokenizer,
 87 |         chat_history="",
 88 |         result_queue=None,
 89 |         **kwargs,
 90 |     ):
 91 |         prompt = f"<image>\n\n{chat_history}Question: {question}\n\nAnswer: "
 92 |         answer = self.generate(
 93 |             image_embeds,
 94 |             prompt,
 95 |             eos_text="<END>",
 96 |             tokenizer=tokenizer,
 97 |             max_new_tokens=256,
 98 |             **kwargs,
 99 |         )[0]
100 |         cleaned_answer = re.sub("<$|<END$", "", answer).strip()
101 | 
102 |         # Use the result_queue to pass the result if it is provided
103 |         if result_queue:
104 |             result_queue.put(cleaned_answer)
105 |         else:
106 |             return cleaned_answer
107 | 


--------------------------------------------------------------------------------
/moondream/configuration_moondream.py:
--------------------------------------------------------------------------------
  1 | from transformers import PretrainedConfig
  2 | 
  3 | from typing import Optional
  4 | import math
  5 | 
  6 | 
  7 | class PhiConfig(PretrainedConfig):
  8 |     model_type = "phi"
  9 |     keys_to_ignore_at_inference = ["past_key_values"]
 10 | 
 11 |     def __init__(
 12 |         self,
 13 |         vocab_size=51200,
 14 |         hidden_size=2048,
 15 |         intermediate_size=8192,
 16 |         num_hidden_layers=24,
 17 |         num_attention_heads=32,
 18 |         num_key_value_heads=None,
 19 |         resid_pdrop=0.0,
 20 |         embd_pdrop=0.0,
 21 |         attention_dropout=0.0,
 22 |         hidden_act="gelu_new",
 23 |         max_position_embeddings=2048,
 24 |         initializer_range=0.02,
 25 |         layer_norm_eps=1e-5,
 26 |         use_cache=True,
 27 |         tie_word_embeddings=False,
 28 |         rope_theta=10000.0,
 29 |         rope_scaling=None,
 30 |         partial_rotary_factor=0.5,
 31 |         qk_layernorm=False,
 32 |         bos_token_id=1,
 33 |         eos_token_id=2,
 34 |         **kwargs,
 35 |     ):
 36 |         self.vocab_size = vocab_size
 37 |         self.hidden_size = hidden_size
 38 |         self.intermediate_size = intermediate_size
 39 |         self.num_hidden_layers = num_hidden_layers
 40 |         self.num_attention_heads = num_attention_heads
 41 | 
 42 |         if num_key_value_heads is None:
 43 |             num_key_value_heads = num_attention_heads
 44 | 
 45 |         self.num_key_value_heads = num_key_value_heads
 46 |         self.resid_pdrop = resid_pdrop
 47 |         self.embd_pdrop = embd_pdrop
 48 |         self.attention_dropout = attention_dropout
 49 |         self.hidden_act = hidden_act
 50 |         self.max_position_embeddings = max_position_embeddings
 51 |         self.initializer_range = initializer_range
 52 |         self.layer_norm_eps = layer_norm_eps
 53 |         self.use_cache = use_cache
 54 |         self.rope_theta = rope_theta
 55 |         self.rope_scaling = rope_scaling
 56 |         self.partial_rotary_factor = partial_rotary_factor
 57 |         self.qk_layernorm = qk_layernorm
 58 |         self._rope_scaling_validation()
 59 | 
 60 |         super().__init__(
 61 |             bos_token_id=bos_token_id,
 62 |             eos_token_id=eos_token_id,
 63 |             tie_word_embeddings=tie_word_embeddings,
 64 |             **kwargs,
 65 |         )
 66 | 
 67 |     # Copied from transformers.models.llama.configuration_llama.LlamaConfig._rope_scaling_validation
 68 |     def _rope_scaling_validation(self):
 69 |         """
 70 |         Validate the `rope_scaling` configuration.
 71 |         """
 72 |         if self.rope_scaling is None:
 73 |             return
 74 | 
 75 |         if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
 76 |             raise ValueError(
 77 |                 "`rope_scaling` must be a dictionary with with two fields, `type` and `factor`, "
 78 |                 f"got {self.rope_scaling}"
 79 |             )
 80 |         rope_scaling_type = self.rope_scaling.get("type", None)
 81 |         rope_scaling_factor = self.rope_scaling.get("factor", None)
 82 |         if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]:
 83 |             raise ValueError(
 84 |                 f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
 85 |             )
 86 |         if (
 87 |             rope_scaling_factor is None
 88 |             or not isinstance(rope_scaling_factor, float)
 89 |             or rope_scaling_factor <= 1.0
 90 |         ):
 91 |             raise ValueError(
 92 |                 f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}"
 93 |             )
 94 | 
 95 | 
 96 | class MoondreamConfig(PretrainedConfig):
 97 |     model_type = "moondream1"
 98 | 
 99 |     def __init__(self, **kwargs):
100 |         self.phi_config = PhiConfig(**kwargs)
101 |         super().__init__(**kwargs)
102 | 


--------------------------------------------------------------------------------
/moondream/vision_encoder.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | from torch import nn
  3 | from PIL import Image
  4 | from einops import rearrange
  5 | from torchvision.transforms.v2 import (
  6 |     Compose,
  7 |     Resize,
  8 |     InterpolationMode,
  9 |     ToImage,
 10 |     ToDtype,
 11 |     Normalize,
 12 | )
 13 | import timm
 14 | 
 15 | 
 16 | class VisualHolder(nn.Module):
 17 |     def __init__(self, model):
 18 |         super().__init__()
 19 |         self.visual = model
 20 | 
 21 |     def forward(self, x):
 22 |         return self.visual(x)
 23 | 
 24 | 
 25 | class ModelHolder(nn.Module):
 26 |     def __init__(self, model):
 27 |         super().__init__()
 28 |         self.model = model
 29 | 
 30 |     def forward(self, x):
 31 |         return self.model(x)
 32 | 
 33 | 
 34 | class LinearPatchEmbedding(nn.Module):
 35 |     def __init__(self, conv):
 36 |         super().__init__()
 37 |         self.linear = nn.Linear(588, 1152)
 38 |         self.linear.weight.data = conv.weight.data.view(1152, -1)
 39 |         if conv.bias is not None:
 40 |             self.linear.bias.data = conv.bias.data
 41 | 
 42 |     def forward(self, x):
 43 |         return self.linear(x)
 44 | 
 45 | 
 46 | class MLP(nn.Module):
 47 |     def __init__(
 48 |         self,
 49 |         in_features: int,
 50 |         hidden_features: int = None,
 51 |         out_features: int = None,
 52 |         act_layer: nn.Module = nn.GELU,
 53 |     ) -> None:
 54 |         super().__init__()
 55 |         out_features = out_features or in_features
 56 |         hidden_features = hidden_features or in_features
 57 |         self.fc1 = nn.Linear(in_features, hidden_features)
 58 |         self.act = act_layer()
 59 |         self.fc2 = nn.Linear(hidden_features, out_features)
 60 | 
 61 |         torch.nn.init.kaiming_normal_(
 62 |             self.fc1.weight, mode="fan_in", nonlinearity="relu"
 63 |         )
 64 |         torch.nn.init.kaiming_normal_(
 65 |             self.fc2.weight, mode="fan_in", nonlinearity="relu"
 66 |         )
 67 | 
 68 |     def forward(self, x: torch.Tensor) -> torch.Tensor:
 69 |         x = self.fc1(x)
 70 |         x = self.act(x)
 71 |         x = self.fc2(x)
 72 |         return x
 73 | 
 74 | 
 75 | class VisionProjection(nn.Module):
 76 |     def __init__(self):
 77 |         super().__init__()
 78 | 
 79 |         image_embedding_dim = 1152
 80 |         model_dim = 2048
 81 |         hidden_dim = model_dim * 4
 82 | 
 83 |         self.mlp = MLP(image_embedding_dim, hidden_dim, model_dim)
 84 | 
 85 |     @property
 86 |     def device(self):
 87 |         return self.mlp.fc1.weight.device
 88 | 
 89 |     def forward(self, x):
 90 |         return self.mlp(x)
 91 | 
 92 | 
 93 | class VisionEncoder(nn.Module):
 94 |     def __init__(self) -> None:
 95 |         super().__init__()
 96 | 
 97 |         self.encoder = ModelHolder(
 98 |             VisualHolder(timm.create_model("vit_so400m_patch14_siglip_384"))
 99 |         )
100 |         self.encoder.model.visual.patch_embed = LinearPatchEmbedding(
101 |             self.encoder.model.visual.patch_embed.proj
102 |         )
103 |         self.encoder.model.visual.attn_pool = nn.Identity()
104 | 
105 |         self.projection = VisionProjection()
106 | 
107 |         self.preprocess = Compose(
108 |             [
109 |                 Resize(size=(378, 378), interpolation=InterpolationMode.BICUBIC),
110 |                 ToImage(),
111 |                 ToDtype(torch.float32, scale=True),
112 |                 Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
113 |             ]
114 |         )
115 | 
116 |     @property
117 |     def device(self):
118 |         return self.projection.mlp.fc1.weight.device
119 | 
120 |     @property
121 |     def dtype(self):
122 |         return self.projection.mlp.fc1.weight.dtype
123 | 
124 |     def __call__(self, image: Image) -> torch.Tensor:
125 |         with torch.no_grad():
126 |             x = (
127 |                 self.preprocess(image.convert("RGB"))
128 |                 .unsqueeze(0)
129 |                 .to(self.device, dtype=self.dtype)
130 |             )
131 |             x = rearrange(x, "b c (h p1) (w p2) -> b (h w) (c p1 p2)", p1=14, p2=14)
132 | 
133 |             x = self.encoder(x)
134 |             x = self.projection(x)
135 | 
136 |             return x
137 | 


--------------------------------------------------------------------------------
/moondream/modeling_phi.py:
--------------------------------------------------------------------------------
   1 | # coding=utf-8
   2 | # Copyright 2023 Microsoft and the HuggingFace Inc. team. All rights reserved.
   3 | #
   4 | # Licensed under the Apache License, Version 2.0 (the "License");
   5 | # you may not use this file except in compliance with the License.
   6 | # You may obtain a copy of the License at
   7 | #
   8 | #     http://www.apache.org/licenses/LICENSE-2.0
   9 | #
  10 | # Unless required by applicable law or agreed to in writing, software
  11 | # distributed under the License is distributed on an "AS IS" BASIS,
  12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13 | # See the License for the specific language governing permissions and
  14 | # limitations under the License.
  15 | 
  16 | """ PyTorch Phi model."""
  17 | 
  18 | 
  19 | import math
  20 | from typing import List, Optional, Tuple, Union
  21 | 
  22 | import torch
  23 | import torch.nn.functional as F
  24 | import torch.utils.checkpoint
  25 | from torch import nn
  26 | from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
  27 | 
  28 | from transformers.activations import ACT2FN
  29 | from transformers.cache_utils import Cache, DynamicCache
  30 | from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
  31 | from transformers.modeling_outputs import (
  32 |     BaseModelOutputWithPast,
  33 |     CausalLMOutputWithPast,
  34 |     SequenceClassifierOutputWithPast,
  35 | )
  36 | from transformers.modeling_utils import PreTrainedModel
  37 | from transformers.utils import (
  38 |     is_flash_attn_2_available,
  39 |     is_flash_attn_greater_or_equal_2_10,
  40 |     logging,
  41 | )
  42 | from .configuration_moondream import PhiConfig
  43 | 
  44 | 
  45 | try:  # noqa: SIM105
  46 |     if is_flash_attn_2_available():
  47 |         from flash_attn import flash_attn_func, flash_attn_varlen_func
  48 |         from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input
  49 | except ImportError:
  50 |     # Workaround for https://github.com/huggingface/transformers/issues/28459,
  51 |     # don't move to contextlib.suppress(ImportError)
  52 |     pass
  53 | 
  54 | 
  55 | logger = logging.get_logger(__name__)
  56 | 
  57 | 
  58 | # Copied from transformers.models.llama.modeling_llama._get_unpad_data
  59 | def _get_unpad_data(attention_mask):
  60 |     seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
  61 |     indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
  62 |     max_seqlen_in_batch = seqlens_in_batch.max().item()
  63 |     cu_seqlens = F.pad(
  64 |         torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0)
  65 |     )
  66 |     return (
  67 |         indices,
  68 |         cu_seqlens,
  69 |         max_seqlen_in_batch,
  70 |     )
  71 | 
  72 | 
  73 | # Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Phi
  74 | class PhiRotaryEmbedding(nn.Module):
  75 |     def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
  76 |         super().__init__()
  77 | 
  78 |         self.dim = dim
  79 |         self.max_position_embeddings = max_position_embeddings
  80 |         self.base = base
  81 |         inv_freq = 1.0 / (
  82 |             self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)
  83 |         )
  84 |         self.register_buffer("inv_freq", inv_freq, persistent=False)
  85 | 
  86 |         # Build here to make `torch.jit.trace` work.
  87 |         self._set_cos_sin_cache(
  88 |             seq_len=max_position_embeddings,
  89 |             device=self.inv_freq.device,
  90 |             dtype=torch.get_default_dtype(),
  91 |         )
  92 | 
  93 |     def _set_cos_sin_cache(self, seq_len, device, dtype):
  94 |         self.max_seq_len_cached = seq_len
  95 |         t = torch.arange(
  96 |             self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype
  97 |         )
  98 | 
  99 |         freqs = torch.outer(t, self.inv_freq)
 100 |         # Different from paper, but it uses a different permutation in order to obtain the same calculation
 101 |         emb = torch.cat((freqs, freqs), dim=-1)
 102 |         self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
 103 |         self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
 104 | 
 105 |     def forward(self, x, seq_len=None):
 106 |         # x: [bs, num_attention_heads, seq_len, head_size]
 107 |         if seq_len > self.max_seq_len_cached:
 108 |             self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
 109 | 
 110 |         return (
 111 |             self.cos_cached[:seq_len].to(dtype=x.dtype),
 112 |             self.sin_cached[:seq_len].to(dtype=x.dtype),
 113 |         )
 114 | 
 115 | 
 116 | # Copied from transformers.models.llama.modeling_llama.LlamaLinearScalingRotaryEmbedding with Llama->Phi
 117 | class PhiLinearScalingRotaryEmbedding(PhiRotaryEmbedding):
 118 |     """PhiRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""
 119 | 
 120 |     def __init__(
 121 |         self,
 122 |         dim,
 123 |         max_position_embeddings=2048,
 124 |         base=10000,
 125 |         device=None,
 126 |         scaling_factor=1.0,
 127 |     ):
 128 |         self.scaling_factor = scaling_factor
 129 |         super().__init__(dim, max_position_embeddings, base, device)
 130 | 
 131 |     def _set_cos_sin_cache(self, seq_len, device, dtype):
 132 |         self.max_seq_len_cached = seq_len
 133 |         t = torch.arange(
 134 |             self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype
 135 |         )
 136 |         t = t / self.scaling_factor
 137 | 
 138 |         freqs = torch.outer(t, self.inv_freq)
 139 |         # Different from paper, but it uses a different permutation in order to obtain the same calculation
 140 |         emb = torch.cat((freqs, freqs), dim=-1)
 141 |         self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
 142 |         self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
 143 | 
 144 | 
 145 | # Copied from transformers.models.llama.modeling_llama.LlamaDynamicNTKScalingRotaryEmbedding with Llama->Phi
 146 | class PhiDynamicNTKScalingRotaryEmbedding(PhiRotaryEmbedding):
 147 |     """PhiRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""
 148 | 
 149 |     def __init__(
 150 |         self,
 151 |         dim,
 152 |         max_position_embeddings=2048,
 153 |         base=10000,
 154 |         device=None,
 155 |         scaling_factor=1.0,
 156 |     ):
 157 |         self.scaling_factor = scaling_factor
 158 |         super().__init__(dim, max_position_embeddings, base, device)
 159 | 
 160 |     def _set_cos_sin_cache(self, seq_len, device, dtype):
 161 |         self.max_seq_len_cached = seq_len
 162 | 
 163 |         if seq_len > self.max_position_embeddings:
 164 |             base = self.base * (
 165 |                 (self.scaling_factor * seq_len / self.max_position_embeddings)
 166 |                 - (self.scaling_factor - 1)
 167 |             ) ** (self.dim / (self.dim - 2))
 168 |             inv_freq = 1.0 / (
 169 |                 base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)
 170 |             )
 171 |             self.register_buffer("inv_freq", inv_freq, persistent=False)
 172 | 
 173 |         t = torch.arange(
 174 |             self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype
 175 |         )
 176 | 
 177 |         freqs = torch.outer(t, self.inv_freq)
 178 |         # Different from paper, but it uses a different permutation in order to obtain the same calculation
 179 |         emb = torch.cat((freqs, freqs), dim=-1)
 180 |         self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
 181 |         self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
 182 | 
 183 | 
 184 | # Copied from transformers.models.llama.modeling_llama.rotate_half
 185 | def rotate_half(x):
 186 |     """Rotates half the hidden dims of the input."""
 187 |     x1 = x[..., : x.shape[-1] // 2]
 188 |     x2 = x[..., x.shape[-1] // 2 :]
 189 |     return torch.cat((-x2, x1), dim=-1)
 190 | 
 191 | 
 192 | # Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb
 193 | def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
 194 |     """Applies Rotary Position Embedding to the query and key tensors.
 195 | 
 196 |     Args:
 197 |         q (`torch.Tensor`): The query tensor.
 198 |         k (`torch.Tensor`): The key tensor.
 199 |         cos (`torch.Tensor`): The cosine part of the rotary embedding.
 200 |         sin (`torch.Tensor`): The sine part of the rotary embedding.
 201 |         position_ids (`torch.Tensor`):
 202 |             The position indices of the tokens corresponding to the query and key tensors. For example, this can be
 203 |             used to pass offsetted position ids when working with a KV-cache.
 204 |         unsqueeze_dim (`int`, *optional*, defaults to 1):
 205 |             The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
 206 |             sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
 207 |             that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
 208 |             k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
 209 |             cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
 210 |             the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
 211 |     Returns:
 212 |         `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
 213 |     """
 214 |     cos = cos[position_ids].unsqueeze(unsqueeze_dim)
 215 |     sin = sin[position_ids].unsqueeze(unsqueeze_dim)
 216 |     q_embed = (q * cos) + (rotate_half(q) * sin)
 217 |     k_embed = (k * cos) + (rotate_half(k) * sin)
 218 |     return q_embed, k_embed
 219 | 
 220 | 
 221 | # Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->Phi
 222 | class PhiMLP(nn.Module):
 223 |     def __init__(self, config):
 224 |         super().__init__()
 225 |         self.config = config
 226 |         self.activation_fn = ACT2FN[config.hidden_act]
 227 |         self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
 228 |         self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)
 229 | 
 230 |     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
 231 |         hidden_states = self.fc1(hidden_states)
 232 |         hidden_states = self.activation_fn(hidden_states)
 233 |         hidden_states = self.fc2(hidden_states)
 234 |         return hidden_states
 235 | 
 236 | 
 237 | # Copied from transformers.models.llama.modeling_llama.repeat_kv with llama->phi
 238 | def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
 239 |     """
 240 |     This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
 241 |     num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
 242 |     """
 243 |     batch, num_key_value_heads, slen, head_dim = hidden_states.shape
 244 |     if n_rep == 1:
 245 |         return hidden_states
 246 |     hidden_states = hidden_states[:, :, None, :, :].expand(
 247 |         batch, num_key_value_heads, n_rep, slen, head_dim
 248 |     )
 249 |     return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
 250 | 
 251 | 
 252 | class PhiAttention(nn.Module):
 253 |     """Multi-headed attention from 'Attention Is All You Need' paper"""
 254 | 
 255 |     def __init__(self, config: PhiConfig, layer_idx: Optional[int] = None):
 256 |         super().__init__()
 257 |         self.config = config
 258 |         self.layer_idx = layer_idx
 259 |         if layer_idx is None:
 260 |             logger.warning_once(
 261 |                 f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will "
 262 |                 "to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` "
 263 |                 "when creating this class."
 264 |             )
 265 | 
 266 |         self.attention_dropout = config.attention_dropout
 267 |         self.hidden_size = config.hidden_size
 268 |         self.num_heads = config.num_attention_heads
 269 |         self.head_dim = self.hidden_size // self.num_heads
 270 |         self.num_key_value_heads = config.num_key_value_heads
 271 |         self.num_key_value_groups = self.num_heads // self.num_key_value_heads
 272 |         self.max_position_embeddings = config.max_position_embeddings
 273 |         self.rope_theta = config.rope_theta
 274 |         self.partial_rotary_factor = config.partial_rotary_factor
 275 |         self.is_causal = True
 276 | 
 277 |         if (self.head_dim * self.num_heads) != self.hidden_size:
 278 |             raise ValueError(
 279 |                 f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
 280 |                 f" and `num_heads`: {self.num_heads})."
 281 |             )
 282 | 
 283 |         self.Wqkv = nn.Linear(
 284 |             self.hidden_size, 3 * self.num_heads * self.head_dim, bias=True
 285 |         )
 286 |         self.out_proj = nn.Linear(
 287 |             self.num_heads * self.head_dim, self.hidden_size, bias=True
 288 |         )
 289 | 
 290 |         self.qk_layernorm = config.qk_layernorm
 291 |         if self.qk_layernorm:
 292 |             self.q_layernorm = nn.LayerNorm(
 293 |                 config.hidden_size // self.num_heads,
 294 |                 eps=config.layer_norm_eps,
 295 |                 elementwise_affine=True,
 296 |             )
 297 |             self.k_layernorm = nn.LayerNorm(
 298 |                 config.hidden_size // self.num_heads,
 299 |                 eps=config.layer_norm_eps,
 300 |                 elementwise_affine=True,
 301 |             )
 302 | 
 303 |         self._init_rope()
 304 | 
 305 |     def _init_rope(self):
 306 |         if self.config.rope_scaling is None:
 307 |             self.rotary_emb = PhiRotaryEmbedding(
 308 |                 int(self.partial_rotary_factor * self.head_dim),
 309 |                 max_position_embeddings=self.max_position_embeddings,
 310 |                 base=self.rope_theta,
 311 |             )
 312 |         else:
 313 |             scaling_type = self.config.rope_scaling["type"]
 314 |             scaling_factor = self.config.rope_scaling["factor"]
 315 |             if scaling_type == "linear":
 316 |                 self.rotary_emb = PhiLinearScalingRotaryEmbedding(
 317 |                     int(self.partial_rotary_factor * self.head_dim),
 318 |                     max_position_embeddings=self.max_position_embeddings,
 319 |                     scaling_factor=scaling_factor,
 320 |                     base=self.rope_theta,
 321 |                 )
 322 |             elif scaling_type == "dynamic":
 323 |                 self.rotary_emb = PhiDynamicNTKScalingRotaryEmbedding(
 324 |                     int(self.partial_rotary_factor * self.head_dim),
 325 |                     max_position_embeddings=self.max_position_embeddings,
 326 |                     scaling_factor=scaling_factor,
 327 |                     base=self.rope_theta,
 328 |                 )
 329 |             else:
 330 |                 raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
 331 | 
 332 |     def forward(
 333 |         self,
 334 |         hidden_states: torch.Tensor,
 335 |         attention_mask: Optional[torch.Tensor] = None,
 336 |         position_ids: Optional[torch.LongTensor] = None,
 337 |         past_key_value: Optional[Cache] = None,
 338 |         output_attentions: bool = False,
 339 |         use_cache: bool = False,
 340 |     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
 341 |         bsz, q_len, _ = hidden_states.size()
 342 | 
 343 |         query_states, key_states, value_states = self.Wqkv(hidden_states).chunk(
 344 |             3, dim=-1
 345 |         )
 346 | 
 347 |         if self.qk_layernorm:
 348 |             query_states = self.q_layernorm(query_states)
 349 |             key_states = self.k_layernorm(key_states)
 350 | 
 351 |         query_states = query_states.view(
 352 |             bsz, q_len, self.num_heads, self.head_dim
 353 |         ).transpose(1, 2)
 354 |         key_states = key_states.view(
 355 |             bsz, q_len, self.num_key_value_heads, self.head_dim
 356 |         ).transpose(1, 2)
 357 |         value_states = value_states.view(
 358 |             bsz, q_len, self.num_key_value_heads, self.head_dim
 359 |         ).transpose(1, 2)
 360 | 
 361 |         kv_seq_len = key_states.shape[-2]
 362 |         if past_key_value is not None:
 363 |             if self.layer_idx is None:
 364 |                 raise ValueError(
 365 |                     f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
 366 |                     "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
 367 |                     "with a layer index."
 368 |                 )
 369 |             kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
 370 |         cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
 371 | 
 372 |         # Partial rotary embedding
 373 |         query_rot, query_pass = (
 374 |             query_states[..., : self.rotary_emb.dim],
 375 |             query_states[..., self.rotary_emb.dim :],
 376 |         )
 377 |         key_rot, key_pass = (
 378 |             key_states[..., : self.rotary_emb.dim],
 379 |             key_states[..., self.rotary_emb.dim :],
 380 |         )
 381 |         # [batch_size, seq_length, num_heads, head_dim // config.partial_rotary_factor]
 382 |         query_rot, key_rot = apply_rotary_pos_emb(
 383 |             query_rot, key_rot, cos, sin, position_ids
 384 |         )
 385 | 
 386 |         # [batch_size, seq_length, num_heads, head_dim]
 387 |         query_states = torch.cat((query_rot, query_pass), dim=-1)
 388 |         key_states = torch.cat((key_rot, key_pass), dim=-1)
 389 | 
 390 |         if past_key_value is not None:
 391 |             cache_kwargs = {
 392 |                 "sin": sin,
 393 |                 "cos": cos,
 394 |                 "partial_rotation_size": self.rotary_emb.dim,
 395 |             }
 396 |             key_states, value_states = past_key_value.update(
 397 |                 key_states, value_states, self.layer_idx, cache_kwargs
 398 |             )
 399 | 
 400 |         key_states = repeat_kv(key_states, self.num_key_value_groups)
 401 |         value_states = repeat_kv(value_states, self.num_key_value_groups)
 402 | 
 403 |         # Queries and keys upcast to fp32 is required by Phi-2 to avoid overflow
 404 |         attn_weights = torch.matmul(
 405 |             query_states.to(torch.float32), key_states.to(torch.float32).transpose(2, 3)
 406 |         ) / math.sqrt(self.head_dim)
 407 | 
 408 |         if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
 409 |             raise ValueError(
 410 |                 f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
 411 |                 f" {attn_weights.size()}"
 412 |             )
 413 | 
 414 |         if attention_mask is not None:
 415 |             if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
 416 |                 raise ValueError(
 417 |                     f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
 418 |                 )
 419 |             attn_weights = attn_weights + attention_mask
 420 | 
 421 |         # upcast attention to fp32
 422 |         attn_weights = nn.functional.softmax(
 423 |             attn_weights, dim=-1, dtype=torch.float32
 424 |         ).to(value_states.dtype)
 425 |         attn_weights = nn.functional.dropout(
 426 |             attn_weights, p=self.attention_dropout, training=self.training
 427 |         )
 428 | 
 429 |         attn_output = torch.matmul(attn_weights, value_states)
 430 | 
 431 |         if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
 432 |             raise ValueError(
 433 |                 f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
 434 |                 f" {attn_output.size()}"
 435 |             )
 436 | 
 437 |         attn_output = attn_output.transpose(1, 2).contiguous()
 438 |         attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
 439 | 
 440 |         attn_output = self.out_proj(attn_output)
 441 | 
 442 |         if not output_attentions:
 443 |             attn_weights = None
 444 | 
 445 |         return attn_output, attn_weights, past_key_value
 446 | 
 447 | 
 448 | class PhiFlashAttention2(PhiAttention):
 449 |     """
 450 |     Phi flash attention module. This module inherits from `PhiAttention` as the weights of the module stays
 451 |     untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
 452 |     flash attention and deal with padding tokens in case the input contains any of them.
 453 |     """
 454 | 
 455 |     # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__
 456 |     def __init__(self, *args, **kwargs):
 457 |         super().__init__(*args, **kwargs)
 458 | 
 459 |         # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
 460 |         # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
 461 |         # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
 462 |         self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
 463 | 
 464 |     def forward(
 465 |         self,
 466 |         hidden_states: torch.Tensor,
 467 |         attention_mask: Optional[torch.LongTensor] = None,
 468 |         position_ids: Optional[torch.LongTensor] = None,
 469 |         past_key_value: Optional[Cache] = None,
 470 |         output_attentions: bool = False,
 471 |         use_cache: bool = False,
 472 |         **kwargs,
 473 |     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
 474 |         # PhiFlashAttention2 attention does not support output_attentions
 475 | 
 476 |         output_attentions = False
 477 | 
 478 |         bsz, q_len, _ = hidden_states.size()
 479 | 
 480 |         query_states, key_states, value_states = self.Wqkv(hidden_states).chunk(
 481 |             3, dim=-1
 482 |         )
 483 | 
 484 |         if self.qk_layernorm:
 485 |             query_states = self.q_layernorm(query_states)
 486 |             key_states = self.k_layernorm(key_states)
 487 | 
 488 |         # Flash attention requires the input to have the shape
 489 |         # batch_size x seq_length x head_dim x hidden_dim
 490 |         # therefore we just need to keep the original shape
 491 |         query_states = query_states.view(
 492 |             bsz, q_len, self.num_heads, self.head_dim
 493 |         ).transpose(1, 2)
 494 |         key_states = key_states.view(
 495 |             bsz, q_len, self.num_key_value_heads, self.head_dim
 496 |         ).transpose(1, 2)
 497 |         value_states = value_states.view(
 498 |             bsz, q_len, self.num_key_value_heads, self.head_dim
 499 |         ).transpose(1, 2)
 500 | 
 501 |         kv_seq_len = key_states.shape[-2]
 502 |         if past_key_value is not None:
 503 |             kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
 504 |         cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
 505 | 
 506 |         # Partial rotary embedding
 507 |         query_rot, query_pass = (
 508 |             query_states[..., : self.rotary_emb.dim],
 509 |             query_states[..., self.rotary_emb.dim :],
 510 |         )
 511 |         key_rot, key_pass = (
 512 |             key_states[..., : self.rotary_emb.dim],
 513 |             key_states[..., self.rotary_emb.dim :],
 514 |         )
 515 |         # [batch_size, seq_length, num_heads, head_dim // config.partial_rotary_factor]
 516 |         query_rot, key_rot = apply_rotary_pos_emb(
 517 |             query_rot, key_rot, cos, sin, position_ids
 518 |         )
 519 | 
 520 |         # [batch_size, seq_length, num_heads, head_dim]
 521 |         query_states = torch.cat((query_rot, query_pass), dim=-1)
 522 |         key_states = torch.cat((key_rot, key_pass), dim=-1)
 523 | 
 524 |         if past_key_value is not None:
 525 |             cache_kwargs = {
 526 |                 "sin": sin,
 527 |                 "cos": cos,
 528 |                 "partial_rotation_size": self.rotary_emb.dim,
 529 |             }
 530 |             key_states, value_states = past_key_value.update(
 531 |                 key_states, value_states, self.layer_idx, cache_kwargs
 532 |             )
 533 | 
 534 |         # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
 535 |         # to be able to avoid many of these transpose/reshape/view.
 536 |         query_states = query_states.transpose(1, 2)
 537 |         key_states = key_states.transpose(1, 2)
 538 |         value_states = value_states.transpose(1, 2)
 539 | 
 540 |         attn_dropout = self.attention_dropout if self.training else 0.0
 541 | 
 542 |         # In PEFT, usually we cast the layer norms in float32 for training stability reasons
 543 |         # therefore the input hidden states gets silently casted in float32. Hence, we need
 544 |         # cast them back in the correct dtype just to be sure everything works as expected.
 545 |         # This might slowdown training & inference so it is recommended to not cast the LayerNorms
 546 |         # in fp32.
 547 | 
 548 |         if query_states.dtype == torch.float32:
 549 |             if torch.is_autocast_enabled():
 550 |                 target_dtype = torch.get_autocast_gpu_dtype()
 551 |             # Handle the case where the model is quantized
 552 |             elif hasattr(self.config, "_pre_quantization_dtype"):
 553 |                 target_dtype = self.config._pre_quantization_dtype
 554 |             else:
 555 |                 target_dtype = self.q_proj.weight.dtype
 556 | 
 557 |             logger.warning_once(
 558 |                 f"The input hidden states seems to be silently casted in float32, this might be related to"
 559 |                 f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
 560 |                 f" {target_dtype}."
 561 |             )
 562 | 
 563 |             query_states = query_states.to(target_dtype)
 564 |             key_states = key_states.to(target_dtype)
 565 |             value_states = value_states.to(target_dtype)
 566 | 
 567 |         attn_output = self._flash_attention_forward(
 568 |             query_states,
 569 |             key_states,
 570 |             value_states,
 571 |             attention_mask,
 572 |             q_len,
 573 |             dropout=attn_dropout,
 574 |             softmax_scale=None,
 575 |         )
 576 | 
 577 |         attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous()
 578 |         attn_output = self.out_proj(attn_output)
 579 | 
 580 |         if not output_attentions:
 581 |             attn_weights = None
 582 | 
 583 |         return attn_output, attn_weights, past_key_value
 584 | 
 585 |     # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2._flash_attention_forward
 586 |     def _flash_attention_forward(
 587 |         self,
 588 |         query_states,
 589 |         key_states,
 590 |         value_states,
 591 |         attention_mask,
 592 |         query_length,
 593 |         dropout=0.0,
 594 |         softmax_scale=None,
 595 |     ):
 596 |         """
 597 |         Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
 598 |         first unpad the input, then computes the attention scores and pad the final attention scores.
 599 | 
 600 |         Args:
 601 |             query_states (`torch.Tensor`):
 602 |                 Input query states to be passed to Flash Attention API
 603 |             key_states (`torch.Tensor`):
 604 |                 Input key states to be passed to Flash Attention API
 605 |             value_states (`torch.Tensor`):
 606 |                 Input value states to be passed to Flash Attention API
 607 |             attention_mask (`torch.Tensor`):
 608 |                 The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
 609 |                 position of padding tokens and 1 for the position of non-padding tokens.
 610 |             dropout (`int`, *optional*):
 611 |                 Attention dropout
 612 |             softmax_scale (`float`, *optional*):
 613 |                 The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
 614 |         """
 615 |         if not self._flash_attn_uses_top_left_mask:
 616 |             causal = self.is_causal
 617 |         else:
 618 |             # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__.
 619 |             causal = self.is_causal and query_length != 1
 620 | 
 621 |         # Contains at least one padding token in the sequence
 622 |         if attention_mask is not None:
 623 |             batch_size = query_states.shape[0]
 624 |             (
 625 |                 query_states,
 626 |                 key_states,
 627 |                 value_states,
 628 |                 indices_q,
 629 |                 cu_seq_lens,
 630 |                 max_seq_lens,
 631 |             ) = self._upad_input(
 632 |                 query_states, key_states, value_states, attention_mask, query_length
 633 |             )
 634 | 
 635 |             cu_seqlens_q, cu_seqlens_k = cu_seq_lens
 636 |             max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
 637 | 
 638 |             attn_output_unpad = flash_attn_varlen_func(
 639 |                 query_states,
 640 |                 key_states,
 641 |                 value_states,
 642 |                 cu_seqlens_q=cu_seqlens_q,
 643 |                 cu_seqlens_k=cu_seqlens_k,
 644 |                 max_seqlen_q=max_seqlen_in_batch_q,
 645 |                 max_seqlen_k=max_seqlen_in_batch_k,
 646 |                 dropout_p=dropout,
 647 |                 softmax_scale=softmax_scale,
 648 |                 causal=causal,
 649 |             )
 650 | 
 651 |             attn_output = pad_input(
 652 |                 attn_output_unpad, indices_q, batch_size, query_length
 653 |             )
 654 |         else:
 655 |             attn_output = flash_attn_func(
 656 |                 query_states,
 657 |                 key_states,
 658 |                 value_states,
 659 |                 dropout,
 660 |                 softmax_scale=softmax_scale,
 661 |                 causal=causal,
 662 |             )
 663 | 
 664 |         return attn_output
 665 | 
 666 |     # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2._upad_input
 667 |     def _upad_input(
 668 |         self, query_layer, key_layer, value_layer, attention_mask, query_length
 669 |     ):
 670 |         indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
 671 |         batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape
 672 | 
 673 |         key_layer = index_first_axis(
 674 |             key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim),
 675 |             indices_k,
 676 |         )
 677 |         value_layer = index_first_axis(
 678 |             value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim),
 679 |             indices_k,
 680 |         )
 681 |         if query_length == kv_seq_len:
 682 |             query_layer = index_first_axis(
 683 |                 query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim),
 684 |                 indices_k,
 685 |             )
 686 |             cu_seqlens_q = cu_seqlens_k
 687 |             max_seqlen_in_batch_q = max_seqlen_in_batch_k
 688 |             indices_q = indices_k
 689 |         elif query_length == 1:
 690 |             max_seqlen_in_batch_q = 1
 691 |             cu_seqlens_q = torch.arange(
 692 |                 batch_size + 1, dtype=torch.int32, device=query_layer.device
 693 |             )  # There is a memcpy here, that is very bad.
 694 |             indices_q = cu_seqlens_q[:-1]
 695 |             query_layer = query_layer.squeeze(1)
 696 |         else:
 697 |             # The -q_len: slice assumes left padding.
 698 |             attention_mask = attention_mask[:, -query_length:]
 699 |             query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(
 700 |                 query_layer, attention_mask
 701 |             )
 702 | 
 703 |         return (
 704 |             query_layer,
 705 |             key_layer,
 706 |             value_layer,
 707 |             indices_q,
 708 |             (cu_seqlens_q, cu_seqlens_k),
 709 |             (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
 710 |         )
 711 | 
 712 | 
# Registry of attention implementations. `PhiDecoderLayer` indexes it with
# `config._attn_implementation` to pick the class it instantiates as its mixer.
PHI_ATTENTION_CLASSES = {
    "eager": PhiAttention,
    "flash_attention_2": PhiFlashAttention2,
}
 717 | 
 718 | 
 719 | class PhiDecoderLayer(nn.Module):
 720 |     def __init__(self, config: PhiConfig, layer_idx: int):
 721 |         super().__init__()
 722 |         self.mixer = PHI_ATTENTION_CLASSES[config._attn_implementation](
 723 |             config, layer_idx=layer_idx
 724 |         )
 725 |         self.mlp = PhiMLP(config)
 726 |         self.ln = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
 727 |         self.resid_dropout = nn.Dropout(config.resid_pdrop)
 728 | 
 729 |     def forward(
 730 |         self,
 731 |         hidden_states: torch.Tensor,
 732 |         attention_mask: Optional[torch.Tensor] = None,
 733 |         position_ids: Optional[torch.LongTensor] = None,
 734 |         output_attentions: Optional[bool] = False,
 735 |         use_cache: Optional[bool] = False,
 736 |         past_key_value: Optional[Tuple[torch.Tensor]] = None,
 737 |     ) -> Tuple[
 738 |         torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]
 739 |     ]:
 740 |         """
 741 |         Args:
 742 |             hidden_states (`torch.FloatTensor`):
 743 |                 input to the layer of shape `(batch, seq_len, embed_dim)`
 744 |             attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
 745 |                 `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
 746 |             position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
 747 |                 Indices of positions of each input sequence tokens in the position embeddings. Selected in the range
 748 |                 `[0, config.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
 749 |             output_attentions (`bool`, *optional*):
 750 |                 Whether or not to return the attentions tensors of all attention layers. See `attentions` under
 751 |                 returned tensors for more detail.
 752 |             use_cache (`bool`, *optional*):
 753 |                 If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
 754 |                 (see `past_key_values`).
 755 |             past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
 756 |         """
 757 | 
 758 |         residual = hidden_states
 759 | 
 760 |         hidden_states = self.ln(hidden_states)
 761 | 
 762 |         # Self Attention
 763 |         attn_outputs, self_attn_weights, present_key_value = self.mixer(
 764 |             hidden_states=hidden_states,
 765 |             attention_mask=attention_mask,
 766 |             position_ids=position_ids,
 767 |             past_key_value=past_key_value,
 768 |             output_attentions=output_attentions,
 769 |             use_cache=use_cache,
 770 |         )
 771 |         attn_outputs = self.resid_dropout(attn_outputs)
 772 | 
 773 |         feed_forward_hidden_states = self.resid_dropout(self.mlp(hidden_states))
 774 |         hidden_states = attn_outputs + feed_forward_hidden_states + residual
 775 |         outputs = (hidden_states,)
 776 | 
 777 |         if output_attentions:
 778 |             outputs += (self_attn_weights,)
 779 | 
 780 |         if use_cache:
 781 |             outputs += (present_key_value,)
 782 | 
 783 |         return outputs
 784 | 
 785 | 
 786 | class PhiPreTrainedModel(PreTrainedModel):
 787 |     config_class = PhiConfig
 788 |     base_model_prefix = "model"
 789 |     supports_gradient_checkpointing = True
 790 |     _no_split_modules = ["PhiDecoderLayer"]
 791 |     _skip_keys_device_placement = "past_key_values"
 792 |     _supports_flash_attn_2 = True
 793 |     _supports_cache_class = True
 794 | 
 795 |     def _init_weights(self, module):
 796 |         std = self.config.initializer_range
 797 |         if isinstance(module, nn.Linear):
 798 |             module.weight.data.normal_(mean=0.0, std=std)
 799 |             if module.bias is not None:
 800 |                 module.bias.data.zero_()
 801 |         elif isinstance(module, nn.Embedding):
 802 |             module.weight.data.normal_(mean=0.0, std=std)
 803 |             if module.padding_idx is not None:
 804 |                 module.weight.data[module.padding_idx].zero_()
 805 | 
 806 | 
 807 | class Embedding(nn.Module):
 808 |     def __init__(self, config: PhiConfig):
 809 |         super().__init__()
 810 |         self.wte = nn.Embedding(
 811 |             config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id
 812 |         )
 813 | 
 814 |     def forward(self, input_ids: torch.LongTensor) -> torch.FloatTensor:
 815 |         return self.wte(input_ids)
 816 | 
 817 | 
 818 | class PhiModel(PhiPreTrainedModel):
 819 |     """
 820 |     Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`PhiDecoderLayer`]
 821 | 
 822 |     Args:
 823 |         config: PhiConfig
 824 |     """
 825 | 
 826 |     def __init__(self, config: PhiConfig):
 827 |         super().__init__(config)
 828 |         self.padding_idx = config.pad_token_id
 829 |         self.vocab_size = config.vocab_size
 830 | 
 831 |         self.embd = Embedding(config)
 832 |         self.embed_dropout = nn.Dropout(config.embd_pdrop)
 833 |         self.h = nn.ModuleList(
 834 |             [
 835 |                 PhiDecoderLayer(config, layer_idx)
 836 |                 for layer_idx in range(config.num_hidden_layers)
 837 |             ]
 838 |         )
 839 |         self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
 840 | 
 841 |         self.gradient_checkpointing = False
 842 |         # Initialize weights and apply final processing
 843 |         self.post_init()
 844 | 
 845 |     def get_input_embeddings(self):
 846 |         return self.embd.wte
 847 | 
 848 |     def set_input_embeddings(self, value):
 849 |         self.embd.wte = value
 850 | 
 851 |     def forward(
 852 |         self,
 853 |         input_ids: torch.LongTensor = None,
 854 |         attention_mask: Optional[torch.Tensor] = None,
 855 |         position_ids: Optional[torch.LongTensor] = None,
 856 |         past_key_values: Optional[List[torch.FloatTensor]] = None,
 857 |         inputs_embeds: Optional[torch.FloatTensor] = None,
 858 |         use_cache: Optional[bool] = None,
 859 |         output_attentions: Optional[bool] = None,
 860 |         output_hidden_states: Optional[bool] = None,
 861 |         return_dict: Optional[bool] = None,
 862 |     ) -> Union[Tuple, BaseModelOutputWithPast]:
 863 |         output_attentions = (
 864 |             output_attentions
 865 |             if output_attentions is not None
 866 |             else self.config.output_attentions
 867 |         )
 868 |         output_hidden_states = (
 869 |             output_hidden_states
 870 |             if output_hidden_states is not None
 871 |             else self.config.output_hidden_states
 872 |         )
 873 |         use_cache = use_cache if use_cache is not None else self.config.use_cache
 874 | 
 875 |         return_dict = (
 876 |             return_dict if return_dict is not None else self.config.use_return_dict
 877 |         )
 878 | 
 879 |         # retrieve input_ids and inputs_embeds
 880 |         if input_ids is not None and inputs_embeds is not None:
 881 |             raise ValueError(
 882 |                 "You cannot specify both input_ids and inputs_embeds at the same time"
 883 |             )
 884 |         elif input_ids is not None:
 885 |             batch_size, seq_length = input_ids.shape[:2]
 886 |         elif inputs_embeds is not None:
 887 |             batch_size, seq_length = inputs_embeds.shape[:2]
 888 |         else:
 889 |             raise ValueError("You have to specify either input_ids or inputs_embeds")
 890 | 
 891 |         past_key_values_length = 0
 892 | 
 893 |         if self.gradient_checkpointing and self.training:
 894 |             if use_cache:
 895 |                 logger.warning_once(
 896 |                     "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
 897 |                 )
 898 |                 use_cache = False
 899 | 
 900 |         if use_cache:
 901 |             use_legacy_cache = not isinstance(past_key_values, Cache)
 902 |             if use_legacy_cache:
 903 |                 past_key_values = DynamicCache.from_legacy_cache(past_key_values)
 904 |             past_key_values_length = past_key_values.get_usable_length(seq_length)
 905 | 
 906 |         if position_ids is None:
 907 |             device = input_ids.device if input_ids is not None else inputs_embeds.device
 908 |             position_ids = torch.arange(
 909 |                 past_key_values_length,
 910 |                 seq_length + past_key_values_length,
 911 |                 dtype=torch.long,
 912 |                 device=device,
 913 |             )
 914 |             position_ids = position_ids.unsqueeze(0)
 915 | 
 916 |         if inputs_embeds is None:
 917 |             inputs_embeds = self.embd(input_ids)
 918 | 
 919 |         inputs_embeds = self.embed_dropout(inputs_embeds)
 920 | 
 921 |         # Attention mask.
 922 |         if self._use_flash_attention_2:
 923 |             # 2d mask is passed through the layers
 924 |             attention_mask = (
 925 |                 attention_mask
 926 |                 if (attention_mask is not None and 0 in attention_mask)
 927 |                 else None
 928 |             )
 929 |         else:
 930 |             # 4d mask is passed through the layers
 931 |             attention_mask = _prepare_4d_causal_attention_mask(
 932 |                 attention_mask,
 933 |                 (batch_size, seq_length),
 934 |                 inputs_embeds,
 935 |                 past_key_values_length,
 936 |             )
 937 | 
 938 |         hidden_states = inputs_embeds
 939 | 
 940 |         # decoder layers
 941 |         all_hidden_states = () if output_hidden_states else None
 942 |         all_self_attns = () if output_attentions else None
 943 |         next_decoder_cache = None
 944 | 
 945 |         for decoder_layer in self.h:
 946 |             if output_hidden_states:
 947 |                 all_hidden_states += (hidden_states,)
 948 | 
 949 |             if self.gradient_checkpointing and self.training:
 950 |                 layer_outputs = self._gradient_checkpointing_func(
 951 |                     decoder_layer.__call__,
 952 |                     hidden_states,
 953 |                     attention_mask,
 954 |                     position_ids,
 955 |                     past_key_values,
 956 |                     output_attentions,
 957 |                 )
 958 |             else:
 959 |                 layer_outputs = decoder_layer(
 960 |                     hidden_states,
 961 |                     attention_mask=attention_mask,
 962 |                     position_ids=position_ids,
 963 |                     past_key_value=past_key_values,
 964 |                     output_attentions=output_attentions,
 965 |                     use_cache=use_cache,
 966 |                 )
 967 | 
 968 |             hidden_states = layer_outputs[0]
 969 | 
 970 |             if use_cache:
 971 |                 next_decoder_cache = layer_outputs[2 if output_attentions else 1]
 972 | 
 973 |             if output_attentions:
 974 |                 all_self_attns += (layer_outputs[1],)
 975 | 
 976 |         # add hidden states from the last decoder layer
 977 |         if output_hidden_states:
 978 |             all_hidden_states += (hidden_states,)
 979 | 
 980 |         next_cache = None
 981 |         if use_cache:
 982 |             next_cache = (
 983 |                 next_decoder_cache.to_legacy_cache()
 984 |                 if use_legacy_cache
 985 |                 else next_decoder_cache
 986 |             )
 987 |         if not return_dict:
 988 |             return tuple(
 989 |                 v
 990 |                 for v in [hidden_states, next_cache, all_hidden_states, all_self_attns]
 991 |                 if v is not None
 992 |             )
 993 |         return BaseModelOutputWithPast(
 994 |             last_hidden_state=hidden_states,
 995 |             past_key_values=next_cache,
 996 |             hidden_states=all_hidden_states,
 997 |             attentions=all_self_attns,
 998 |         )
 999 | 
1000 | 
class CausalLMHead(nn.Module):
    """Language-modeling head: a LayerNorm followed by a linear projection.

    The projection maps `config.hidden_size` features to `config.vocab_size`
    outputs, i.e. one score per vocabulary entry.
    """

    def __init__(self, config):
        super().__init__()
        hidden_dim = config.hidden_size
        # Attribute names are part of the checkpoint layout (`ln.*`, `linear.*`).
        self.ln = nn.LayerNorm(hidden_dim, eps=config.layer_norm_eps)
        self.linear = nn.Linear(hidden_dim, config.vocab_size)

    def forward(self, hidden_states):
        """Normalize `hidden_states`, then project them with `self.linear`."""
        normalized = self.ln(hidden_states)
        logits = self.linear(normalized)
        return logits
1011 | 
1012 | 
class PhiForCausalLM(PhiPreTrainedModel):
    """Phi decoder (`self.transformer`) followed by a `CausalLMHead` (`self.lm_head`).

    `forward` runs the decoder, projects its last hidden state to vocabulary
    logits and, when `labels` are given, computes a shifted cross-entropy loss.
    """

    _tied_weights_keys = ["lm_head.linear.weight"]

    # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.__init__ with Llama->Phi,bias=False->bias=True
    def __init__(self, config):
        super().__init__(config)
        self.transformer = PhiModel(config)
        self.vocab_size = config.vocab_size
        self.lm_head = CausalLMHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        """Return the `wte` submodule of the decoder's `embd` layer."""
        return self.transformer.embd.wte

    # Adapted from transformers.models.llama.modeling_llama.LlamaForCausalLM.set_input_embeddings
    def set_input_embeddings(self, value):
        """Replace the `wte` submodule of the decoder's `embd` layer with `value`."""
        # The decoder is stored as `self.transformer` (see `__init__`); this class
        # never defines `self.model`, so writing through it raised AttributeError.
        self.transformer.embd.wte = value

    # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.get_output_embeddings
    def get_output_embeddings(self):
        """Return the linear projection of the LM head."""
        return self.lm_head.linear

    # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.set_output_embeddings
    def set_output_embeddings(self, new_embeddings):
        """Replace the linear projection of the LM head with `new_embeddings`."""
        self.lm_head.linear = new_embeddings

    # Adapted from transformers.models.llama.modeling_llama.LlamaForCausalLM.set_decoder
    def set_decoder(self, decoder):
        """Replace the decoder used by `forward` with `decoder`."""
        # `forward` calls `self.transformer`, so that is the attribute to swap;
        # assigning to `self.model` would leave the active decoder untouched.
        self.transformer = decoder

    # Adapted from transformers.models.llama.modeling_llama.LlamaForCausalLM.get_decoder
    def get_decoder(self):
        """Return the decoder used by `forward`."""
        return self.transformer

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        r"""
        Run the decoder and the LM head, optionally computing the next-token loss.

        All arguments other than `labels` are forwarded unchanged to `self.transformer`.
        `output_attentions`, `output_hidden_states` and `return_dict` fall back to the
        corresponding `self.config` values when left as `None`.

        Args:
            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Labels for computing the causal language modeling loss. Indices should either be in `[0, ...,
                config.vocab_size - 1]` or -100 (the default `ignore_index` of `CrossEntropyLoss`). Tokens with
                indices set to `-100` are ignored (masked), the loss is only computed for the remaining tokens.
                Labels are shifted inside this method, so position `i` of the logits is scored against
                label `i + 1`.

        Returns:
            A `CausalLMOutputWithPast` when `return_dict` is truthy. Otherwise a tuple
            `(logits, *decoder_outputs[1:])`, prefixed with `loss` when `labels` were provided.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, PhiForCausalLM

        >>> model = PhiForCausalLM.from_pretrained("microsoft/phi-1")
        >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-1")

        >>> prompt = "This is an example script ."
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        'This is an example script .\n\n\n\nfrom typing import List\n\ndef find_most_common_letter(words: List[str'
        ```"""

        output_attentions = (
            output_attentions
            if output_attentions is not None
            else self.config.output_attentions
        )
        output_hidden_states = (
            output_hidden_states
            if output_hidden_states is not None
            else self.config.output_hidden_states
        )
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
        outputs = self.transformer(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Index 0 is the last hidden state for both the tuple and the dict-style output.
        hidden_states = outputs[0]
        logits = self.lm_head(hidden_states)
        # Upcast so the loss below is computed in float32.
        logits = logits.float()

        loss = None
        if labels is not None:
            # Shift so that tokens < n predict n
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            # Flatten the tokens
            loss_fct = CrossEntropyLoss()
            shift_logits = shift_logits.view(-1, self.config.vocab_size)
            shift_labels = shift_labels.view(-1)
            # Enable model parallelism
            shift_labels = shift_labels.to(shift_logits.device)
            loss = loss_fct(shift_logits, shift_labels)

        if not return_dict:
            output = (logits,) + outputs[1:]
            return (loss,) + output if loss is not None else output

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.prepare_inputs_for_generation
    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        attention_mask=None,
        inputs_embeds=None,
        **kwargs,
    ):
        """Build the keyword arguments for one `forward` call during generation.

        When `past_key_values` is given, `input_ids` is trimmed to the tokens that
        are not yet covered by the cache, and `attention_mask` is cropped if the
        cache reports a maximum length that would otherwise be exceeded. If no
        `position_ids` are supplied in `kwargs`, they are derived from
        `attention_mask`. `inputs_embeds` is only used when there is no cache yet.

        Returns:
            A dict with either `"inputs_embeds"` or `"input_ids"`, plus
            `"position_ids"`, `"past_key_values"`, `"use_cache"` and `"attention_mask"`.
        """
        if past_key_values is not None:
            if isinstance(past_key_values, Cache):
                cache_length = past_key_values.get_seq_length()
                past_length = past_key_values.seen_tokens
                max_cache_length = past_key_values.get_max_length()
            else:
                # Legacy tuple cache: length is read from dim 2 of the first cached tensor.
                cache_length = past_length = past_key_values[0][0].shape[2]
                max_cache_length = None

            # Keep only the unprocessed tokens:
            # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
            # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as
            # input)
            if (
                attention_mask is not None
                and attention_mask.shape[1] > input_ids.shape[1]
            ):
                input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
            # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
            # input_ids based on the past_length.
            elif past_length < input_ids.shape[1]:
                input_ids = input_ids[:, past_length:]
            # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.

            # If we are about to go beyond the maximum cache length, we need to crop the input attention mask.
            if (
                max_cache_length is not None
                and attention_mask is not None
                and cache_length + input_ids.shape[1] > max_cache_length
            ):
                attention_mask = attention_mask[:, -max_cache_length:]

        position_ids = kwargs.get("position_ids", None)
        if attention_mask is not None and position_ids is None:
            # create position_ids on the fly for batch generation
            position_ids = attention_mask.long().cumsum(-1) - 1
            # Masked-out positions get a placeholder position of 1.
            position_ids.masked_fill_(attention_mask == 0, 1)
            if past_key_values:
                position_ids = position_ids[:, -input_ids.shape[1] :]

        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
        if inputs_embeds is not None and past_key_values is None:
            model_inputs = {"inputs_embeds": inputs_embeds}
        else:
            model_inputs = {"input_ids": input_ids}

        model_inputs.update(
            {
                "position_ids": position_ids,
                "past_key_values": past_key_values,
                "use_cache": kwargs.get("use_cache"),
                "attention_mask": attention_mask,
            }
        )
        return model_inputs

    # Adapted from transformers.models.llama.modeling_llama.LlamaForCausalLM._reorder_cache
    @staticmethod
    def _reorder_cache(past_key_values, beam_idx):
        """Reorder every cached tensor along dim 0 according to `beam_idx`.

        Returns a tuple with one inner tuple per entry of `past_key_values`, each
        holding that entry's tensors after `index_select(0, beam_idx)`.
        """
        # Built in one pass instead of growing a tuple with repeated `+=`.
        return tuple(
            tuple(
                past_state.index_select(0, beam_idx.to(past_state.device))
                for past_state in layer_past
            )
            for layer_past in past_key_values
        )
1222 | 


--------------------------------------------------------------------------------