
     `ia                     "   d dl mZmZmZ d dlZd dlmZ ddlmZ ddlm	Z	m
Z
 ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZmZ ddlmZmZ ddlmZ ddlmZm Z m!Z!m"Z" ddl#m$Z$ ddl%m&Z& ddl'm(Z(  e"j)        e*          Z+d Z,d6dZ-dej.        de/dej.        fdZ0	 d7dej1        dej.        dej.        dej.        deej.                 d e2d!e2d"ee         fd#Z3 G d$ d%ej1                  Z4 ed&           G d' d(ej1                              Z5 G d) d*ej1                  Z6 G d+ d,e          Z7e  G d- d.e                      Z8 G d/ d0ej1                  Z9e  G d1 d2e8                      Z:e  G d3 d4e8e                      Z;g d5Z<dS )8    )CallableOptionalUnionN)nn   )ACT2FN)CacheDynamicCache)GenerationMixin)use_kernel_forward_from_hub)create_causal_mask)GradientCheckpointingLayer)BaseModelOutputWithPastCausalLMOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogging)deprecate_kwarg)check_model_inputs   )GraniteConfigc                     | dd| j         d         dz  f         }| d| j         d         dz  df         }t          j        | |fd          S )z*Rotates half the hidden dims of the input..N   dim)shapetorchcat)xx1x2s      /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/granite/modeling_granite.pyrotate_halfr*   .   s]    	
3"!'"+"""	#B	
3q """	#B9rc2YB''''    c                     |                     |          }|                     |          }| |z  t          |           |z  z   }||z  t          |          |z  z   }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )	unsqueezer*   )qkcossinposition_idsunsqueeze_dimq_embedk_embeds           r)   apply_rotary_pos_embr6   5   sc    ( --
&
&C
--
&
&C3w;q>>C/0G3w;q>>C/0GGr+   hidden_statesn_repreturnc                     | j         \  }}}}|dk    r| S | dddddddddf                             |||||          } |                     |||z  ||          S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r   N)r#   expandreshape)r7   r8   batchnum_key_value_headsslenhead_dims         r)   	repeat_kvrA   P   s    
 2?1D.Ehzz!!!!QQQaaa"23::5BUW\^bdlmmM  (;e(CT8TTTr+           modulequerykeyvalueattention_maskscalingdropoutkwargsc                 R   t          || j                  }t          || j                  }	t          j        ||                    dd                    |z  }
|$|d d d d d d d |j        d         f         }|
|z   }
t          j                            |
dt          j	                  
                    |j                  }
t          j                            |
|| j                  }
t          j        |
|	          }|                    dd                                          }||
fS )Nr    r   r   )r"   dtype)ptrainingr   )rA   num_key_value_groupsr$   matmul	transposer#   r   
functionalsoftmaxfloat32torM   rI   rO   
contiguous)rC   rD   rE   rF   rG   rH   rI   rJ   
key_statesvalue_statesattn_weightscausal_maskattn_outputs                r)   eager_attention_forwardr]   \   s    3 ;<<JUF$?@@L<z';';Aq'A'ABBWLL!$QQQ111.D
0@0D.D%DE#k1=((2U](SSVVW\WbccL=((6?([[L,|\::K''1--88::K$$r+   c                   "    e Zd ZdZddedee         f fdZ eddd	          	 	 dd
e	j
        dee	j
        e	j
        f         dee	j
                 dee         dee	j                 dee         dee	j
        e	j
        f         fd            Z xZS )GraniteAttentionz=Multi-headed attention from 'Attention Is All You Need' paperNconfig	layer_idxc                    t                                                       || _        || _        t	          |d|j        |j        z            | _        |j        |j        z  | _	        |j
        | _        |j        | _        d| _        t          j        |j        |j        | j        z  |j                  | _        t          j        |j        |j        | j        z  |j                  | _        t          j        |j        |j        | j        z  |j                  | _        t          j        |j        | j        z  |j        |j                  | _        d S )Nr@   Tbias)super__init__r`   ra   getattrhidden_sizenum_attention_headsr@   r>   rP   attention_multiplierrH   attention_dropout	is_causalr   Linearattention_biasq_projk_projv_projo_projselfr`   ra   	__class__s      r)   rf   zGraniteAttention.__init__y   s>   "
F4F&Jd4dee$*$>&B\$\!2!'!9i :T] JQWQf
 
 
 i :T] JQWQf
 
 
 i :T] JQWQf
 
 
 i&68JQWQf
 
 
r+   past_key_valuepast_key_values4.58new_nameversionr7   position_embeddingsrG   cache_positionrJ   r9   c                 D   |j         d d         }g |d| j        R }|                     |                              |                              dd          }	|                     |                              |                              dd          }
|                     |                              |                              dd          }|\  }}t          |	|
||          \  }	}
|&|||d}|                    |
|| j	        |          \  }
}t          }| j        j        dk    rt          | j        j                 } || |	|
||f| j        sdn| j        | j        d|\  }} |j        g |dR                                  }|                     |          }||fS )Nr   r   r    )r1   r0   r}   eagerrB   )rI   rH   )r#   r@   ro   viewrR   rp   rq   r6   updatera   r]   r`   _attn_implementationr   rO   rk   rH   r<   rW   rr   )rt   r7   r|   rG   rw   r}   rJ   input_shapehidden_shapequery_statesrX   rY   r0   r1   cache_kwargsattention_interfacer\   rZ   s                     r)   forwardzGraniteAttention.forward   s    $)#2#.88b8$-88{{=1166|DDNNqRSTT[[//44\BBLLQPQRR
{{=1166|DDNNqRSTT&S#7jRUWZ#[#[ j&#&snUUL'6'='=j,X\Xfht'u'u$J(?;+w66"9$+:Z"[$7$7	%
  $}HCC$2HL	%
 	%
 	%
 	%
!\ *k);;;;;;FFHHkk+..L((r+   N)NN)__name__
__module____qualname____doc__r   r   intrf   r   r$   Tensortupler	   
LongTensorr   r   r   __classcell__ru   s   @r)   r_   r_   v   s       GG
 
} 
# 
 
 
 
 
 
. _%0A6RRR ,059)) ))|)) #5<#=>)) !.	))
 "%)) !!12)) +,)) 
u|U\)	*)) )) )) SR)) )) )) )) ))r+   r_   RMSNormc                   ,     e Zd Zd fd	Zd Zd Z xZS )GraniteRMSNormư>c                     t                                                       t          j        t	          j        |                    | _        || _        dS )z=
        GraniteRMSNorm is equivalent to T5LayerNorm
        N)re   rf   r   	Parameterr$   onesweightvariance_epsilon)rt   rh   epsru   s      r)   rf   zGraniteRMSNorm.__init__   sD     	l5:k#:#:;; #r+   c                    |j         }|                    t          j                  }|                    d                              dd          }|t          j        || j        z             z  }| j        |                    |          z  S )Nr    r   T)keepdim)	rM   rV   r$   rU   powmeanrsqrtr   r   )rt   r7   input_dtypevariances       r)   r   zGraniteRMSNorm.forward   s|    #)%((77 $$Q'',,R,>>%Ht?T4T(U(UU{]--k::::r+   c                 H    t          | j        j                   d| j         S )Nz, eps=)r   r   r#   r   )rt   s    r)   
extra_reprzGraniteRMSNorm.extra_repr   s&    )**II$2GIIIr+   )r   )r   r   r   rf   r   r   r   r   s   @r)   r   r      sb        $ $ $ $ $ $; ; ;J J J J J J Jr+   r   c                   $     e Zd Z fdZd Z xZS )
GraniteMLPc                    t                                                       || _        |j        | _        |j        | _        t          j        | j        | j        |j                  | _        t          j        | j        | j        |j                  | _	        t          j        | j        | j        |j                  | _
        t          |j                 | _        d S )Nrc   )re   rf   r`   rh   intermediate_sizer   rm   mlp_bias	gate_projup_proj	down_projr   
hidden_actact_fnrt   r`   ru   s     r)   rf   zGraniteMLP.__init__   s    !-!'!94#3T5KRXRabbby!143IPVP_```4#94;KRXRabbbV./r+   c                     |                      |                     |                     |                    |                     |          z            }|S r   )r   r   r   r   )rt   r&   r   s      r)   r   zGraniteMLP.forward   sA    NN4;;t~~a/@/@#A#ADLLQROO#STT	r+   )r   r   r   rf   r   r   r   s   @r)   r   r      sG        0 0 0 0 0      r+   r   c                   v    e Zd Zdedef fdZ eddd          	 	 	 	 	 	 	 dd
ej        de	ej                 de	ej
                 de	e         de	e         de	e         de	ej
                 de	eej        ej        f                  deej        e	eej        ej        f                  f         fd            Z xZS )GraniteDecoderLayerr`   ra   c                 L   t                                                       |j        | _        t          ||          | _        t          |          | _        t          |j        |j                  | _	        t          |j        |j                  | _
        |j        | _        d S )N)r`   ra   r   )re   rf   rh   r_   	self_attnr   mlpr   rms_norm_epsinput_layernormpost_attention_layernormresidual_multiplierrs   s      r)   rf   zGraniteDecoderLayer.__init__   s    !-)9MMMf%%-f.@fFYZZZ(6v7IvOb(c(c(c%#)#=   r+   rv   rw   rx   ry   NFr7   rG   r2   output_attentions	use_cacher}   r|   r9   c	                    |}
|                      |          } | j        d||||||||d|	\  }}|
|| j        z  z   }|}
|                     |          }|                     |          }|
|| j        z  z   }|f}|r||fz  }|S )a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*):
                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
                query_sequence_length, key_sequence_length)` if default attention is used.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            past_key_values (`Cache`, *optional*): cached past key and value projection states
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence
            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
            kwargs (`dict`, *optional*):
                Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code
                into the model
        )r7   rG   r2   rw   r   r   r}   r|    )r   r   r   r   r   )rt   r7   rG   r2   rw   r   r   r}   r|   rJ   residualself_attn_weightsoutputss                r)   r   zGraniteDecoderLayer.forward   s    F !,,];; ,:4> 
,
')%+/) 3
,
 
,
 
,
 
,
(( !=43K#KK !55mDD// =43K#KK " 	,)++Gr+   )NNNFFNN)r   r   r   r   r   rf   r   r$   r   r   r   r	   boolr   FloatTensorr   r   r   s   @r)   r   r      sN       >} > > > > > > > _%0A6RRR 2637+/,1$)59KO? ?|? !.? u/0	?
 "%? $D>? D>? !!12? &eEL%,,F&GH? 
u (51BEDU1U+V"WW	X? ? ? SR? ? ? ? ?r+   r   c                   L    e Zd ZU eed<   dZdZdgZdgZdZ	dZ
dZdZdZeedZdS )GranitePreTrainedModelr`   modelTr   rw   )r7   
attentionsN)r   r   r   r   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraph_supports_attention_backendr   r_   _can_record_outputsr   r+   r)   r   r   0  sl         &*#./#4"5N!"&,& r+   r   c                   |     e Zd ZU ej        ed<   ddef fdZ ej                    e	d                         Z
 xZS )GraniteRotaryEmbeddinginv_freqNr`   c                    t                                                       t          |d          rSt          |j        t
                    r9|j                            d|j                            d                    | _        nd| _        |j        | _	        |j        | _
        || _        t          | j                 | _        |                     | j        |          \  }| _        |                     d|d           | j        | _        d S )Nrope_scaling	rope_typetypedefaultr   F)
persistent)re   rf   hasattr
isinstancer   dictgetr   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenr`   r   rope_init_fnattention_scalingregister_bufferr   original_inv_freq)rt   r`   devicer   ru   s       r)   rf   zGraniteRotaryEmbedding.__init__F  s    6>** 	'z&:Mt/T/T 	'#044[&BUBYBYZ`BaBabbDNN&DN"("@$*$B!/?+/+<+<T[&+Q+Q($(ZeDDD!%r+   c                 X   | j         d d d d f                                                             |j        d         dd                              |j                  }|d d d d d f                                         }t          |j        j        t                    r|j        j        dk    r|j        j        nd}t          j
        |d          5  |                                |                                z                      dd          }t          j        ||fd	          }|                                | j        z  }|                                | j        z  }	d d d            n# 1 swxY w Y   |                    |j        
          |	                    |j        
          fS )Nr   r   r   mpscpuF)device_typeenabledr    r!   )rM   )r   floatr;   r#   rV   r   r   r   strr$   autocastrR   r%   r0   r   r1   rM   )
rt   r&   r2   inv_freq_expandedposition_ids_expandedr   freqsembr0   r1   s
             r)   r   zGraniteRotaryEmbedding.forwardW  s    !M$4-8>>@@GGHZ[\H]_acdeehhijiqrr ,QQQaaaZ 8 > > @ @'1!(-'E'Ek!(-[`J`J`ahmmfk^UCCC 	5 	5&,,..1F1L1L1N1NNYYZ[]^__E)UEN333C''))d44C''))d44C		5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 vvAGv$$cff17f&;&;;;s   BE++E/2E/r   )r   r   r   r$   r   r   r   rf   no_gradr   r   r   r   s   @r)   r   r   C  s         l/ /} / / / / / /" U]__< <  _< < < < <r+   r   c                   6    e Zd Zdef fdZee	 	 	 	 	 	 	 	 	 ddeej	                 deej
                 deej	                 dee         deej                 d	ee         d
ee         dee         deej	                 dee         defd                        Z xZS )GraniteModelr`   c                    t                                                     j        | _        j        | _        t          j        j        j        | j                  | _        t          j	        fdt          j                  D                       | _        t          j        j                  | _        t!                    | _        d| _        j        | _        |                                  d S )Nc                 0    g | ]}t          |          S r   )r   ).0ra   r`   s     r)   
<listcomp>z)GraniteModel.__init__.<locals>.<listcomp>p  s$    eee	 33eeer+   r   r`   F)re   rf   pad_token_idpadding_idx
vocab_sizer   	Embeddingrh   embed_tokens
ModuleListrangenum_hidden_layerslayersr   r   normr   
rotary_embgradient_checkpointingembedding_multiplier	post_initr   s    `r)   rf   zGraniteModel.__init__i  s       !. +L):F<NPTP`aameeeeU6KcEdEdeee
 
 #6#56;NOOO	0???&+#$*$?! 	r+   N	input_idsrG   r2   rw   inputs_embedsr   r   output_hidden_statesr}   rJ   r9   c
                    ||n| j         j        }||n| j         j        }||n| j         j        }|d u |d uz  rt	          d          | j        r%| j        r|rt                              d           d}|| 	                    |          }|| j
        z  }|r|t          | j                   }|	B||                                nd}t          j        |||j        d         z   |j                  }	||	                    d          }t%          | j         |||	||          }|}|                     ||          }|rd	nd }|rd	nd }| j        d | j         j                 D ]1}|r||fz  } ||f||||||	|d
|
}|d         }|r||d         fz  }2|                     |          }|r||fz  }t/          ||r|nd ||          S )Nz:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.Fr   r   r   )r   )r`   input_embedsrG   r}   rw   r2   r   )rG   r2   rw   r   r   r}   r|   )last_hidden_staterw   r7   r   )r`   r   r  r   
ValueErrorr	  rO   loggerwarning_oncer  r
  r
   get_seq_lengthr$   aranger#   r   r-   r   r  r  r  r  r   )rt   r  rG   r2   rw   r  r   r   r  r}   rJ   past_seen_tokensr[   r7   r|   all_hidden_statesall_self_attnsdecoder_layerlayer_outputss                      r)   r   zGraniteModel.forwardz  s    2C1N--TXT_Tq$8$D  $+Jj 	 "+!6IIDK<Q	-t";< 	[YZZZ& 	4= 	Y 	j   I  --i88M%(AA 	?0*$+>>>O!CRC^==???de"\ "2]5H5K"KTaTh  N )33A66L(;&))+%
 
 
 & #oom\JJ #7@BBD0:d![)H4;+H)HI 	6 	6M# 6!m%55!)M
*) /"3#-$7
 
 
 
M *!,M  6=#3"55		-00   	2-!11&+/8BOOd+%	
 
 
 	
r+   )	NNNNNNNNN)r   r   r   r   rf   r   r   r   r$   r   r   r	   r   r   r   r   r   r   r   r   s   @r)   r   r   g  sN       }      "  151537+/59$(,0/359_
 _
E,-_
 !._
 u/0	_

 "%_
   12_
 D>_
 $D>_
 'tn_
 !!12_
 +,_
 
!_
 _
 _
 ^ _
 _
 _
 _
 _
r+   r   c                       e Zd ZdgZddiZddgdgfiZ fdZee	 	 	 	 	 	 	 	 	 	 	 dd	e	e
j                 d
e	e
j                 de	e
j                 de	eeee
j                 f                  de	e
j                 de	e
j                 de	e         de	e         de	e         de	e
j                 deee
j        f         dee         defd                        Z xZS )GraniteForCausalLMzlm_head.weightlm_headcolwise_repr7   logitsc                     t                                          |           t          |          | _        |j        | _        t          j        |j        |j        d          | _        | 	                                 d S )NFrc   )
re   rf   r   r   r   r   rm   rh   r  r  r   s     r)   rf   zGraniteForCausalLM.__init__  sj       !&))
 +y!3V5FUSSS 	r+   Nr   r  rG   r2   rw   r  labelsr   r   r  r}   logits_to_keeprJ   r9   c                    ||n| j         j        }|	|	n| j         j        }	 | j        d||||||||	|
d	|}|j        }t          |t                    rt          | d          n|}|                     |dd|ddf                   }|| j         j	        z  }d}| | j
        d||| j         j        d|}t          |||j        |j        |j                  S )a  
        Example:

        ```python
        >>> from transformers import AutoTokenizer, GraniteForCausalLM

        >>> model = GraniteForCausalLM.from_pretrained("meta-granite/Granite-2-7b-hf")
        >>> tokenizer = AutoTokenizer.from_pretrained("meta-granite/Granite-2-7b-hf")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```N)	r  rG   r2   rw   r  r   r   r  r}   )r   r"  r   )lossr   rw   r7   r   r   )r`   r   r  r   r  r   r   slicer  logits_scalingloss_functionr   r   rw   r7   r   )rt   r  rG   r2   rw   r  r"  r   r   r  r}   r#  rJ   r   r7   slice_indicesr   r%  s                     r)   r   zGraniteForCausalLM.forward  s@   D 2C1N--TXT_Tq$8$D  $+Jj 	
 ,64: ,
)%+'/!5),
 ,
 ,
 ,
  18B>SV8W8Wk~ot444]kmAAA}aaa,?@AA$+44%4%pVFt{OeppioppD%#3!/)
 
 
 	
r+   )NNNNNNNNNNr   )r   r   r   _tied_weights_keys_tp_plan_pp_planrf   r   r   r   r$   r   r   r   r	   listr   r   r   r   r   r   r   r   r   s   @r)   r  r    s       *+=)H_-z:;H      151537KO59-1$(,0/35934C
 C
E,-C
 !.C
 u/0	C

 "%tE4E/F(F"GHC
   12C
 )*C
 D>C
 $D>C
 'tnC
 !!12C
 c5</0C
 +,C
 
 C
 C
 C
 ^ C
 C
 C
 C
 C
r+   r  )r  r   r   )Nr   )rB   )=typingr   r   r   r$   r   activationsr   cache_utilsr	   r
   
generationr   integrationsr   masking_utilsr   modeling_layersr   modeling_outputsr   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.deprecationr   utils.genericr   configuration_graniter   
get_loggerr   r  r*   r6   r   r   rA   Moduler   r]   r_   r   r   r   r   r   r   r  __all__r   r+   r)   <module>r@     s(  , - , , , , , , , , ,        ! ! ! ! ! ! . . . . . . . . ) ) ) ) ) ) 7 7 7 7 7 7 / / / / / / 9 9 9 9 9 9 O O O O O O O O K K K K K K K K F F F F F F F F & & & & & & R R R R R R R R R R R R 0 0 0 0 0 0 / / / / / / 0 0 0 0 0 0 
	H	%	%( ( (   6	UU\ 	U# 	U%, 	U 	U 	U 	U& % %I%<% 
% <	%
 U\*% % % '(% % % %4D) D) D) D) D)ry D) D) D)N Y''J J J J JRY J J ('J(        K K K K K4 K K K\     _   $!< !< !< !< !<RY !< !< !<H s
 s
 s
 s
 s
) s
 s
 s
l S
 S
 S
 S
 S
/ S
 S
 S
l K
J
Jr+   