
     `il.                     Z   d dl mZmZ d dlZd dlmZ ddlmZmZ ddlm	Z	 ddl
mZmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZmZmZmZmZ ddlmZ  ej        e          Z G d de          Z G d de          Z  G d de          Z! G d de          Z" G d de          Z#g dZ$dS )    )OptionalUnionN)nn   )CacheDynamicCache)create_causal_mask)BaseModelOutputWithPastCausalLMOutputWithPast)Unpack)TransformersKwargslogging)deprecate_kwarg   )LlamaAttentionLlamaDecoderLayerLlamaForCausalLM
LlamaModelLlamaPreTrainedModel   )GraniteConfigc                   :     e Zd ZdZddedee         f fdZ xZS )GraniteAttentionz=Multi-headed attention from 'Attention Is All You Need' paperNconfig	layer_idxc                 d    t                                          ||           |j        | _        d S N)super__init__attention_multiplierscalingselfr   r   	__class__s      /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/granite/modular_granite.pyr   zGraniteAttention.__init__+   s+    +++2    r   )	__name__
__module____qualname____doc__r   r   intr   __classcell__r$   s   @r%   r   r   (   sZ        GG3 3} 3# 3 3 3 3 3 3 3 3 3 3r&   r   c                   v    e Zd Zdedef fdZ eddd          	 	 	 	 	 	 	 dd
ej        de	ej                 de	ej
                 de	e         de	e         de	e         de	ej
                 de	eej        ej        f                  deej        e	eej        ej        f                  f         fd            Z xZS )GraniteDecoderLayerr   r   c                     t                                          ||           |j        | _        t          ||          | _        d S )N)r   r   )r   r   residual_multiplierr   	self_attnr"   s      r%   r   zGraniteDecoderLayer.__init__1   s@    +++#)#= )9MMMr&   past_key_valuepast_key_valuesz4.58)new_nameversionNFhidden_statesattention_maskposition_idsoutput_attentions	use_cachecache_positionposition_embeddingsreturnc	                    |}
|                      |          } | j        d||||||||d|	\  }}|
|| j        z  z   }|}
|                     |          }|                     |          }|
|| j        z  z   }|f}|r||fz  }|S )a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*):
                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
                query_sequence_length, key_sequence_length)` if default attention is used.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            past_key_values (`Cache`, *optional*): cached past key and value projection states
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence
            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
            kwargs (`dict`, *optional*):
                Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code
                into the model
        )r7   r8   r9   r4   r:   r;   r<   r=    )input_layernormr2   r1   post_attention_layernormmlp)r#   r7   r8   r9   r4   r:   r;   r<   r=   kwargsresidualself_attn_weightsoutputss                r%   forwardzGraniteDecoderLayer.forward6   s    F !,,];; ,:4> 
,
')%+/) 3
,
 
,
 
,
 
,
(( !=43K#KK !55mDD// =43K#KK " 	,)++Gr&   )NNNFFNN)r'   r(   r)   r   r+   r   r   torchTensorr   
LongTensorr   booltupleFloatTensorrH   r,   r-   s   @r%   r/   r/   0   sV       N} N N N N N N N
 _%0A6RRR 2637+/,1$)59KO? ?|? !.? u/0	?
 "%? $D>? D>? !!12? &eEL%,,F&GH? 
u (51BEDU1U+V"WW	X? ? ? SR? ? ? ? ?r&   r/   c                       e Zd ZdS )GranitePreTrainedModelN)r'   r(   r)   r@   r&   r%   rP   rP   y   s        Dr&   rP   c                       e Zd Zdef fdZ	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 dee	         deej
                 d	ee         d
ee         dee         deej                 dee         defdZ xZS )GraniteModelr   c                     t                                                     j        | _        t          j        fdt          j                  D                       | _        d S )Nc                 0    g | ]}t          |          S r@   )r/   ).0r   r   s     r%   
<listcomp>z)GraniteModel.__init__.<locals>.<listcomp>   s$    eee	 33eeer&   )r   r   embedding_multiplierr   
ModuleListrangenum_hidden_layerslayers)r#   r   r$   s    `r%   r   zGraniteModel.__init__~   s`       $*$?!meeeeU6KcEdEdeee
 
r&   N	input_idsr8   r9   r4   inputs_embedsr;   r:   output_hidden_statesr<   rD   r>   c
                    ||n| j         j        }||n| j         j        }||n| j         j        }|d u |d uz  rt	          d          | j        r%| j        r|rt                              d           d}|| 	                    |          }|| j
        z  }|r|t          | j                   }|	B||                                nd}t          j        |||j        d         z   |j                  }	||	                    d          }t%          | j         |||	||          }|}|                     ||          }|rd	nd }|rd	nd }| j        d | j         j                 D ]1}|r||fz  } ||f||||||	|d
|
}|d         }|r||d         fz  }2|                     |          }|r||fz  }t/          ||r|nd ||          S )Nz:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.F)r   r   r   )device)r   input_embedsr8   r<   r4   r9   r@   )r8   r9   r4   r:   r;   r<   r=   )last_hidden_stater4   r7   
attentions)r   r:   r^   r;   
ValueErrorgradient_checkpointingtrainingloggerwarning_onceembed_tokensrW   r   get_seq_lengthrI   arangeshaper`   	unsqueezer	   
rotary_embr[   rZ   normr
   )r#   r\   r8   r9   r4   r]   r;   r:   r^   r<   rD   past_seen_tokenscausal_maskr7   r=   all_hidden_statesall_self_attnsdecoder_layerlayer_outputss                      r%   rH   zGraniteModel.forward   s    2C1N--TXT_Tq$8$D  $+Jj 	 "+!6IIDK<Q	-t";< 	[YZZZ& 	4= 	Y 	j   I  --i88M%(AA 	?0*$+>>>O!CRC^==???de"\ "2]5H5K"KTaTh  N )33A66L(;&))+%
 
 
 & #oom\JJ #7@BBD0:d![)H4;+H)HI 	6 	6M# 6!m%55!)M
*) /"3#-$7
 
 
 
M *!,M  6=#3"55		-00   	2-!11&+/8BOOd+%	
 
 
 	
r&   )	NNNNNNNNN)r'   r(   r)   r   r   r   rI   rK   rJ   r   rN   rL   r   r   r
   rH   r,   r-   s   @r%   rR   rR   }   s7       
} 
 
 
 
 
 
 151537+/59$(,0/359_
 _
E,-_
 !._
 u/0	_

 "%_
   12_
 D>_
 $D>_
 'tn_
 !!12_
 +,_
 
!_
 _
 _
 _
 _
 _
 _
 _
r&   rR   c                   b   e Zd Z	 	 	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 deeee	ej
                 f                  deej
                 deej                 d	ee         d
ee         dee         deej                 deeej        f         dee         defdZdS )GraniteForCausalLMNr   r\   r8   r9   r4   r]   labelsr;   r:   r^   r<   logits_to_keeprD   r>   c                    ||n| j         j        }|	|	n| j         j        }	 | j        d||||||||	|
d	|}|j        }t          |t                    rt          | d           n|}|                     |d d |d d f                   }|| j         j	        z  }d }| | j
        d||| j         j        d|}t          |||j        |j        |j                  S )N)	r\   r8   r9   r4   r]   r;   r:   r^   r<   )logitsrx   
vocab_size)lossr{   r4   r7   rc   r@   )r   r:   r^   modelrb   
isinstancer+   slicelm_headlogits_scalingloss_functionr|   r   r4   r7   rc   )r#   r\   r8   r9   r4   r]   rx   r;   r:   r^   r<   ry   rD   rG   r7   slice_indicesr{   r}   s                     r%   rH   zGraniteForCausalLM.forward   s?    2C1N--TXT_Tq$8$D  $+Jj 	
 ,64: ,
)%+'/!5),
 ,
 ,
 ,
  18B>SV8W8Wk~ot444]kmAAA}aaa,?@AA$+44%4%pVFt{OeppioppD%#3!/)
 
 
 	
r&   )NNNNNNNNNNr   )r'   r(   r)   r   rI   rK   rJ   r   r   listrN   rL   r+   r   r   r   rH   r@   r&   r%   rw   rw      s4        151537KO59-1$(,0/359342
 2
E,-2
 !.2
 u/0	2

 "%tE4E/F(F"GH2
   122
 )*2
 D>2
 $D>2
 'tn2
 !!122
 c5</02
 +,2
 
 2
 2
 2
 2
 2
 2
r&   rw   )rw   rR   rP   )%typingr   r   rI   r   cache_utilsr   r   masking_utilsr	   modeling_outputsr
   r   processing_utilsr   utilsr   r   utils.deprecationr   llama.modeling_llamar   r   r   r   r   configuration_graniter   
get_loggerr'   rg   r   r/   rP   rR   rw   __all__r@   r&   r%   <module>r      s     # " " " " " " "        . . . . . . . . / / / / / / O O O O O O O O & & & & & & 0 0 0 0 0 0 0 0 0 0 0 0 0 0              1 0 0 0 0 0 
	H	%	%3 3 3 3 3~ 3 3 3F F F F F+ F F FR	 	 	 	 	1 	 	 	g
 g
 g
 g
 g
: g
 g
 g
T3
 3
 3
 3
 3
) 3
 3
 3
l K
J
Jr&   