
     `i                     h   d dl mZmZ d dlZd dlmZ ddlmZ ddlmZ ddl	m
Z
 ddlmZ dd	lmZ d
dlmZmZmZmZ ddlmZ  ej        e          Z G d ded          Z G d dej                  Z G d de          Z G d de          Z G d de          Z G d de          Zg dZ dS )    )Optional	TypedDictN)nn   )ACT2FN)Cache)Unpack)logging)deprecate_kwarg   )GraniteMoeDecoderLayerGraniteMoeForCausalLMGraniteMoeModelGraniteMoePreTrainedModel   )GraniteMoeSharedConfigc                   d    e Zd ZU dZej        ed<   ej        ed<   eed<   eed<   ej        ed<   dS )GraniteFlashAttentionKwargsa  
    Keyword arguments for advanced Flash Attention, causal-conv1d, and mamba_ssm kernel usage.
    Use cases include padding-free training and fewer `torch.compile` graph breaks.

    Attributes:
        cu_seq_lens_q (`torch.LongTensor`)
            Gets cumulative sequence length for query state.
        cu_seq_lens_k (`torch.LongTensor`)
            Gets cumulative sequence length for key state.
        max_length_q (`int`):
            Maximum sequence length for query state.
        max_length_k (`int`):
            Maximum sequence length for key state.
        seq_idx (`torch.IntTensor):
            Index of each packed sequence.
    cu_seq_lens_qcu_seq_lens_kmax_length_qmax_length_kseq_idxN)	__name__
__module____qualname____doc__torch
LongTensor__annotations__int	IntTensor     /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/granitemoeshared/modular_granitemoeshared.pyr   r   &   sb          " ########_r$   r   F)totalc                   L     e Zd ZdZdef fdZdej        dej        fdZ xZ	S )GraniteMoeSharedMLPz~
    MLP layer for shared experts

    Args:
        config:
            Configuration object with model hyperparameters.
    configc                 D   t                                                       |j        | _        |j        | _        t
          |j                 | _        t          j	        | j        | j        dz  d          | _
        t          j	        | j        | j        d          | _        d S )Nr   F)bias)super__init__hidden_size
input_sizeshared_intermediate_sizer   
hidden_act
activationr   Linearinput_linearoutput_linearselfr)   	__class__s     r%   r-   zGraniteMoeSharedMLP.__init__H   s     ,!: !23Idot7G!7KRWXXXYt'7uUUUr$   hidden_statesreturnc                     |                      |          }|                    dd          }|                     |d                   |d         z  }|                     |          }|S )Nr   )dimr   r   )r4   chunkr2   r5   )r7   r9   chunked_hidden_statess      r%   forwardzGraniteMoeSharedMLP.forwardQ   sj    ))-88 - 3 3A2 3 > >(=a(@AADYZ[D\\**=99r$   )
r   r   r   r   r   r-   r   Tensorr@   __classcell__r8   s   @r%   r(   r(   ?   s|         V5 V V V V V VU\ el        r$   r(   c                       e Zd Zdedef fdZ eddd          	 	 	 	 	 	 	 	 dd
ej        de	ej                 de	ej
                 de	e         de	e         de	e         de	ej
                 de	e         de	eej        ej        f                  dee         deej        e	eej        ej        f                  f         fd            Z xZS )GraniteMoeSharedDecoderLayerr)   	layer_idxc                     t                                          ||           |j        dk    rd nt          |          | _        d S )Nr   )r,   r-   r0   r(   
shared_mlp)r7   r)   rF   r8   s      r%   r-   z%GraniteMoeSharedDecoderLayer.__init__Z   sC    +++"("AQ"F"F$$L_`fLgLgr$   past_key_valuepast_key_valuesz4.58)new_nameversionNFr9   attention_maskposition_idsoutput_attentions	use_cachecache_positionoutput_router_logitsposition_embeddingskwargsr:   c
                 d   |}|                      |          } | j        d||||||||	d|
\  }}||| j        z  z   }|}|                     |          }|                     |          \  }}| j        |}n||                     |          z   }~||| j        z  z   }|f}|r||fz  }|r||fz  }|S )a1  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*):
                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
                query_sequence_length, key_sequence_length)` if default attention is used.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            past_key_values (`Cache`, *optional*): cached past key and value projection states
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence
            output_router_logits (`bool`, *optional*):
                Whether or not to return the logits of all the routers. They are useful for computing the router loss, and
                should not be returned during inference.
            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
            kwargs (`dict`, *optional*):
                Arbitrary kwargs. Can be used to provide `GraniteFlashAttentionKwargs` for
                padding-free training and/or improve torch.compile performance.
        )r9   rM   rN   rJ   rO   rP   rQ   rS   Nr#   )input_layernorm	self_attnresidual_multiplierpost_attention_layernormblock_sparse_moerH   )r7   r9   rM   rN   rJ   rO   rP   rQ   rR   rS   rT   residualself_attn_weightsmoe_hidden_statesrouter_logitsoutputss                   r%   r@   z$GraniteMoeSharedDecoderLayer.forward^   s   N !,,];; ,:4> 
,
')%+/) 3
,
 
,
 
,
 
,
(( !=43K#KK !55mDD+/+@+@+O+O(=?"-MM-0N0NNM =43K#KK " 	,)++G 	(''Gr$   )NNNFFNFN)r   r   r   r   r!   r-   r   r   rA   r   r   r   booltupler	   r   FloatTensorr@   rB   rC   s   @r%   rE   rE   Y   s       h5 h# h h h h h h _%0A6RRR 2637+/,1$)59/4KOO O|O !.O u/0	O
 "%O $D>O D>O !!12O 'tnO &eEL%,,F&GHO 45O 
u (51BEDU1U+V"WW	XO O O SRO O O O Or$   rE   c                        e Zd ZU eed<   dgZdS )GraniteMoeSharedPreTrainedModelr)   rE   N)r   r   r   r   r    _no_split_modulesr#   r$   r%   rd   rd      s'         """"78r$   rd   c                   $     e Zd Zdef fdZ xZS )GraniteMoeSharedModelr)   c                     t                                                     t          j        fdt	          j                  D                       | _        d S )Nc                 0    g | ]}t          |          S r#   )rE   ).0rF   r)   s     r%   
<listcomp>z2GraniteMoeSharedModel.__init__.<locals>.<listcomp>   s$    nnn)&)<<nnnr$   )r,   r-   r   
ModuleListrangenum_hidden_layerslayersr6   s    `r%   r-   zGraniteMoeSharedModel.__init__   sV       mnnnneTZTlNmNmnnn
 
r$   )r   r   r   r   r-   rB   rC   s   @r%   rg   rg      sE        
5 
 
 
 
 
 
 
 
 
 
r$   rg   c                   *     e Zd ZdgZdef fdZ xZS )GraniteMoeSharedForCausalLMzlm_head.weightr)   c                     t                                          |           t          |          | _        |                                  d S )N)r,   r-   rg   model	post_initr6   s     r%   r-   z$GraniteMoeSharedForCausalLM.__init__   s@       *622
r$   )r   r   r   _tied_weights_keysr   r-   rB   rC   s   @r%   rq   rq      sN        *+5          r$   rq   )rq   rg   rd   )!typingr   r   r   r   activationsr   cache_utilsr   processing_utilsr	   utilsr
   utils.deprecationr   granitemoe.modeling_granitemoer   r   r   r   configuration_granitemoesharedr   
get_loggerr   loggerr   Moduler(   rE   rd   rg   rq   __all__r#   r$   r%   <module>r      s    ' & & & & & & &        ! ! ! ! ! !             & & & & & &       0 0 0 0 0 0            C B B B B B 
	H	%	%    )5    2    ")   4U U U U U#9 U U Up9 9 9 9 9&? 9 9 9

 
 
 
 
O 
 
 
    "7    f
e
er$   