
     `i-B                        d dl mZmZ d dlZd dlmZ ddlmZ ddlmZm	Z	 ddl
mZ ddlmZmZmZ dd	lmZ d
dlmZ d
dlmZmZmZ d
dlmZmZmZmZmZmZmZ ddl m!Z!  ej"        e#          Z$ G d de          Z% G d de          Z& G d de          Z' G d de          Z( G d de          Z) G d de          Z* G d de          Z+ G d de          Z,g d Z-dS )!    )OptionalUnionN)nn   )Cache)BaseModelOutputWithPastMoeModelOutputWithPast)Unpack)auto_docstringcan_return_tuplelogging)deprecate_kwarg   )BambaConfig)
BambaMixerBambaRMSNormGated HybridMambaAttentionDynamicCache)GraniteFlashAttentionKwargsGraniteMoeSharedAttentionGraniteMoeSharedDecoderLayerGraniteMoeSharedForCausalLMGraniteMoeSharedMLPGraniteMoeSharedModelGraniteMoeSharedPreTrainedModel   )GraniteMoeHybridConfigc                   (     e Zd Zdedef fdZ xZS )GraniteMoeHybridAttentionconfig	layer_idxc                 L    t                                          ||           d S Nsuper__init__selfr   r    	__class__s      /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/granitemoehybrid/modular_granitemoehybrid.pyr%   z"GraniteMoeHybridAttention.__init__,   s#    +++++    __name__
__module____qualname__r   intr%   __classcell__r(   s   @r)   r   r   +   sL        ,5 ,# , , , , , , , , , ,r*   r   c                   (     e Zd Zdedef fdZ xZS )GraniteMoeHybridMambaLayerr   r    c                 f    t                                          t          |          |           d S r"   )r$   r%   r   r&   s      r)   r%   z#GraniteMoeHybridMambaLayer.__init__1   s+    V,,i88888r*   r+   r1   s   @r)   r3   r3   0   sL        95 9# 9 9 9 9 9 9 9 9 9 9r*   r3   c                         e Zd Zd fd	Z xZS )GraniteMoeHybridRMSNormGatedư>c                 L    t                                          ||           d S r"   r#   )r'   hidden_sizeepsr(   s      r)   r%   z%GraniteMoeHybridRMSNormGated.__init__6   s#    c*****r*   )r7   )r,   r-   r.   r%   r0   r1   s   @r)   r6   r6   5   s=        + + + + + + + + + +r*   r6   c                   $     e Zd Zdef fdZ xZS )GraniteMoeHybridMLPr   c                 J    t                                          |           d S r"   r#   r'   r   r(   s     r)   r%   zGraniteMoeHybridMLP.__init__;   s!         r*   )r,   r-   r.   r   r%   r0   r1   s   @r)   r<   r<   :   sE        !5 ! ! ! ! ! ! ! ! ! !r*   r<   c                   |    e Zd Zdedef fdZ eddd          	 	 	 	 	 	 	 dd
ej        de	ej                 de	e
         de	e         de	e         de	ej                 de	e         de	eej        ej        f                  dee         deej        e	eej        ej        f                  f         fd            Z xZS )GraniteMoeHybridDecoderLayerr   r    c                 `   t                                          ||           t          |          | _        d | _        d | _        |j        |         dk    rt          ||          | _        nt          ||          | _        |j        |         | _	        t          |dd          dk    | _        d S )Nmambanum_local_expertsr   )r$   r%   r<   
shared_mlp	self_attnrB   layers_block_typer3   r   
layer_typegetattrhas_expertsr&   s      r)   r%   z%GraniteMoeHybridDecoderLayer.__init__@   s    +++-f55
#I.'993FIFFDJJ6vyIIDN 29= #6+>BBQFr*   past_key_valuepast_key_valuesz4.58)new_nameversionNFhidden_statesattention_maskoutput_attentions	use_cachecache_positionoutput_router_logitsposition_embeddingskwargsreturnc	                    |}
|                      |          }| j         | j        d||||d|	}d}n | j        d|||||||d|	\  }}|
|| j        z  z   }|}
|                     |          }| j        r1|                     |          \  }}||                     |          z   }n|                     |          }d}|
|| j        z  z   }|f}|r||fz  }|r||fz  }|S )a0  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*):
                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
                query_sequence_length, key_sequence_length)` if default attention is used.
            past_key_values (`Cache`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence
            output_router_logits (`bool`, *optional*):
                Whether or not to return the logits of all the routers. They are useful for computing the router loss, and
                should not be returned during inference.
            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
            kwargs (`dict`, *optional*):
                Arbitrary kwargs.Can be used to provide `GraniteFlashAttentionKwargs` for
                padding-free training and/or improve torch.compile performance.
        N)rN   rR   cache_paramsrO   )rN   rO   rK   rP   rQ   rR   rT    )input_layernormrB   rE   residual_multiplierpost_attention_layernormrI   block_sparse_moerD   )r'   rN   rO   rK   rP   rQ   rR   rS   rT   rU   residualself_attn_weightsmoe_hidden_statesrouter_logitsoutputss                  r)   forwardz$GraniteMoeHybridDecoderLayer.forwardP   so   L !,,];;:!&DJ +-,-	 
  M !%/=t~ 	0+- /"3#-$7	0 	0 	0 	0,M, !=43K#KK !55mDD 	!/3/D/D]/S/S,}-0N0NNMM OOM::M M =43K#KK " 	,)++G 	(''Gr*   )NNFFNFN)r,   r-   r.   r   r/   r%   r   torchTensorr   r   bool
LongTensortupler
   r   FloatTensorrc   r0   r1   s   @r)   r@   r@   ?   sv       G5 G# G G G G G G  _%0A6RRR 26+/,1$)59/4KOU U|U !.U "%	U
 $D>U D>U !!12U 'tnU &eEL%,,F&GHU 45U 
u (51BEDU1U+V"WW	XU U U SRU U U U Ur*   r@   c                   4     e Zd ZU eed<   dgZdZ fdZ xZS )GraniteMoeHybridPreTrainedModelr   r@   Tc                    t                                          |           t          |t                    ry|j        j                            d           t          j        t          j	        d|j
        dz                       |j        _        |j        j                            d           d S t          |t                    r!|j        j                            d           d S d S )Ng      ?r   )r$   _init_weights
isinstancer3   dt_biasdatafill_rd   logarange	num_headsA_logDr6   weight)r'   moduler(   s     r)   rm   z-GraniteMoeHybridPreTrainedModel._init_weights   s    f%%%f899 	*N%%c*** %	%,q&:JQ:N*O*O P PFLHM$$$$$ <== 	*M$$S)))))	* 	*r*   )	r,   r-   r.   r   __annotations___no_split_modules_is_statefulrm   r0   r1   s   @r)   rk   rk      sT         """"78L* * * * * * * * *r*   rk   c                       e Zd Zdef fdZee	 	 	 	 	 	 	 	 	 	 	 ddeej	                 deej
                 deej	                 deeeeej                 f                  deej                 d	ee         d
ee         dee         dee         dee         deej	                 dee         deeef         fd                        Zd Z xZS )GraniteMoeHybridModelr   c                     t                                                     t          j        fdt	          j                  D                       | _        d S )Nc                 0    g | ]}t          |          S rY   )r@   ).0r    r   s     r)   
<listcomp>z2GraniteMoeHybridModel.__init__.<locals>.<listcomp>   s$    nnn)&)<<nnnr*   )r$   r%   r   
ModuleListrangenum_hidden_layerslayersr>   s    `r)   r%   zGraniteMoeHybridModel.__init__   sV       mnnnneTZTlNmNmnnn
 
r*   N	input_idsrO   position_idsrK   inputs_embedsrQ   rP   output_hidden_statesrS   return_dictrR   rU   rV   c                 P   ||n| j         j        }||n| j         j        }||n| j         j        }|
|
n| j         j        }
|d u |d uz  rt          d          | j        r%| j        r|rt          	                    d           d}|| 
                    |          }|| j        z  }|r|t          	                    d           |B||                                nd}t          j        |||j        d         z   |j                  }||                    d          }|                     |||||          }|                     ||          }|}d }| j        |                     ||          }|rdnd }|rdnd }|	rdnd }| j        D ]^}|j        d	k    r|n|}|r||fz  } ||f||||||	|d
|}|d         }|r|d         ||d         fz  }|	r|d         ||d         fz  }_|                     |          }|r||fz  }|r|j        sd|_        t3          |||||          S )Nz:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.FzGraniteMoeHybrid requires an initialized `HybridMambaAttentionDynamicCache` to return a cache. Because one was not provided, no cache will be returned.r   r   devicerY   rB   )rO   rK   rP   rQ   rR   rS   rT   T)last_hidden_staterK   rN   
attentionsra   )r   rP   r   rQ   use_return_dict
ValueErrorgradient_checkpointingtrainingloggerwarning_onceembed_tokensembedding_multiplierget_seq_lengthrd   rs   shaper   	unsqueeze_update_causal_mask_update_mamba_mask
rotary_embr   rG   normhas_previous_stater	   )r'   r   rO   r   rK   r   rQ   rP   r   rS   r   rR   rU   past_seen_tokenscausal_mask
mamba_maskrN   rT   all_hidden_statesall_self_attnsall_router_logitsdecoder_layer
layer_masklayer_outputss                           r)   rc   zGraniteMoeHybridModel.forward   sN   " 2C1N--TXT_Tq$8$D  $+Jj 	 "+!6IIDK<Q	%0%<kk$+B]-t";< 	[YZZZ& 	4= 	Y 	j   I  --i88M%(AA  	0K  
 !CRC^==???de"\ "2]5H5K"KTaTh  N )33A66L..M>?L]
 
 ,,^^LL
 &"?&"&//-"N"N #7@BBD0:d"6@BBD![ 	> 	>M'4'?7'J'JP[J# 6!m%55!)M
) /"3#-%9$7
 
 
 
M *!,M  : #/"}Q'7&99N# > $0%-*;)==%		-00   	2-!11 	6?#E 	615O.%+++%+
 
 
 	
r*   c                 Z    |}|d         dk    s|t          j        |dk              rd}|S )zv
        No need for zeroing states when
            1. Cached forward
            2. Attending to all inputs
        r   Nr   )rd   all)r'   rO   rR   r   s       r)   r   z(GraniteMoeHybridModel._update_mamba_mask6  s>     $
!q  ^%?EIn`aNaDbDb%?Jr*   )NNNNNNNNNNN)r,   r-   r.   r   r%   r   r   r   rd   rg   re   r   r   listri   rf   r
   r   rh   r   rc   r   r0   r1   s   @r)   r}   r}      s       
5 
 
 
 
 
 
  151537KO59$(,0/3/3&*59s
 s
E,-s
 !.s
 u/0	s

 "%tE4E/F(F"GHs
   12s
 D>s
 $D>s
 'tns
 'tns
 d^s
 !!12s
 45s
 
u--	.s
 s
 s
 ^ s
j	 	 	 	 	 	 	r*   r}   c                   >     e Zd ZdgZdef fdZ	 	 	 	 	 	 ddZ xZS )GraniteMoeHybridForCausalLMzlm_head.weightr   c                     t                                          |           t          |          | _        |                                  d S r"   )r$   r%   r}   model	post_initr>   s     r)   r%   z$GraniteMoeHybridForCausalLM.__init__E  s@       *622
r*   NTc                    |d u }	|	s]||d         |j         d         k    r|d d |j         d          d f         }nX|j         d         |j         d         k    r|d d |f         }n/|r-t          | j        |j         d         | j        | j                  }|b|`|                                                    d          dz
  }|                    |dk    d           |	s|d d |j         d          d f         }||	rd|i}
nd|                                i}
|
	                    |||||d           |
                                D ]\  }}||
vr||
|<   |
S )Nr   r   r   r   r   r   )r   rK   rQ   rO   rR   )r   r   r   dtyper   longcumsummasked_fill_
contiguousupdateitems)r'   r   rK   rO   r   rR   r   rQ   rU   empty_past_kvmodel_inputskeyvalues                r)   prepare_inputs_for_generationz9GraniteMoeHybridForCausalLM.prepare_inputs_for_generationK  s    (4/  	)!"%);;;%aaa.*>q*A)A)C)C&CD		#~';A'>>>%aaa&78	 	>Y_Q/DK  O %,*>)..0077;;a?L%%n&91===  F+AAA	0B/B/D/D,DE $$+];LL')=)=)?)?@L ,#2&"0"0 	
 	
 	
 !,,.. 	* 	*JC,&&$)S!r*   )NNNNNT)r,   r-   r.   _tied_weights_keysr   r%   r   r0   r1   s   @r)   r   r   B  sv        *+5       = = = = = = = =r*   r   )r   r}   rk   ).typingr   r   rd   r   cache_utilsr   modeling_outputsr   r	   processing_utilsr
   utilsr   r   r   utils.deprecationr   bamba.configuration_bambar   bamba.modeling_bambar   r   r   *granitemoeshared.modeling_granitemoesharedr   r   r   r   r   r   r   configuration_granitemoehybridr   
get_loggerr,   r   r   r3   r6   r<   r@   rk   r}   r   __all__rY   r*   r)   <module>r      s    # " " " " " " "                    O O O O O O O O & & & & & & > > > > > > > > > > 0 0 0 0 0 0 3 3 3 3 3 3 b b b b b b b b b b                  C B B B B B 
	H	%	%, , , , , 9 , , ,
9 9 9 9 9 9 9 9
+ + + + +#4 + + +
! ! ! ! !- ! ! !
g g g g g#? g g gT* * * * *&E * * *G G G G G1 G G GTF F F F F"= F F FR f
e
er*   