
     `iHv                     F   d dl mZmZmZ d dlZd dlmc mZ d dlmZ d dl	m
Z
 ddlmZ ddlmZmZ ddlmZ dd	lmZ dd
lmZmZ ddlmZ ddlmZmZmZmZ ddlmZm Z  ddl!m"Z"m#Z# ddl$m%Z%m&Z& ddl'm(Z( ddl)m*Z*m+Z+m,Z, ddl-m.Z. ddl/m0Z0 ddl1m2Z2  G d dej3                  Z4 G d dej3                  Z5 ed           G d dej3                              Z6d Z7dDdZ8dej9        d e:d!ej9        fd"Z;	 dEd$ej3        d%ej9        d&ej9        d'ej9        d(eej9                 d)e<d*e<d+e(e*         fd,Z= G d- d.ej3                  Z> G d/ d0e          Z? G d1 d2ej3                  Z@e+ G d3 d4e&                      ZAe+ G d5 d6eA                      ZB	 	 	 dFd8eej9        eCej9                 df         d9ee:         d(eej9                 d!eej9        e:f         fd:ZDe+ G d; d<eAe                      ZE G d= d>eeA          ZF G d? d@eeA          ZG G dA dBeeA          ZHg dCZIdS )G    )CallableOptionalUnionN)nn)check_model_inputs   )ACT2FN)CacheDynamicCache)GenerationMixin)use_kernel_forward_from_hub)create_causal_mask!create_sliding_window_causal_mask)FlashAttentionKwargs)GenericForQuestionAnswering GenericForSequenceClassificationGenericForTokenClassificationGradientCheckpointingLayer)MoeCausalLMOutputWithPastMoeModelOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuple)deprecate_kwarg)OutputRecorder   )MixtralConfigc                   *     e Zd Zdef fdZd Z xZS )MixtralBlockSparseTop2MLPconfigc                    t                                                       |j        | _        |j        | _        t          j        | j        | j        d          | _        t          j        | j        | j        d          | _	        t          j        | j        | j        d          | _
        t          |j                 | _        d S NFbias)super__init__intermediate_sizeffn_dimhidden_size
hidden_dimr   Linearw1w2w3r	   
hidden_actact_fnselfr%   	__class__s     /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/mixtral/modeling_mixtral.pyr+   z"MixtralBlockSparseTop2MLP.__init__:   s    / ,)DOT\FFF)DL$/FFF)DOT\FFFV./    c                     |                      |                     |                    |                     |          z  }|                     |          }|S N)r5   r1   r3   r2   )r7   hidden_statescurrent_hidden_statess      r9   forwardz!MixtralBlockSparseTop2MLP.forwardE   sJ     $DGGM,B,B C CdggmF\F\ \ $(= > >$$r:   )__name__
__module____qualname__r"   r+   r?   __classcell__r8   s   @r9   r$   r$   9   sS        	0} 	0 	0 	0 	0 	0 	0% % % % % % %r:   r$   c                   F     e Zd ZdZ fdZdej        dej        fdZ xZS )MixtralSparseMoeBlocka  
    This implementation is
    strictly equivalent to standard MoE with full capacity (no
    dropped tokens). It's faster since it formulates MoE operations
    in terms of block-sparse operations to accommodate imbalanced
    assignments of tokens to experts, whereas standard MoE either
    (1) drop tokens at the cost of reduced performance or (2) set
    capacity factor to number of experts and thus waste computation
    and memory on padding.
    c                 |   t                                                       j        | _        j        | _        j        | _        j        | _	        t          j        | j        | j        d          | _        t          j        fdt          | j                  D                       | _        j        | _        d S )NFr(   c                 .    g | ]}t                    S  )r$   ).0_r%   s     r9   
<listcomp>z2MixtralSparseMoeBlock.__init__.<locals>.<listcomp>a   s"    %i%i%iA&?&G&G%i%i%ir:   )r*   r+   r.   r/   r,   r-   num_local_expertsnum_expertsnum_experts_per_toktop_kr   r0   gate
ModuleListrangeexpertsrouter_jitter_noisejitter_noiser6   s    `r9   r+   zMixtralSparseMoeBlock.__init__W   s     ,/!3/
 Idot/?eLLL	}%i%i%i%iQVW[WgQhQh%i%i%ijj #6r:   r=   returnc                    |j         \  }}}| j        rF| j        dk    r;|t          j        |                              d| j        z
  d| j        z             z  }|                    d|          }|                     |          }t          j	        |dt          j
                  }t          j        || j        d          \  }}||                    dd          z  }|                    |j                  }t          j        ||z  |f|j        |j        	          }t          j        j                            || j        
                              ddd          }	t          j        |	                    d          d                                          }
|
D ]}| j        |         }t          j        |	|                             d                    \  }}|d|f                             d|          } ||          |||df         z  }|                    d||                    |j                             |                    |||          }||fS ) r   g      ?r!   dimdtyper\   T)r\   keepdim)r]   device)num_classes   )rZ   N)shapetrainingrV   torch
empty_likeuniform_viewrQ   FsoftmaxfloattopkrP   sumtor]   zerosr`   r   
functionalone_hotrN   permutegreaternonzerorT   wheresqueezereshape
index_add_)r7   r=   
batch_sizesequence_lengthr/   router_logitsrouting_weightsselected_expertsfinal_hidden_statesexpert_mask
expert_hit
expert_idxexpert_layeridxtop_xcurrent_stater>   s                    r9   r?   zMixtralSparseMoeBlock.forwardf   s]   2?2E/
OZ= 	xT.22U-m<<EEcDL]F]_beiev_vwwwM%**2z::		-00)MqLLL,1J
XZ,[,[,[))?..2t.DDD),,]-@AA#k/):6m>QZgZn
 
 
 h)112BPTP`1aaiijkmnpqrr];??x?#@#@!DDLLNN
$ 	d 	dJ<
3L[%<%D%DQ%G%GHHJC *$+6>>r:NNM$0L$?$?/RWY\^bRbBc$c!  **1e5J5M5MmNa5b5bcccc199*oWabb"M11r:   )	r@   rA   rB   __doc__r+   rf   Tensorr?   rC   rD   s   @r9   rF   rF   K   sh        	 	7 7 7 7 7%2U\ %2el %2 %2 %2 %2 %2 %2 %2 %2r:   rF   RMSNormc                   ,     e Zd Zd fd	Zd Zd Z xZS )MixtralRMSNormư>c                     t                                                       t          j        t	          j        |                    | _        || _        dS )z=
        MixtralRMSNorm is equivalent to T5LayerNorm
        N)r*   r+   r   	Parameterrf   onesweightvariance_epsilon)r7   r.   epsr8   s      r9   r+   zMixtralRMSNorm.__init__   sD     	l5:k#:#:;; #r:   c                    |j         }|                    t          j                  }|                    d                              dd          }|t          j        || j        z             z  }| j        |                    |          z  S )Nrb   rZ   T)r_   )	r]   ro   rf   float32powmeanrsqrtr   r   )r7   r=   input_dtypevariances       r9   r?   zMixtralRMSNorm.forward   s|    #)%((77 $$Q'',,R,>>%Ht?T4T(U(UU{]--k::::r:   c                 H    t          | j        j                   d| j         S )Nz, eps=)tupler   rd   r   )r7   s    r9   
extra_reprzMixtralRMSNorm.extra_repr   s&    )**II$2GIIIr:   )r   )r@   rA   rB   r+   r?   r   rC   rD   s   @r9   r   r      sb        $ $ $ $ $ $; ; ;J J J J J J Jr:   r   c                     | dd| j         d         dz  f         }| d| j         d         dz  df         }t          j        | |fd          S )z*Rotates half the hidden dims of the input..NrZ   rb   r^   )rd   rf   cat)xx1x2s      r9   rotate_halfr      s]    	
3"!'"+"""	#B	
3q """	#B9rc2YB''''r:   c                     |                     |          }|                     |          }| |z  t          |           |z  z   }||z  t          |          |z  z   }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )	unsqueezer   )qkcossinposition_idsunsqueeze_dimq_embedk_embeds           r9   apply_rotary_pos_embr      sc    ( --
&
&C
--
&
&C3w;q>>C/0G3w;q>>C/0GGr:   r=   n_reprW   c                     | j         \  }}}}|dk    r| S | dddddddddf                             |||||          } |                     |||z  ||          S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r!   N)rd   expandrx   )r=   r   batchnum_key_value_headsslenhead_dims         r9   	repeat_kvr      s    
 2?1D.Ehzz!!!!QQQaaa"23::5BUW\^bdlmmM  (;e(CT8TTTr:           modulequerykeyvalueattention_maskscalingdropoutkwargsc                 R   t          || j                  }t          || j                  }	t          j        ||                    dd                    |z  }
|$|d d d d d d d |j        d         f         }|
|z   }
t          j                            |
dt          j	                  
                    |j                  }
t          j                            |
|| j                  }
t          j        |
|	          }|                    dd                                          }||
fS )Nrb   r   rc   rZ   r[   )pre   r!   )r   num_key_value_groupsrf   matmul	transposerd   r   rq   rk   r   ro   r]   r   re   
contiguous)r   r   r   r   r   r   r   r   
key_statesvalue_statesattn_weightscausal_maskattn_outputs                r9   eager_attention_forwardr      s    3 ;<<JUF$?@@L<z';';Aq'A'ABBWLL!$QQQ111.D
0@0D.D%DE#k1=((2U](SSVVW\WbccL=((6?([[L,|\::K''1--88::K$$r:   c                        e Zd ZdZdedef fdZ eddd          	 	 dd
ej	        de
ej	        ej	        f         deej	                 dee         deej                 dee         de
ej	        eej	                 f         fd            Z xZS )MixtralAttentionz=Multi-headed attention from 'Attention Is All You Need' paperr%   	layer_idxc                    t                                                       || _        || _        t	          |dd           p|j        |j        z  | _        |j        |j        z  | _	        | j        dz  | _
        |j        | _        d| _        t          j        |j        |j        | j        z  d          | _        t          j        |j        |j        | j        z  d          | _        t          j        |j        |j        | j        z  d          | _        t          j        |j        | j        z  |j        d          | _        d S )Nr   g      TFr(   )r*   r+   r%   r   getattrr.   num_attention_headsr   r   r   r   attention_dropout	is_causalr   r0   q_projk_projv_projo_projr7   r%   r   r8   s      r9   r+   zMixtralAttention.__init__   s    "
D99mV=OSYSm=m$*$>&B\$\!}d*!'!9i 2F4NQUQ^4^ejkkki 2F4NQUQ^4^ejkkki 2F4NQUQ^4^ejkkki :T] JFL^ejkkkr:   past_key_valuepast_key_values4.58new_nameversionNr=   position_embeddingsr   cache_positionr   rW   c           
      n   |j         d d         }g |d| j        R }|                     |                              |                              dd          }	|                     |                              |                              dd          }
|                     |                              |                              dd          }|\  }}t          |	|
||          \  }	}
|&|||d}|                    |
|| j	        |          \  }
}t          }| j        j        dk    rt          | j        j                 } || |	|
||f| j        sdn| j        | j        t#          | j        dd           d|\  }} |j        g |dR                                  }|                     |          }||fS )	NrZ   r!   rb   )r   r   r   eagerr   sliding_window)r   r   r   )rd   r   r   ri   r   r   r   r   updater   r   r%   _attn_implementationr   re   r   r   r   rx   r   r   )r7   r=   r   r   r   r   r   input_shapehidden_shapequery_statesr   r   r   r   cache_kwargsattention_interfacer   r   s                     r9   r?   zMixtralAttention.forward   s    $)#2#.88b8$-88{{=1166|DDNNqRSTT[[//44\BBLLQPQRR
{{=1166|DDNNqRSTT&S#7jRUWZ#[#[ j&#&snUUL'6'='=j,X\Xfht'u'u$J(?;+w66"9$+:Z"[$7$7
%
  $}HCC$2HL"4;0@$GG
%
 
%
 
%
 
%
!\ *k);;;;;;FFHHkk+..L((r:   )NN)r@   rA   rB   r   r"   intr+   r   rf   r   r   r   r
   
LongTensorr   r   r?   rC   rD   s   @r9   r   r      s       GGl} l l l l l l l _%0A6RRR ,059*) *)|*) #5<#=>*) !.	*)
 "%*) !!12*) -.*) 
u|Xel33	4*) *) *) SR*) *) *) *) *)r:   r   c                       e Zd Zdedef fdZ eddd          	 	 	 	 dd	ej        d
e	ej        ej        f         de
ej                 de
ej                 de
e         de
ej                 dee         dej        fd            Z xZS )MixtralDecoderLayerr%   r   c                 2   t                                                       |j        | _        t          ||          | _        t          |          | _        t          |j        |j                  | _	        t          |j        |j                  | _
        d S )Nr   )r*   r+   r.   r   	self_attnrF   block_sparse_moer   rms_norm_epsinput_layernormpost_attention_layernormr   s      r9   r+   zMixtralDecoderLayer.__init__+  s    !-)&)<< 5f = =-f.@fFYZZZ(6v7IvOb(c(c(c%%%r:   r   r   r   r   Nr=   r   r   r   r   r   rW   c           
          |}|                      |          } | j        d||||||d|\  }}	||z   }|}|                     |          }|                     |          \  }}	||z   }|S )N)r=   r   r   r   r   r   rI   )r   r   r   r   )
r7   r=   r   r   r   r   r   r   residualrK   s
             r9   r?   zMixtralDecoderLayer.forward5  s     !,,];; *4> 
' 3)%+)
 
 
 
q !=0 !55mDD00??q =0r:   )NNNN)r@   rA   rB   r"   r   r+   r   rf   r   r   r   r   r
   r   r   FloatTensorr?   rC   rD   s   @r9   r   r   *  s       d} d d d d d d d _%0A6RRR
 2637+/59   |  #5<#=>  !.	 
 u/0  "%  !!12  +,  
	      SR         r:   r   c                   |     e Zd ZU ej        ed<   ddef fdZ ej                    e	d                         Z
 xZS )MixtralRotaryEmbeddinginv_freqNr%   c                    t                                                       t          |d          rSt          |j        t
                    r9|j                            d|j                            d                    | _        nd| _        |j        | _	        |j        | _
        || _        t          | j                 | _        |                     | j        |          \  }| _        |                     d|d           | j        | _        d S )Nrope_scaling	rope_typetypedefaultr   F)
persistent)r*   r+   hasattr
isinstancer   dictgetr   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenr%   r   rope_init_fnattention_scalingregister_bufferr   original_inv_freq)r7   r%   r`   r   r8   s       r9   r+   zMixtralRotaryEmbedding.__init__\  s    6>** 	'z&:Mt/T/T 	'#044[&BUBYBYZ`BaBabbDNN&DN"("@$*$B!/?+/+<+<T[&+Q+Q($(ZeDDD!%r:   c                 X   | j         d d d d f                                                             |j        d         dd                              |j                  }|d d d d d f                                         }t          |j        j        t                    r|j        j        dk    r|j        j        nd}t          j
        |d          5  |                                |                                z                      dd          }t          j        ||fd	          }|                                | j        z  }|                                | j        z  }	d d d            n# 1 swxY w Y   |                    |j        
          |	                    |j        
          fS )Nr   rZ   r!   mpscpuF)device_typeenabledrb   r^   )r]   )r   rl   r   rd   ro   r`   r  r   strrf   autocastr   r   r   r	  r   r]   )
r7   r   r   inv_freq_expandedposition_ids_expandedr  freqsembr   r   s
             r9   r?   zMixtralRotaryEmbedding.forwardm  s    !M$4-8>>@@GGHZ[\H]_acdeehhijiqrr ,QQQaaaZ 8 > > @ @'1!(-'E'Ek!(-[`J`J`ahmmfk^UCCC 	5 	5&,,..1F1L1L1N1NNYYZ[]^__E)UEN333C''))d44C''))d44C		5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 vvAGv$$cff17f&;&;;;s   BE++E/2E/r<   )r@   rA   rB   rf   r   __annotations__r"   r+   no_gradr   r?   rC   rD   s   @r9   r   r   Y  s         l/ /} / / / / / /" U]__< <  _< < < < <r:   r   c                   d    e Zd ZU eed<   dZdZdgZdgZdZ	dZ
dZdZdZ eed          eed	Zd
S )MixtralPreTrainedModelr%   modelTr   r   Fr!   )index)r|   r=   
attentionsN)r@   rA   rB   r"   r  base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraph_supports_attention_backendr    rF   r   r   _can_record_outputsrI   r:   r9   r  r  }  s         &*#./#4"5N""&'(=QGGG,& r:   r  c                       e Zd Zdef fdZee	 	 	 	 	 	 	 ddeej	                 deej
                 deej	                 dee         deej                 d	ee         d
eej	                 dee         defd                        Z xZS )MixtralModelr%   c                    t                                                     j        | _        j        | _        t          j        j        j        | j                  | _        t          j	        fdt          j                  D                       | _        t          j        j                  | _        t!                    | _        d| _        |                                  d S )Nc                 0    g | ]}t          |          S rI   )r   )rJ   r   r%   s     r9   rL   z)MixtralModel.__init__.<locals>.<listcomp>  s$    eee	 33eeer:   r   r%   F)r*   r+   pad_token_idpadding_idx
vocab_sizer   	Embeddingr.   embed_tokensrR   rS   num_hidden_layerslayersr   r   normr   
rotary_embgradient_checkpointing	post_initr6   s    `r9   r+   zMixtralModel.__init__  s       !. +L):F<NPTP`aameeeeU6KcEdEdeee
 
 #6#56;NOOO	0???&+# 	r:   N	input_idsr   r   r   inputs_embeds	use_cacher   r   rW   c                 |   |d u |d uz  rt          d          |r|t          | j                  }||                     |          }|B||                                nd}	t          j        |	|	|j        d         z   |j                  }||	                    d          }| j        j
        t          nt          }
 |
| j        |||||          }|}|                     ||          }| j        d | j        j                 D ]} ||f||||||d|}|                     |          }t#          ||          S )	Nz:You must specify exactly one of input_ids or inputs_embedsr,  r   r!   )r`   )r%   input_embedsr   r   r   r   )r   r   r   r   r:  r   )last_hidden_stater   )
ValueErrorr   r%   r1  get_seq_lengthrf   arangerd   r`   r   r   r   r   r5  r3  r2  r4  r   )r7   r8  r   r   r   r9  r:  r   r   past_seen_tokensmask_functionr   r=   r   decoder_layers                  r9   r?   zMixtralModel.forward  s    -t";< 	[YZZZ 	?0*$+>>>O  --i88M!CRC^==???de"\ "2]5H5K"KTaTh  N )33A66L.2k.H.P**Vw#m;&))+%
 
 
 & #oom\JJ![)H4;+H)HI 
	 
	M)M	$7*) /#-	 	 	 	MM 		-00%++
 
 
 	
r:   )NNNNNNN)r@   rA   rB   r"   r+   r   r   r   rf   r   r   r
   r   boolr   r   r   r?   rC   rD   s   @r9   r)  r)    s       }         151537+/59$(59<
 <
E,-<
 !.<
 u/0	<

 "%<
   12<
 D><
 !!12<
 +,<
 
 <
 <
 <
 ^ <
 <
 <
 <
 <
r:   r)  rb   gate_logitsrN   c                    | t          | t                    sdS t          | t                    r/| d         j        t          j        fd| D             d          }t          j        j                            |d          }t          j        ||d          \  }}t          j        j        	                    ||          }|@t          j
        |                                d          }	t          j
        |d          }
n.|j        \  }}|j        d         ||z  z  }|dddddddf                             |||||f                              d||                                        }t          j        |                                |z  d          t          j        |d          z  }	|ddddddf                             ||||f                              d|                                        }t          j        ||z  d          t          j        |d          z  }
t          j        |	|
                    d          z            }||z  S )a  
    Computes auxiliary load balancing loss as in Switch Transformer - implemented in Pytorch.

    See Switch Transformer (https://huggingface.co/papers/2101.03961) for more details. This function implements the loss
    function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between
    experts is too unbalanced.

    Args:
        gate_logits:
            Logits from the `gate`, should be a tuple of model.config.num_hidden_layers tensors of
            shape [batch_size X sequence_length, num_experts].
        num_experts:
            Number of experts
        top_k:
            The number of experts to route per-token, can be also interpreted as the `top-k` routing
            parameter.
        attention_mask (`torch.Tensor`, *optional*):
            The attention_mask used in forward function
            shape [batch_size X sequence_length] if not None.

    Returns:
        The auxiliary loss.
    Nr   c                 :    g | ]}|                               S rI   )ro   )rJ   
layer_gatecompute_devices     r9   rL   z,load_balancing_loss_func.<locals>.<listcomp>  s&    -j-j-jPZjmmN.K.K-j-j-jr:   r^   rZ   )r  r   r`   rf   r   r   rq   rk   rm   rr   r   rl   rd   r   rx   ro   rn   r   )rE  rN   rP   r   concatenated_gate_logitsr}   rK   r~   r   tokens_per_expertrouter_prob_per_expertrz   r{   r2  expert_attention_mask router_per_expert_attention_maskoverall_lossrI  s                    @r9   load_balancing_loss_funcrP    s   : *[%"@"@q+u%% s$Q.#(9-j-j-j-j^i-j-j-jpq#r#r#r h)112JPR1SSO*_eDDDA(%--.>LLK!J{'8'8':':BBB "'O!C!C!C&4&:#
O4:1=*B^_ 4AAAtT12V&
OUKXYYWR,,R	 	 "Ik&7&7&9&9<Q&QWXYYY\a\e!q]
 ]
 ]
 
 4AAAt+,V&
O[QRRWR%%R	 	) "'?=]+]cd!e!e!ehmhq,!i
 i
 i
 "
 9.1G1Q1QRS1T1TTUUL+%%r:   c                   x    e Zd ZdgZddiZddgdgfiZ fdZee	 	 	 	 	 	 	 	 	 	 dd	e	e
j                 d
e	e
j                 de	e
j                 de	e         de	e
j                 de	e
j                 de	e         de	e         de	e
j                 deee
j        f         dee         defd                        Z xZS )MixtralForCausalLMzlm_head.weightlm_headcolwise_repr=   logitsc                 F   t                                          |           t          |          | _        |j        | _        t          j        |j        |j        d          | _        |j	        | _	        |j
        | _        |j        | _        |                                  d S r'   )r*   r+   r)  r  r/  r   r0   r.   rS  router_aux_loss_coefrM   rN   rO   r7  r6   s     r9   r+   zMixtralForCausalLM.__init__;  s       !&))
 +y!3V5FUSSS$*$?!!3#)#=  	r:   Nr   r8  r   r   r   r9  labelsr:  output_router_logitsr   logits_to_keepr   rW   c                    ||n| j         j        } | j        d||||||||	d|}|j        }t	          |
t
                    rt          |
 d          n|
}|                     |dd|ddf                   }d}| | j        ||| j	        fi |}d}|rHt          |j        | j        | j        |          }|%|| j        |                    |j                  z  z  }t#          ||||j        |j        |j        |j                  S )a~  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, MixtralForCausalLM

        >>> model = MixtralForCausalLM.from_pretrained("mistralai/Mixtral-8x7B-v0.1")
        >>> tokenizer = AutoTokenizer.from_pretrained("mistralai/Mixtral-8x7B-v0.1")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```N)r8  r   r   r   r9  r:  rY  r   )lossaux_lossrU  r   r=   r  r|   rI   )r%   rY  r  r=  r  r   slicerS  loss_functionr/  rP  r|   rN   rO   rW  ro   r`   r   r   r=   r  )r7   r8  r   r   r   r9  rX  r:  rY  r   rZ  r   outputsr=   slice_indicesrU  r\  r]  s                     r9   r?   zMixtralForCausalLM.forwardG  sq   P %9$D  $+Jj 	
 +5$* 
+
)%+'!5)
+
 
+
 
+
 
+
  18B>SV8W8Wk~ot444]kmAAA}aaa,?@AA%4%ffdoPPPPD 	M/% (	 H !1HKK4L4LLL(#3!/)!/
 
 
 	
r:   )
NNNNNNNNNr   )r@   rA   rB   _tied_weights_keys_tp_plan_pp_planr+   r   r   r   rf   r   r   r
   r   rD  r   r   r   r   r   r?   rC   rD   s   @r9   rR  rR  5  s       *+=)H_-z:;H
 
 
 
 
  151537+/59-1$(/35934R
 R
E,-R
 !.R
 u/0	R

 "%R
   12R
 )*R
 D>R
 'tnR
 !!12R
 c5</0R
 +,R
 
#R
 R
 R
 ^ R
 R
 R
 R
 R
r:   rR  c                       e Zd ZdS ) MixtralForSequenceClassificationNr@   rA   rB   rI   r:   r9   rf  rf            Dr:   rf  c                       e Zd ZdS )MixtralForTokenClassificationNrg  rI   r:   r9   rj  rj    rh  r:   rj  c                       e Zd ZdS )MixtralForQuestionAnsweringNrg  rI   r:   r9   rl  rl    rh  r:   rl  )rR  rl  r)  r  rf  rj  )Nr!   )r   )Nrb   N)Jtypingr   r   r   rf   torch.nn.functionalr   rq   rj   transformers.utils.genericr   activationsr	   cache_utilsr
   r   
generationr   integrationsr   masking_utilsr   r   modeling_flash_attention_utilsr   modeling_layersr   r   r   r   modeling_outputsr   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   utils.deprecationr   utils.genericr    configuration_mixtralr"   Moduler$   rF   r   r   r   r   r   r   rl   r   r   r   r   r  r)  r   rP  rR  rf  rj  rl  __all__rI   r:   r9   <module>r     s  6 - , , , , , , , , ,                 9 9 9 9 9 9 ! ! ! ! ! ! . . . . . . . . ) ) ) ) ) ) 7 7 7 7 7 7 R R R R R R R R B B B B B B            R Q Q Q Q Q Q Q K K K K K K K K F F F F F F F F & & & & & & I I I I I I I I I I 0 0 0 0 0 0 + + + + + + 0 0 0 0 0 0% % % % %	 % % %$@2 @2 @2 @2 @2BI @2 @2 @2F Y''J J J J JRY J J ('J(( ( (   6	UU\ 	U# 	U%, 	U 	U 	U 	U& % %I%<% 
% <	%
 U\*% % % '(% % % %4<) <) <) <) <)ry <) <) <)~, , , , ,4 , , ,^!< !< !< !< !<RY !< !< !<H     _   $ O
 O
 O
 O
 O
) O
 O
 O
h "&
-1	O& O&u|U5<%8$>?O&#O& U\*	O&
 5<O& O& O& O&d e
 e
 e
 e
 e
/ e
 e
 e
P	 	 	 	 	'GI_ 	 	 		 	 	 	 	$ACY 	 	 		 	 	 	 	"=?U 	 	 	  r:   