
     `iQ                        d Z ddlmZ ddlmZmZ ddlZddlmZ ddlm	Z	 ddl
mZ dd	lmZ dd
lmZ ddlmZ ddlmZmZ ddlmZ ddlmZmZmZ ddlmZ  ej        e          Zd Z  G d dej!                  Z"d Z#d.dZ$	 d/dej!        dej%        dej%        dej%        deej%                 de&de&fdZ' G d d ej!                  Z( G d! d"ej!                  Z) G d# d$ej!                  Z* G d% d&e          Z+ G d' d(ej!                  Z,e G d) d*e                      Z-d+ Z.e G d, d-e-                      Z/d-d*gZ0dS )0zPyTorch Pixtral model.    )Callable)OptionalUnionN)nn   )ACT2FN)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutput)dynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)auto_docstringcan_return_tuplelogging   )PixtralVisionConfigc                    g }| D ]}|j         dd          \  }}t          j        t          j        |          t          j        |          d          }t          j        |d                              dd                              dd          \  }}||z  |z   }	|                    |	d d df                    t          j        |          S )Nij)indexingdim   r   )	shapetorchmeshgridarangestackreshapechunkappendcat)
patch_embeds_list	max_width	positionspatchheightwidthmeshh_gridv_grididss
             /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/pixtral/modeling_pixtral.pyposition_ids_in_meshgridr1   %   s    I" $ $BCC(~el622EL4G4GRVWWWTr222::2qAAGG2NNy 6)QQQT####9Y    c                   z     e Zd ZU dZej        ed<   d fd	Z ej                    e	d                         Z
 xZS )PixtralRotaryEmbeddinga  
    The key with pixtral embedding is just that you have a frequency for each pixel positions.
    If you have height x width pixels (or embedding pixels), then the frequency used for ROPE
    is given by indexing the pre_computed frequency on the width and height.

    What you output is of dimension (batch, height * width, dim) with dim the embed dim.

    This simply means that for each image hidden state, you are going to add
    a corresponding positional embedding, based on its index in the grid.
    inv_freqNc                    t                                                       d| _        |j        | _        |j        | _        |j        |j        z  }d| j        t          j
        d| j        d                                          | j        z  z  z  }t          j
        ||j                  }t          j
        ||j                  }t          j        ||d d d                                                   }t          j        ||dd d                                                   }t          j        |d d d d d f                             d|d          |d d d d d f                             |dd          gd                              d| j        dz            }	|                     d	t          j        |	|	fd          d
           d S )Ndefault      ?r   r   )devicer   r   r   r5   F)
persistent)super__init__	rope_typehead_dimr   
rope_thetabase
image_size
patch_sizer   r    floatr9   outerr%   repeatr"   register_buffer)selfconfigr9   max_patches_per_sidefreqshwfreqs_hfreqs_wr5   	__class__s             r0   r<   zPixtralRotaryEmbedding.__init__>   s   "?%	%0F4EEtyU\!TXq%A%A%G%G%I%IDH%TUVL-elCCCL-elCCC+asss,,2244+aqt!t--335594
#**1.BAFFaaa
#**+?AFF 
 
 
 '"dh!m
$
$ 	 	ZHh3GR)P)P)P]bcccccr2   c                    | j         |         }t          |j        j        t                    r|j        j        dk    r|j        j        nd}t          j        |d          5  |}|                                }|                                }d d d            n# 1 swxY w Y   |	                    |j
                  |	                    |j
                  fS )NmpscpuF)device_typeenabled)dtype)r5   
isinstancer9   typestrr   autocastcossintorU   )rG   xposition_idsrJ   rS   embrZ   r[   s           r0   forwardzPixtralRotaryEmbedding.forwardW   s     l+'1!(-'E'Ek!(-[`J`J`ahmmfk^UCCC 	 	C''))C''))C	 	 	 	 	 	 	 	 	 	 	 	 	 	 	
 vvAGv$$cff17f&;&;;;s   !+BBBN)__name__
__module____qualname____doc__r   Tensor__annotations__r<   no_gradr   r`   __classcell__rO   s   @r0   r4   r4   0   s         	 	 ld d d d d d2 U]__	< 	<  _	< 	< 	< 	< 	<r2   r4   c                     | dd| j         d         dz  f         }| d| j         d         dz  df         }t          j        | |fd          S )z*Rotates half the hidden dims of the input..Nr   r   r   )r   r   r%   )r]   x1x2s      r0   rotate_halfrn   f   s]    	
3"!'"+"""	#B	
3q """	#B9rc2YB''''r2   c                     |                     |          }|                     |          }| |z  t          |           |z  z   }||z  t          |          |z  z   }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )	unsqueezern   )qkrZ   r[   r^   unsqueeze_dimq_embedk_embeds           r0   apply_rotary_pos_embrv   m   sc    ( --
&
&C
--
&
&C3w;q>>C/0G3w;q>>C/0GGr2           modulequerykeyvalueattention_maskscalingdropoutc                    t          j        ||                    dd                    |z  }|||z   }t          j                            |dt           j                                      |j                  }t          j        	                    ||| j
                  }t          j        ||          }	|	                    dd                                          }	|	|fS )Nr   r   )r   rU   )ptrainingr   r   )r   matmul	transposer   
functionalsoftmaxfloat32r\   rU   r~   r   
contiguous)
rx   ry   rz   r{   r|   r}   r~   kwargsattn_weightsattn_outputs
             r0   eager_attention_forwardr      s     <s}}R'<'<==GL!#n4=((2U](SSVVW\WbccL=((6?([[L,|U33K''1--88::K$$r2   c                        e Zd ZdZ fdZ	 	 	 ddej        deej                 deeej        ej        f                  dee	         d	e
e         d
eej        eej                 f         fdZ xZS )PixtralAttentionzI
    Multi-headed attention compatible with ALL_ATTENTION_FUNCTIONS.
    c                 0   t                                                       || _        |j        | _        |j        | _        | j        | j        z  | _        d| _        | j        dz  | _	        d| _        |j
        | _        t          j        | j        | j        d          | _        t          j        | j        | j        d          | _        t          j        | j        | j        d          | _        t          j        | j        | j        d          | _        d S )NFg      ࿩bias)r;   r<   rH   hidden_size	embed_dimnum_attention_heads	num_headsr>   	is_causalr}   attention_dropoutr~   r   Lineark_projv_projq_projo_projrG   rH   rO   s     r0   r<   zPixtralAttention.__init__   s    +3$.8}d*/iUKKKiUKKKiUKKKiUKKKr2   NFhidden_statesr|   position_embeddingsoutput_attentionsr   returnc                    |                                 \  }}}|                     |          }	|                     |          }
|                     |          }|	                    ||| j        | j                                      dd          }	|
                    ||| j        | j                                      dd          }
|                    ||| j        | j                                      dd          }|\  }}t          |	|
||d          \  }	}
t          }| j
        j        dk    rt          | j
        j                 }| j
        j        dk    r%|d                             |j        d	          |d<    || |	|
||f| j        sd
n| j        | j        d|\  }}|                    ||d                                          }|                     |          }|sd}||fS )z#Input shape: Batch x Time x Channelr   r   r   )rs   eagerflash_attention_2r^   T)non_blockingrw   )r~   r}   r   N)sizer   r   r   viewr   r>   r   rv   r   rH   _attn_implementationr   r\   r9   r   r~   r}   r"   r   r   )rG   r   r|   r   r   r   
batch_sizepatches_query_states
key_statesvalue_statesrZ   r[   attention_interfacer   r   s                    r0   r`   zPixtralAttention.forward   s    "/!3!3!5!5
GQ{{=11[[//
{{=11#((Wdndm\\ffghjkll__Z$.$-XXbbcdfghh
#((Wdndm\\ffghjkll&S#7jRUWZjk#l#l#l j(?;+w66"9$+:Z"[ ;+/BBB%+N%;%>%>}?Sbf%>%g%gF>"$7$7	%
  $}>CC$,L	%
 	%
 	%
 	%
!\ "))*grBBMMOOkk+..  	 LL((r2   )NNF)rb   rc   rd   re   r<   r   rf   r   tupleboolr   r	   r`   ri   rj   s   @r0   r   r      s         L L L L L* 26KO,1/) /)|/) !./) &eEL%,,F&GH	/)
 $D>/) -./) 
u|Xel33	4/) /) /) /) /) /) /) /)r2   r   c                   $     e Zd Z fdZd Z xZS )
PixtralMLPc                    t                                                       || _        |j        | _        |j        | _        t          j        | j        | j        d          | _        t          j        | j        | j        d          | _        t          j        | j        | j        d          | _	        t          |j                 | _        d S )NFr   )r;   r<   rH   r   intermediate_sizer   r   	gate_projup_proj	down_projr   
hidden_actact_fnr   s     r0   r<   zPixtralMLP.__init__   s    !-!'!94#3T5KRWXXXy!143IPUVVV4#94;KRWXXXV./r2   c                     |                      |                     |                     |                    |                     |          z            }|S ra   )r   r   r   r   )rG   r]   r   s      r0   r`   zPixtralMLP.forward   sA    NN4;;t~~a/@/@#A#ADLLQROO#STT	r2   )rb   rc   rd   r<   r`   ri   rj   s   @r0   r   r      sG        0 0 0 0 0      r2   r   c                   ,     e Zd Zd fd	Zd Zd Z xZS )PixtralRMSNormư>c                     t                                                       t          j        t	          j        |                    | _        || _        dS )z=
        PixtralRMSNorm is equivalent to T5LayerNorm
        N)r;   r<   r   	Parameterr   onesweightvariance_epsilon)rG   r   epsrO   s      r0   r<   zPixtralRMSNorm.__init__   sD     	l5:k#:#:;; #r2   c                    |j         }|                    t          j                  }|                    d                              dd          }|t          j        || j        z             z  }| j        |                    |          z  S )Nr   r   T)keepdim)	rU   r\   r   r   powmeanrsqrtr   r   )rG   r   input_dtypevariances       r0   r`   zPixtralRMSNorm.forward  s|    #)%((77 $$Q'',,R,>>%Ht?T4T(U(UU{]--k::::r2   c                 H    t          | j        j                   d| j         S )Nz, eps=)r   r   r   r   rG   s    r0   
extra_reprzPixtralRMSNorm.extra_repr  s&    )**II$2GIIIr2   )r   )rb   rc   rd   r<   r`   r   ri   rj   s   @r0   r   r      sb        $ $ $ $ $ $; ; ;J J J J J J Jr2   r   c                        e Zd Z fdZ	 	 d
dej        dej        deeej        ej        f                  dee         de	e
         deej                 fd	Z xZS )PixtralAttentionLayerc                    t                                                       t          |j        d          | _        t          |          | _        t          |          | _        t          |j        d          | _	        d S )Nh㈵>r   )
r;   r<   r   r   attention_normr   feed_forwardr   	attentionffn_normr   s     r0   r<   zPixtralAttentionLayer.__init__  sk    ,V-?TJJJ&v..)&11&v'9tDDDr2   Nr   r|   r   r   r   r   c                     |}|                      |          } | j        d||||d|\  }}||z   }|}|                     |          }|                     |          }||z   }|f}|r||fz  }|S )a=  
        Args:
            hidden_states (`torch.FloatTensor`):
                Input to the layer of shape `(batch, seq_len, embed_dim)`.
            attention_mask (`torch.FloatTensor`):
                Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values.
            output_attentions (`bool`, *optional*, defaults to `False`):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        )r   r|   r   r    )r   r   r   r   )	rG   r   r|   r   r   r   residualr   outputss	            r0   r`   zPixtralAttentionLayer.forward  s    $ !++M::&4dn '
') 3/	'
 '

 '
 '
#| !=0 m44))-88 =0 " 	'&Gr2   )NN)rb   rc   rd   r<   r   rf   r   r   r   r   r	   FloatTensorr`   ri   rj   s   @r0   r   r     s        E E E E E LP,0' '|' ' &eEL%,,F&GH	'
 $D>' -.' 
u 	!' ' ' ' ' ' ' 'r2   r   c                        e Zd Z fdZ	 	 	 	 	 ddeej                 deeej        ej        f                  dee         dee         dee         de	e
         d	eeef         fd
Z xZS )PixtralTransformerc                 &   t                                                       || _        t          j                                        | _        t          |j                  D ])}| j        	                    t          |                     *d| _        d S )NF)r;   r<   rH   r   r   
ModuleListlayersrangenum_hidden_layersr$   r   gradient_checkpointing)rG   rH   r   rO   s      r0   r<   zPixtralTransformer.__init__B  s    h))++v/00 	> 	>AK4V<<====&+###r2   Nr|   r   r   output_hidden_statesreturn_dictr   r   c                 X   ||n| j         j        }||n| j         j        }||n| j         j        }|rdnd}|rdnd}	|}
| j        D ]-}|r||
fz   } ||
|f||d|}|d         }
|r|	|d         fz   }	.|r||
fz   }|st          d |
||	fD                       S t          |
||	          S )av  
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Embeddings which serve as input to the Transformer.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        Nr   )r   r   r   r   c              3      K   | ]}||V  	d S ra   r   ).0vs     r0   	<genexpr>z-PixtralTransformer.forward.<locals>.<genexpr>  s(      eeqWXWdWdWdWdWdeer2   )last_hidden_stater   
attentions)rH   r   r   use_return_dictr   r   r   )rG   inputs_embedsr|   r   r   r   r   r   encoder_statesall_attentionsr   encoder_layerlayer_outputss                r0   r`   zPixtralTransformer.forwardJ  sP   < 2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B]3=0:d%![ 	F 	FM# C!/=2B!B)M %8"3	 
  M *!,M  F!/=3C2E!E 	?+}.>>N 	fee]NN$Seeeeee+>Vd
 
 
 	
r2   )NNNNN)rb   rc   rd   r<   r   r   rf   r   r   r   r	   r   r   r`   ri   rj   s   @r0   r   r   A  s        , , , , , 26KO,0/3&*?
 ?
 !.?
 &eEL%,,F&GH	?

 $D>?
 'tn?
 d^?
 -.?
 
uo%	&?
 ?
 ?
 ?
 ?
 ?
 ?
 ?
r2   r   c                   R    e Zd ZU eed<   dZdZdZdZdZ	dZ
dZdgZdZ	dZ
dZdZd ZdS )PixtralPreTrainedModelrH   modelpixel_valuesTr   c                 j   | j         j        }t          |t          j        t          j        f          rJ|j        j                            d|           |j	         |j	        j        
                                 d S d S t          |t                    r!|j        j                            d           d S d S )Nrw   )r   stdr8   )rH   initializer_rangerV   r   r   Conv2dr   datanormal_r   zero_r   fill_)rG   rx   r   s      r0   _init_weightsz$PixtralPreTrainedModel._init_weights  s    k+fry")455 	*M&&CS&999{& &&((((( '&// 	*M$$S)))))	* 	*r2   N)rb   rc   rd   r   rg   base_model_prefixmain_input_namesupports_gradient_checkpointing_supports_attention_backend_supports_flash_attn_supports_sdpa_supports_flex_attn_no_split_modulesr   r   r2   r0   r   r     sy         $O&*#"&N01N"&* * * * *r2   r   c                    |j         }|j        }|j        d         }t          j        |          j        }t          j        ||f|||          }t          j        |                               d          }t          j        dg| d d         z                                 d          }t          ||          D ]\  }	}
d||	|
|	|
f<   |d d d d d d f         
                    |j        d         ddd          }|S )Nr   )
fill_valuerU   r9   r   r   )rU   r9   r   r   finfominfulltensorcumsumzipexpand)r&   r  rU   r9   seq_lend_mincausal_maskblock_end_idxblock_start_idxstartends              r0   generate_block_attention_maskr    s	   LE]Fl1oGK"E*gw/EW]^^^KL!233::2>>MlA3):3B3)?#?@@GGKKO/=99 . .
s,-E#IuSy())dD!!!QQQ./66v|A2rRRKr2   c                        e Zd ZdZ fdZd Zee	 	 	 	 ddej	        de
ej	                 de
e         de
e         d	e
e         d
ee         deeef         fd                        Z xZS )PixtralVisionModelvision_encoderc                    t                                          |           || _        t          j        |j        |j        |j        |j        d          | _        |j        | _        t          |j        d          | _
        t          |          | _        t          |          | _        |                                  d S )NF)in_channelsout_channelskernel_sizestrider   r   r   )r;   r<   rH   r   r   num_channelsr   rB   
patch_convr   ln_prer   transformerr4   patch_positional_embedding	post_initr   s     r0   r<   zPixtralVisionModel.__init__  s       )++)$
 
 
 !+$V%7TBBB-f55*@*H*H'r2   c                     | j         S ra   )r"  r   s    r0   get_input_embeddingsz'PixtralVisionModel.get_input_embeddings  s
    r2   Nr   image_sizesr   r   r   r   r   c           	      *    ||j         \  }}	}
}|
|fg|z  }                     |          } fdt          ||          D             }t          j        d |D             d                              d          }                     |          }t          | j        j	         j        j
        z            }||d<                        ||          } j        j        dk    rd }nt          d |D             |          }  j        |f||||d	d
|S )Nc                 l    g | ]0\  }}|d d|d         j         z  d|d         j         z  f         1S ).Nr   r   )rB   )r   embedr   rG   s      r0   
<listcomp>z.PixtralVisionModel.forward.<locals>.<listcomp>  s[     
 
 
t #5$q'T_457U$q'T_:T7UUV
 
 
r2   c                 B    g | ]}|                     d           j        S )r   )flattenTr   r   s     r0   r-  z.PixtralVisionModel.forward.<locals>.<listcomp>  s$    !L!L!LQ!))A,,.!L!L!Lr2   r   r   )r'   r^   r   c                 D    g | ]}|j         d          |j         d         z  S )r   r   )r   r1  s     r0   r-  z.PixtralVisionModel.forward.<locals>.<listcomp>  s)    FFFqqwr{*FFFr2   T)r|   r   r   r   r   )r   r"  r  r   r%   rp   r#  r1   rH   rA   rB   r%  r   r  r$  )rG   r   r)  r   r   r   argsr   r   r   r*   r+   patch_embedsr&   r^   r   r|   s   `                r0   r`   zPixtralVisionModel.forward  s    +7+=(J65"E?+j8K |44
 
 
 
"<==
 
 
 y!L!L:K!L!L!LRSTTT^^_`aa{{<00 0)?4;CY)Y
 
 
 ".~"==lLYY;+/BBB!NN:FF4EFFF N  t
) 3!5/
 
 
 
 	
r2   )NNNN)rb   rc   rd   r   r<   r(  r   r   r   rf   r   r   r   r	   r   r   r   r`   ri   rj   s   @r0   r  r    s        (    "    /3/3,0&*1
 1
l1
 el+1
 'tn	1

 $D>1
 d^1
 -.1
 
uo%	&1
 1
 1
 ^ 1
 1
 1
 1
 1
r2   r  )Nr   )rw   )1re   collections.abcr   typingr   r   r   r   activationsr   modeling_flash_attention_utilsr	   modeling_layersr
   modeling_outputsr   modeling_rope_utilsr   modeling_utilsr   r   processing_utilsr   utilsr   r   r   configuration_pixtralr   
get_loggerrb   loggerr1   Moduler4   rn   rv   rf   rC   r   r   r   r   r   r   r   r  r  __all__r   r2   r0   <module>rD     su     $ $ $ $ $ $ " " " " " " " "        ! ! ! ! ! ! B B B B B B 9 9 9 9 9 9 / / / / / / 6 6 6 6 6 6 F F F F F F F F & & & & & & > > > > > > > > > > 6 6 6 6 6 6 
	H	%	%     2< 2< 2< 2< 2<RY 2< 2< 2<l( ( (   F % %I%<% 
% <	%
 U\*% % % % % %.F) F) F) F) F)ry F) F) F)T       "J J J J JRY J J J(/ / / / /6 / / /dH
 H
 H
 H
 H
 H
 H
 H
V * * * * *_ * * *2    J
 J
 J
 J
 J
/ J
 J
 J
Z  !9
:r2   