
     `i                        d dl Z d dlmZ d dlmZmZmZ d dlZd dlm	Z	 d dl
m	c mZ d dlmZ ddlmZ ddlmZmZ ddlmZ dd	lmZ dd
lmZmZ ddlmZ ddlmZ ddlm Z m!Z!m"Z"m#Z# ddl$m%Z%m&Z& ddl'm(Z(m)Z) ddl*m+Z+ ddl,m-Z-m.Z.m/Z/m0Z0 ddl1m2Z2 ddl3m4Z4 ddl5m6Z6m7Z7  e0j8        e9          Z: G d de	j;                  Z< G d de	j;                  Z= G d dej	        j;                  Z> G d de	j;                  Z? G d de	j@                  ZA ed            G d! d e	j;                              ZB G d" d#e	j;                  ZCd$ejD        d%ejD        d&ejD        d'eEejD        ejD        f         fd(ZFd)ejD        d*eGd'ejD        fd+ZH	 d_d-e	j;        d.ejD        d/ejD        d0ejD        d1eejD                 d2eId3eIfd4ZJ	 d_d-e	j;        d.ejD        d/ejD        d0ejD        d1eejD                 d2eId3eIfd5ZK G d6 d7e	j;                  ZL G d8 d9e          ZMe. G d: d;e)                      ZNe. G d< d=eN                      ZO G d> d?eNe          ZPe e.d@A           G dB dCe#                                  ZQ G dD dEej	        j;                  ZR G dF dGe	j;                  ZSdH ZT G dI dJe	j;                  ZUdKejD        d.ejD        fdLZVd.ejD        d/ejD        dKejD        d'eEejD        ejD        f         fdMZW G dN dOe	j;                  ZX G dP dQe	j;                  ZY G dR dSe          ZZ G dT dUe	j;                  Z[ G dV dWe	j;                  Z\ G dX dYe	j;                  Z] G dZ d[eN          Z^ G d\ d]eNe          Z_g d^Z`dS )`    N)	dataclass)CallableOptionalUnion)Llama4VisionConfig   )ACT2FN)CacheDynamicCache)GenerationMixin)use_kernel_forward_from_hub)create_causal_maskcreate_chunked_causal_mask)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPastCausalLMOutputWithPastModelOutput)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogging)deprecate_kwarg)check_model_inputs   )Llama4ConfigLlama4TextConfigc                   H     e Zd Zdef fdZdej        dej        fdZ xZS )Llama4TextExpertsconfigc                    t                                                       |j        | _        |j        | _        |j        | _        | j        | _        t          j        t          j
        | j        | j        d| j        z                      | _        t          j        t          j
        | j        | j        | j        f                    | _        t          |j                 | _        d S N   )super__init__num_local_expertsnum_expertsintermediate_sizehidden_size
expert_dimnn	Parametertorchemptygate_up_proj	down_projr	   
hidden_actact_fnselfr&   	__class__s     ~/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/llama4/modeling_llama4.pyr+   zLlama4TextExperts.__init__/   s    !3!'!9!-0LT5EtGWYZ]a]lYl)m)mnnek43CT_VZVf2g&h&hiiV./    hidden_statesreturnc                 \   |                     | j        j        d         d| j                  }t	          j        || j                  }|                    dd          \  }}t	          j        ||                     |          z  | j                  }|                     d| j                  }|S )a2  
        This should really not be run on a single machine, as we are reaching compute bound:
        - the inputs are expected to be "sorted" per expert already.
        - the weights are viewed with another dim, to match num_expert, 1, shape * num_tokens, shape

        Args:
            hidden_states (torch.Tensor): (batch_size * token_num, hidden_size)
            selected_experts (torch.Tensor): (batch_size * token_num, top_k)
            routing_weights (torch.Tensor): (batch_size * token_num, top_k)
        Returns:
            torch.Tensor
        r   r)   dim)	viewr5   shaper/   r3   bmmchunkr8   r6   )r:   r>   gate_upgateupnext_statess         r<   forwardzLlama4TextExperts.forward9   s     &**4+<+B1+Er4K[\\)M4+<=====++bidkk$&7&7!7$.II!&&r4+;<<r=   )	__name__
__module____qualname__r#   r+   r3   TensorrL   __classcell__r;   s   @r<   r%   r%   .   sk        0/ 0 0 0 0 0 0U\ el        r=   r%   c                   &     e Zd Zd fd	Zd Z xZS )Llama4TextMLPNc                 \   t                                                       ||j        }|| _        t	          j        |j        |d          | _        t	          j        |j        |d          | _        t	          j        ||j        d          | _	        t          |j                 | _        d S NFbias)r*   r+   r.   r&   r1   Linearr/   	gate_projup_projr6   r	   r7   activation_fn)r:   r&   r.   r;   s      r<   r+   zLlama4TextMLP.__init__P   s    $ & 86#57HuUUUy!35FUSSS#4f6HuUUU#F$56r=   c                     |                      |                     |                    |                     |          z  }|                     |          S N)r\   rZ   r[   r6   )r:   xr6   s      r<   rL   zLlama4TextMLP.forward\   sB    &&t~~a'8'899DLLOOK	~~i(((r=   r^   rM   rN   rO   r+   rL   rQ   rR   s   @r<   rT   rT   O   sL        
7 
7 
7 
7 
7 
7) ) ) ) ) ) )r=   rT   c                   8     e Zd Zddef fdZd Zd Zd Z xZS )Llama4TextL2Normư>epsc                 V    t                                                       || _        d S r^   )r*   r+   rd   )r:   rd   r;   s     r<   r+   zLlama4TextL2Norm.__init__b   s$    r=   c                     |t          j        |                    d                              dd          | j        z             z  S Nr)   rA   T)keepdimr3   rsqrtpowmeanrd   r:   r_   s     r<   _normzLlama4TextL2Norm._normf   8    5;quuQxx}}R}>>IJJJJr=   c                 v    |                      |                                                              |          S r^   )rn   floattype_asrm   s     r<   rL   zLlama4TextL2Norm.forwardi   s*    zz!''))$$,,Q///r=   c                     d| j          S )Nzeps=rd   r:   s    r<   
extra_reprzLlama4TextL2Norm.extra_reprl   s     dh   r=   )rc   )	rM   rN   rO   rq   r+   rn   rL   rv   rQ   rR   s   @r<   rb   rb   a   sy         E      K K K0 0 0! ! ! ! ! ! !r=   rb   c                   2     e Zd Zd fd	Zd Zd Zd Z xZS )Llama4TextRMSNormh㈵>c                     t                                                       || _        t          j        t          j        |                    | _        dS )z<
        Llama4RMSNorm is equivalent to T5LayerNorm
        N)r*   r+   rd   r1   r2   r3   onesweight)r:   r/   rd   r;   s      r<   r+   zLlama4TextRMSNorm.__init__q   sA     	l5:k#:#:;;r=   c                     |t          j        |                    d                              dd          | j        z             z  S rg   ri   rm   s     r<   rn   zLlama4TextRMSNorm._normy   ro   r=   c                     |                      |                                                              |          }|| j        z  S r^   )rn   rq   rr   r|   )r:   r_   outputs      r<   rL   zLlama4TextRMSNorm.forward|   s6    AGGII&&..q11##r=   c                 H    t          | j        j                   d| j         S )Nz, eps=)tupler|   rE   rd   ru   s    r<   rv   zLlama4TextRMSNorm.extra_repr   s%    )**<<$(<<<r=   )ry   )rM   rN   rO   r+   rn   rL   rv   rQ   rR   s   @r<   rx   rx   p   sm        < < < < < <K K K$ $ $= = = = = = =r=   rx   c                   (     e Zd Z fdZ fdZ xZS )Llama4Routerc                     t                                          |j        |j        d           |j        | _        |j        | _        d S rV   )r*   r+   r/   r,   r-   num_experts_per_toktop_kr9   s     r<   r+   zLlama4Router.__init__   sA    +V-EERRR!3/


r=   c                    t                                          |          }t          j        || j        d          \  }}t          j        |t          d                                        d||          }t          j        j	        
                    |                                                              |j                  }||fS )Nr!   rB   z-inf)r*   rL   r3   topkr   	full_likerq   scatter_r1   
functionalsigmoidtodtype)r:   r>   router_logitsrouter_top_valuerouter_indicesrouter_scoresr;   s         r<   rL   zLlama4Router.forward   s    66+0:mTZUV+W+W+W(.uV}}EENNqR`brss+33M4G4G4I4IJJMMmNabbm++r=   r`   rR   s   @r<   r   r      sQ        0 0 0 0 0
, , , , , , , , ,r=   r   Llama4TextMoec                   $     e Zd Z fdZd Z xZS )r   c                    t                                                       |j        | _        |j        | _        |j        | _        t          |          | _	        t          |          | _        t          |          | _        d S r^   )r*   r+   r   r   r/   
hidden_dimr,   r-   r%   expertsr   routerrT   shared_expertr9   s     r<   r+   zLlama4TextMoe.__init__   sl    /
 ,!3(00"6***622r=   c                    |                     d| j                  }|                     |          \  }}|                    |j        d         d          }||                    dd                               dd          z  }|                     |          }|                     |          }|                    |                     |j        d         d|j        d                   	                    d                     ||fS )NrA   r!   r   rB   )
reshaper   r   repeatrE   	transposer   r   add_sum)r:   r>   r   r   	routed_in
routed_outouts          r<   rL   zLlama4TextMoe.forward   s    %--b$/BB'+{{='A'A$}!(()<Q)?CC	 7 71 = = E Eb! L LL	\\),,
  //##M$7$:B
@PQS@TUUYY^_Y``aaaM!!r=   r`   rR   s   @r<   r   r      sG        3 3 3 3 3" " " " " " "r=   c                   |     e Zd ZU ej        ed<   ddef fdZ ej                    e	d                         Z
 xZS )Llama4TextRotaryEmbeddinginv_freqNr&   c                 b   t                                                       |j        dnd| _        |j        | _        |j        | _        || _        t          | j                 | _	        | 	                    | j        |          \  }| _
        |                     d|d           | j        | _        d S )Nllama3defaultr   F)
persistent)r*   r+   rope_scaling	rope_typemax_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenr&   r   rope_init_fnattention_scalingregister_bufferr   original_inv_freq)r:   r&   devicer   r;   s       r<   r+   z"Llama4TextRotaryEmbedding.__init__   s    %+%8%D)"("@$*$B!/?+/+<+<T[&+Q+Q($(ZeDDD!%r=   c                 j   | j         d d d d f                                                             |j        d         dd          }|d d d d d f                                         }t	          |j        j        t                    r|j        j        dk    r|j        j        nd}t          j	        |d          5  |
                    |j                  |z                      dd          }t          j        t          j        |          |          }|| j        z  }d d d            n# 1 swxY w Y   |S )	Nr   rA   r!   mpscpuF)device_typeenabledr)   )r   rq   expandrE   
isinstancer   typestrr3   autocastr   r   polar	ones_liker   )r:   r_   position_idsinv_freq_expandedposition_ids_expandedr   freqs	freqs_ciss           r<   rL   z!Llama4TextRotaryEmbedding.forward   s`    !M$4-8>>@@GGHZ[\H]_acdee ,QQQaaaZ 8 > > @ @'1!(-'E'Ek!(-[`J`J`ahmmfk^UCCC 	; 	;&))!(336KKVVWXZ[\\EEOE$:$:EBBI!D$::I	; 	; 	; 	; 	; 	; 	; 	; 	; 	; 	; 	; 	; 	; 	;
 s   9A#D((D,/D,r^   )rM   rN   rO   r3   rP   __annotations__r#   r+   no_gradr   rL   rQ   rR   s   @r<   r   r      s         l/ // / / / / / / U]__
 
  _
 
 
 
 
r=   r   xqxkr   r?   c           	      F   t          j         |                                 j        g | j        d d         ddR            }t          j         |                                j        g |j        d d         ddR            }t          j        ||d d d d d d d f         z                                d          }t          j        ||d d d d d d d f         z                                d          }|                    |           |                    |          fS )NrA   r)   r   )r3   view_as_complexrq   r   rE   view_as_realflattenrr   )r   r   r   xq_xk_xq_outxk_outs          r<   apply_rotary_embr      s   
 
 2

 2 IBHSbSM I2 Iq I I I
J
JC

 2

 2 IBHSbSM I2 Iq I I I
J
JCi111dAAA&> >??GGJJFi111dAAA&> >??GGJJF>>"v~~b1111r=   r>   n_repc                     | j         \  }}}}|dk    r| S | dddddddddf                             |||||          } |                     |||z  ||          S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r!   N)rE   r   r   )r>   r   batchnum_key_value_headsslenhead_dims         r<   	repeat_kvr      s    
 2?1D.Ehzz!!!!QQQaaa"23::5BUW\^bdlmmM  (;e(CT8TTTr=           modulequerykeyvalueattention_maskscalingdropoutc                    t          || j                  }t          || j                  }	t          j        ||                    dd                    |z  }
|$|d d d d d d d |j        d         f         }|
|z   }
t          j                            |
d          }
t          j        	                    |
|| j
                  }
t          j        |
|	          }|                    dd                                          }||
fS )Nr)   r   rA   rB   ptrainingr!   )r   num_key_value_groupsr3   matmulr   rE   r1   r   softmaxr   r   
contiguousr   r   r   r   r   r   r   kwargs
key_statesvalue_statesattn_weightscausal_maskattn_outputs                r<   eager_attention_forwardr      s     3 ;<<JUF$?@@L<z';';Aq'A'ABBWLL!$QQQ111.D
0@0D.D%DE#k1=((2(>>L=((6?([[L,|\::K''1--88::K$$r=   c                    t          || j                  }t          || j                  }	t          j        ||                    dd                    | j        dz  z  }
|$|d d d d d d d |j        d         f         }|
|z   }
t          j        	                    |
d          }
t          j        
                    |
|| j                  }
t          j        |
|	          }|                    dd                                          }||
fS )	Nr)   r         r   rA   rB   r   r!   )r   r   r3   r   r   r   rE   r1   r   r   r   r   r   r   s                r<   vision_eager_attention_forwardr      s    3 ;<<JUF$?@@L<z';';Aq'A'ABBV_VZEZZL!$QQQ111.D
0@0D.D%DE#k1=((2(>>L=((6?([[L,|\::K''1--88::K$$r=   c                   @    e Zd ZdZdef fdZ eddd          	 	 dd	ej        d
e	ej        ej        f         de
ej                 de
e         de
ej                 dee         de	ej        e
ej                 e
e	ej                          f         fd            Z xZS )Llama4TextAttentionz=Multi-headed attention from 'Attention Is All You Need' paperr&   c                    t                                                       || _        || _        t	          |d|j        |j        z            | _        |j        | _        |j        |j        z  | _	        |j        | _        | j        dz  | _
        |j        | _        |j        | _        |j        | _        |j        | _        d| _        |j        |         | _        t%          j        |j        |j        | j        z  |j                  | _        t%          j        |j        |j        | j        z  |j                  | _        t%          j        |j        |j        | j        z  |j                  | _        t%          j        |j        | j        z  |j        |j                  | _        | j        j        r"| j        rt5          |j                  | _        d S d S d S )Nr   r   TrW   )r*   r+   r&   	layer_idxgetattrr/   num_attention_headsr   r   r   r   
attn_scalefloor_scaleattn_temperature_tuningattention_dropout	is_causalno_rope_layersuse_roper1   rY   attention_biasq_projk_projv_projo_projuse_qk_normrb   rms_norm_epsqk_normr:   r&   r   r;   s      r<   r+   zLlama4TextAttention.__init__  s   "
F4F&Jd4dee#)#= $*$>&B\$\!#)#= }d* +!-'-'E$!'!9-i8i :T] JQWQf
 
 
 i :T] JQWQf
 
 
 i :T] JQWQf
 
 
 i&68JQWQf
 
 
 ;" 	At} 	A+F,?@@DLLL	A 	A 	A 	Ar=   past_key_valuepast_key_values4.58new_nameversionNr>   position_embeddingsr   cache_positionr   r?   c                 T   |j         d d         }g |d| j        R }|                     |                              |          }	 |                     |          j        g |d| j        R  }
|                     |                              |                              dd          }| j        r,t          |	|
|	                    |	j
                            \  }	}
t          | d          r*|                     |	          }	|                     |
          }
| j        r| j        st          j        t          j        |                                dz   | j        z                      | j        z  dz   }|                    d|d         ddf                              g |ddR           }|	|z  	                    |	j                  }	|	                    dd          }	|
                    dd          }
|$d|i}|                    |
|| j        |          \  }
}t0          }| j        j        dk    rt6          | j        j                 } || |	|
||f| j        sdn| j        | j        d	|\  }} |j        g |dR                                   }| !                    |          }||fS )
NrA   r!   r)   r	        ?r  eagerr   )r   r   )"rE   r   r  rD   r  r  r   r  r   r   r   hasattrr	  r   r3   log1pfloorrq   r   r   r   r   updater   r   r&   _attn_implementationr   r   r   r   r   r   r  )r:   r>   r  r   r  r  r   input_shapehidden_shapequery_statesr   r   attn_scalescache_kwargsattention_interfacer   r   s                    r<   rL   zLlama4TextAttention.forward8  s    $)#2#.88b8$-88{{=1166|DD4T[[//4UkU2Ut}UUU
{{=1166|DDNNqRSTT= 	'7j*=*@*@AT*U*U( ($L* 4## 	2<<55Lj11J ' 	O 	OEK)=)=)?)?#)EIY(YZZ[[^b^mmpss  &**A{21+EFFMMNbP[Nb]^Nb`aNbNbccK(;6::<;MNNL#--a33))!Q//
&,n=L'6'='=j,X\Xfht'u'u$J(?;+w66"9$+:Z"[$7$7	%
  $}HCC$2HL	%
 	%
 	%
 	%
!\ *k);;;;;;FFHHkk+..L((r=   NN)rM   rN   rO   __doc__r#   r+   r   r3   rP   r   r   r
   
LongTensorr   r   rL   rQ   rR   s   @r<   r   r     s"       GGA/ A A A A A A< _%0A6RRR ,0599) 9)|9) #5<#=>9) !.	9)
 "%9) !!129) -.9) 
u|Xel3XeEL>Q5RR	S9) 9) 9) SR9) 9) 9) 9) 9)r=   r   c                   j    e Zd Z fdZ eddd          	 	 	 	 	 	 ddej        d	eej                 d
eej                 dee	         dee
         deej                 deeej        ej        f                  dee         deej        eeej        ej        f                  f         fd            Z xZS )Llama4TextDecoderLayerc                    t                                                       |j        | _        || _        |j        |         | _        t          ||          | _        ||j        v | _	        | j	        rt          |          | _        nt          ||j                  | _        t          |j        |j                  | _        t          |j        |j                  | _        d S )N)r.   rt   )r*   r+   r/   r   layer_typesattention_typer   	self_attn
moe_layersis_moe_layerr   feed_forwardrT   intermediate_size_mlprx   r  input_layernormpost_attention_layernormr
  s      r<   r+   zLlama4TextDecoderLayer.__init__v  s    !-"$0;,VY??%):: 	f -f 5 5D -fHd e e eD01CI\]]](9&:LRXRe(f(f(f%%%r=   r  r  r  r  NFr>   r   r   	use_cacher  r  r   r?   c           
         |}	|                      |          } | j        d||||||d|\  }
}|	|
z   }|}	|                     |          }|                     |          }| j        r|\  }}|	|                    |	j                  z   }|S )N)r>   r  r   r  r0  r   )r.  r)  r/  r,  r+  rD   rE   )r:   r>   r   r   r  r0  r  r  r   residualattention_states_s               r<   rL   zLlama4TextDecoderLayer.forward  s     !,,];; -dn 
' 3)+)
 
 
 
! !#33 !55mDD))-88 	-,M1 =#5#5hn#E#EEr=   )NNNFNN)rM   rN   rO   r+   r   r3   rP   r   r#  r
   boolr   r   r   FloatTensorrL   rQ   rR   s   @r<   r%  r%  u  s?       g g g g g _%0A6RRR 2637+/$)59KO" "|" !." u/0	"
 "%" D>" !!12" &eEL%,,F&GH" -." 
u (51BEDU1U+V"WW	X" " " SR" " " " "r=   r%  c                   >    e Zd ZU eed<   dZdgZdZdZdZ	dZ
dZd ZdS )Llama4PreTrainedModelr&   Tr  Fc                    t          | j        d          r| j        j        n| j        j        j        }t	          |t
          j                  rJ|j        j        	                    d|           |j
         |j
        j                                         d S d S t	          |t
          j                  rU|j        j        	                    d|           |j        +|j        j        |j                                                  d S d S t	          |t
          j                  r?|j        j                            d           |j
        j                                         d S t	          |t                     r!|j        j                            d           d S t	          |t"                    rD|j        j        	                    d|           |j        j        	                    d|           d S t	          |t(                    rL|j        j        	                    |j                   |j        j        	                    |j                   d S d S )Ninitializer_ranger   )rl   stdr  )r<  )r  r&   r;  text_configr   r1   rY   r|   datanormal_rX   zero_	Embeddingpadding_idx	LayerNormfill_rx   r%   r5   r6   Llama4VisionModelclass_embeddingscalepositional_embedding_vlm)r:   r   r<  s      r<   _init_weightsz#Llama4PreTrainedModel._init_weights  s8    t{$788;DK))(: 	
 fbi(( 	KM&&CS&999{& &&((((( '&-- 	KM&&CS&999!-"6#56<<>>>>> .--- 
	KM$$S)))K""$$$$$ 122 	KM$$S))))) 122 	K$,,#3,???!))s)<<<<< 122 	K"'//FL/AAA+088V\8JJJJJ	K 	Kr=   N)rM   rN   rO   r"   r   supports_gradient_checkpointing_skip_keys_device_placement_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraph_supports_attention_backendrI  r2  r=   r<   r9  r9    sb         &*##4"5 N!"&K K K K Kr=   r9  c                   T    e Zd ZU dgZdZeed<   eee	dZ
def fdZeee	 	 	 	 	 	 	 ddeej                 deej                 d	eej                 d
ee         deej                 dee         deej                 dee         deeef         fd                                    Z xZS )Llama4TextModelr%  modelr&   )
attentionsr>   r   c                    t                                                     j        | _        j        | _        t          j        j        j        | j                  | _        t          j	        fdt          j                  D                       | _        t          j        j                  | _        t!                    | _        d| _        |                                  d S )Nc                 0    g | ]}t          |          S r2  )r%  ).0r   r&   s     r<   
<listcomp>z,Llama4TextModel.__init__.<locals>.<listcomp>  s$    hhh9#FI66hhhr=   rt   r&   F)r*   r+   pad_token_idrB  
vocab_sizer1   rA  r/   embed_tokens
ModuleListrangenum_hidden_layerslayersrx   r  normr   
rotary_embgradient_checkpointing	post_initr9   s    `r<   r+   zLlama4TextModel.__init__  s       !. +L):F<NPTP`aamhhhhfNfHgHghhh
 
 &f&8f>QRRR	36BBB&+# 	r=   N	input_idsr   r   r  inputs_embedsr0  r  r   r?   c                    |d u |d uz  rt          d          |7|                     |                    | j        j        j                            }|r|t          | j                  }|B||                                nd}	t          j	        |	|	|j
        d         z   |j                  }||                    d          }t          |x}
t                    s'| j        |||||d}t          d
i |t          d
i |d}
|}|                     ||          }| j        d | j        j                 D ]} ||f|
|j                 |||||d|}|                     |          }t+          ||r|nd 	          S )N:You must specify exactly one of input_ids or inputs_embedsrY  r   r!   )r   )r&   input_embedsr   r  r  r   )full_attentionchunked_attention)r   r   r  r0  r  r  )last_hidden_stater  r2  )
ValueErrorr\  r   r|   r   r   r&   get_seq_lengthr3   arangerE   	unsqueezer   dictr   r   rb  r`  r_  r(  ra  r   )r:   re  r   r   r  rf  r0  r  r   past_seen_tokenscausal_mask_mappingmask_kwargsr>   freq_cisdecoder_layers                  r<   rL   zLlama4TextModel.forward  s    -t";< 	[YZZZ  --ill4;L;S;Z.[.[\\M 	?0*$+>>>O!CRC^==???de"\ "2]5H5K"KTaTh  N )33A66L ?-FF 	 + -"0"0#2 , K #5"C"C{"C"C%?%N%N+%N%N# #
 & ??=,??![)H4;+H)HI 
	 
	M)M	2=3OP) /#-$,	 	 	 	MM 		-00&+/8BOOd
 
 
 	
r=   )NNNNNNN)rM   rN   rO   _no_split_modulesbase_model_prefixr#   r   r   r%  r   _can_record_outputsr+   r   r    r   r   r3   r#  rP   r
   r7  r6  r   r   r   r   r   rL   rQ   rR   s   @r<   rR  rR    st        12)/& /         151537+/59$(59C
 C
E,-C
 !.C
 u/0	C

 "%C
   12C
 D>C
 !!12C
 +,C
 
u--	.C
 C
 C
 ^  C
 C
 C
 C
 C
r=   rR  c                       e Zd ZU dgZdZdgZddiZeed<   def fdZ	e
e	 	 	 	 	 	 	 	 	 dd
eej                 deej                 deej                 deeeeej                 f                  deej                 deej                 dee         deej                 deeej        f         dee         deeef         fd                        Z xZS )Llama4ForCausalLMr%  language_modelzlm_head.weightlm_headcolwise_repr&   c                     t                                          |           t          |          | _        |j        | _        t          j        |j        |j        d          | _        | 	                                 d S rV   )
r*   r+   rR  rS  r[  r1   rY   r/   r}  rd  r9   s     r<   r+   zLlama4ForCausalLM.__init__=  sj       $V,,
 +y!3V5FUSSS 	r=   Nr   re  r   r   r  rf  labelsr0  r  logits_to_keepr   r?   c
                 T    | j         d|||||||d|
}|d         }t          |	t                    rt          |	 d          n|	}|                     |dd|ddf                   }d}| | j        d||| j        j        d|
}t          |||j	        |j
        |j                  S )az  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, Llama4ForCausalLM

        >>> model = Llama4ForCausalLM.from_pretrained("meta-llama4/Llama4-2-7b-hf")
        >>> tokenizer = AutoTokenizer.from_pretrained("meta-llama4/Llama4-2-7b-hf")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```)re  r   r   r  rf  r0  r  r   N)logitsr  r[  )lossr  r  r>   rT  r2  )rS  r   intslicer}  loss_functionr&   r[  r   r  r>   rT  )r:   re  r   r   r  rf  r  r0  r  r  r   outputsr>   slice_indicesr  r  s                   r<   rL   zLlama4ForCausalLM.forwardF  s    J $* 	
)%+')	
 	
 	
 	
  
8B>SV8W8Wk~ot444]kmAAA}aaa,?@AA%4%pVFt{OeppioppD%#3!/)
 
 
 	
r=   )	NNNNNNNNr   )rM   rN   rO   rw  rx  _tied_weights_keys_tp_planr#   r   r+   r   r   r   r3   r#  rP   r   r
   listr7  r6  r  r   r   r   r   rL   rQ   rR   s   @r<   r{  r{  6  s        12(*+=)H/        151537KO59-1$(5934<
 <
E,-<
 !.<
 u/0	<

 "%tE4E/F(F"GH<
   12<
 )*<
 D><
 !!12<
 c5</0<
 +,<
 
u,,	-<
 <
 <
 ^ <
 <
 <
 <
 <
r=   r{  zQ
    Base class for Llava causal language model (or autoregressive) outputs.
    )custom_introc                       e Zd ZU dZdZeej                 ed<   dZ	eej                 ed<   dZ
ee         ed<   dZeeej                          ed<   dZeeej                          ed<   dZeej                 ed<   dS )	Llama4CausalLMOutputWithPasta3  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size (batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    Nr  r  r  r>   rT  image_hidden_states)rM   rN   rO   r"  r  r   r3   r7  r   r  r  r
   r>   r   rT  r  r2  r=   r<   r  r    s           )-D(5$
%,,,*.FHU&'...'+OXe_+++8<M8E%"345<<<59Ju0129997;%"34;;;;;r=   r  c                   $     e Zd Z fdZd Z xZS )Llama4VisionMLP2c                 X   t                                                       |j        | _        |j        | _        t	          j        | j        |j        d          | _        t	          j        |j        |j        d          | _	        t	          j
                    | _        |j        | _        d S rV   )r*   r+   r/   r.   r1   rY   projector_input_dimfc1projector_output_dimfc2GELUr\   projector_dropoutr   r9   s     r<   r+   zLlama4VisionMLP2.__init__  s    !-!'!99T3V5OV[\\\9V8&:U\abbbWYY/r=   c                     |                      |          }|                     |          }t          j        || j        | j                  }|                     |                     |                    S )Nr   )r  r\   Fr   r   r  r:   r>   s     r<   rL   zLlama4VisionMLP2.forward  s`    //**=99	-4<$-XXX!!$((="9"9:::r=   r`   rR   s   @r<   r  r    sG        0 0 0 0 0; ; ; ; ; ; ;r=   r  c                   $     e Zd Z fdZd Z xZS )Llama4MultiModalProjectorc                     t                                                       t          j        |j        j        |j        j        d          | _        d S rV   )	r*   r+   r1   rY   vision_configvision_output_dimr=  r/   linear_1r9   s     r<   r+   z"Llama4MultiModalProjector.__init__  sJ    	 2*
 
 
r=   c                 0    |                      |          }|S r^   )r  )r:   image_featuresr>   s      r<   rL   z!Llama4MultiModalProjector.forward  s    n55r=   r`   rR   s   @r<   r  r    sG        
 
 
 
 
      r=   r  c           
         | j         \  }}}t          t          j        |                    }|                     |||d          } |                                 \  }}}}|                     ||t          ||z            t          ||z                      }|                    dddd                                          }|                    |t          ||z            t          ||z            t          ||dz  z                      }|                    dddd                                          }|                    |d|j         d                   }	|	S )NrA   r   r)   r!   r   )rE   r  mathsqrtrD   sizepermuter   )
input_tensorshuffle_ratio
batch_sizenum_patcheschannels
patch_sizeheightwidthreshaped_tensoroutput_tensors
             r<   pixel_shuffler    sO   (4(:%JXTY{++,,J$$ZZLLL*6*;*;*=*='Jx"''
FC@U<V<VX[\dgt\tXuXuvvO%--aAq99DDFFO%**C.//U]5J1K1KSQY]jlm]mQnMoMo O &--aAq99DDFFO#((R9Nr9RSSMr=   c                   B     e Zd Z fdZdej        dej        fdZ xZS )Llama4VisionPixelShuffleMLPc                     t                                                       |j        | _        t          |j        | j        dz  z            | _        |j        | _        t          |          | _	        d S r(   )
r*   r+   pixel_shuffle_ratior  r  	inner_dimr  
output_dimr  mlpr9   s     r<   r+   z$Llama4VisionPixelShuffleMLP.__init__  sa    #)#= V7D<TVW<WXYY 5#F++r=   encoded_patchesr?   c                 V    t          || j                  }|                     |          S r^   )r  r  r  )r:   r  s     r<   rL   z#Llama4VisionPixelShuffleMLP.forward  s&    '9QRRxx(((r=   rM   rN   rO   r+   r3   rP   rL   rQ   rR   s   @r<   r  r    s^        , , , , ,)u| ) ) ) ) ) ) ) ) )r=   r  freqs_cic                 f    |j         fdt          |j                  D             } | j        | S )Nc                 <    g | ]\  }}|d k    s	|d z
  k    r|nd S )r!   r2  )rW  idndims      r<   rX  z)reshape_for_broadcast.<locals>.<listcomp>  s5    TTTTQ!q&&AMMQQqTTTr=   )r  	enumeraterE   rD   )r  r   rE   r  s      @r<   reshape_for_broadcastr    s<    :DTTTTYu{=S=STTTE8=%  r=   c                 \   t          j         |                                 j        g | j        d d         ddR            }t          j         |                                j        g |j        d d         ddR            }t          ||          }|                    |j                  }t          j        ||z            	                    d          }t          j        ||z            	                    d          }|
                    |           |
                    |          fS )NrA   r)   )r  r   r   )r3   r   rq   r   rE   r  r   r   r   r   rr   )r   r   r  query_key_	query_outkey_outs          r<   vision_apply_rotary_embr    s   
 "#85;;==#8#R%+crc:J#RB#RPQ#R#R#RSSF !4!4!Lcin!Lb!L!!L!L!LMMD$hfEEEH{{6=))H"6H#455==a@@I 1199!<<GU##W__S%9%999r=   c                        e Zd Zdef fdZ	 	 ddej        dej        deej                 dee         de	e
         d	eej        eej                 eeej                          f         fd
Z xZS )Llama4VisionAttentionr&   c                 b   t                                                       || _        |j        | _        |j        | _        |j        |j        z  | _        d| _        |j	        | _	        | j        dz  | _
        t          j        | j        | j        | j        z  d          | _        t          j        | j        | j        | j        z  d          | _        t          j        | j        | j        | j        z  d          | _        t          j        | j        | j        z  | j        d          | _        d S )Nr!   r   TrW   )r*   r+   r&   r/   	embed_dimr   	num_headsr   r   r   r   r1   rY   r  r  r  r  r9   s     r<   r+   zLlama4VisionAttention.__init__  s    +3*f.HH$%!!'!9}d*i0NUYZZZi0NUYZZZi0NUYZZZi >UYZZZr=   Nr>   r  r   r  r   r?   c                    |j         d d         }g |d| j        R }|                     |                              |          }|                     |                              |          }	|                     |                              |          }
t          ||	|          \  }}	|                    dd          }|	                    dd          }	|
                    dd          }
t          }| j	        j
        dvrt          | j	        j
                 } || ||	|
d f| j        sdn| j        d dd|\  }} |j        g |dR                                  }|                     |          }||fS )	NrA   )r  r!   r)   )r  flex_attentionr   F)r   r   r   )rE   r   r  rD   r  r  r  r   r   r&   r  r   r   r   r   r   r  )r:   r>   r  r   r  r   r  r  r  r   r   r   r   r   s                 r<   rL   zLlama4VisionAttention.forward
  s    $)#2#.88b8$-88{{=1166|DD[[//44\BB
{{=1166|DD#:<^f#g#g#g j#--a33))!Q//
#--a33(F;+3NNN"9$+:Z"[$7$7
%
  $}HCC$2H
%
 
%
 
%
 
%
!\ *k);;;;;;FFHHkk+..L((r=   r!  )rM   rN   rO   r   r+   r3   rP   r   r
   r   r   r   rL   rQ   rR   s   @r<   r  r    s        [1 [ [ [ [ [ [& 26+/() ()|() ,() !.	()
 "%() -.() 
u|Xel3XeEL>Q5RR	S() () () () () () () ()r=   r  c                   B     e Zd Z fdZdej        dej        fdZ xZS )Llama4VisionMLPc                    t                                                       || _        t          j                    | _        t          j        |j        |j        d          | _	        t          j        |j        |j        d          | _
        d S )NTrW   )r*   r+   r&   r1   r  r\   rY   r/   r.   r  r  r9   s     r<   r+   zLlama4VisionMLP.__init__6  sp    WYY9V/1IPTUUU9V5v7IPTUUUr=   r>   r?   c                     |                      |          }|                     |          }|                     |          }|S r^   )r  r\   r  r  s     r<   rL   zLlama4VisionMLP.forward=  s=    //**=99//r=   r  rR   s   @r<   r  r  5  sc        V V V V VU\ el        r=   r  c            
       x     e Zd Zdef fdZ	 	 d	dej        dej        deej                 dee         fdZ	 xZ
S )
Llama4VisionEncoderLayerr&   c                 (   t                                                       |j        | _        t          |          | _        t          |          | _        t          j        |j                  | _	        t          j        |j                  | _
        d S r^   )r*   r+   r/   r  r)  r  r  r1   rC  r.  r/  r9   s     r<   r+   z!Llama4VisionEncoderLayer.__init__E  sr    !-.v66"6**!|F,>??(*V5G(H(H%%%r=   Nhidden_stater  r   output_attentionsc                     |}|                      |          }|                     |||          \  }}||z   }|}|                     |          }|                     |          }||z   }|f}|r||fz  }|S )N)r  r   )r.  r)  r/  r  )r:   r  r  r   r  r3  r   r  s           r<   rL   z Llama4VisionEncoderLayer.forwardO  s      ++L99%)^^) &4 &
 &
"l
  ,.  44\BBxx--,./ 	'&Gr=   r!  )rM   rN   rO   r   r+   r3   rP   r   r6  rL   rQ   rR   s   @r<   r  r  D  s        I1 I I I I I I 26,0 l , !.	
 $D>       r=   r  c                        e Zd ZdZdef fdZ	 	 	 	 ddej        dej        deej                 dee	         d	ee	         d
ee	         de
eef         fdZ xZS )Llama4VisionEncoderz
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`Llama4VisionEncoderLayer`].

    Args:
        config: Llama4VisionConfig
    r&   c                     t                                                       | _        t          j        fdt          j                  D                       | _        d| _        | _        d S )Nc                 .    g | ]}t                    S r2  )r  )rW  r5  r&   s     r<   rX  z0Llama4VisionEncoder.__init__.<locals>.<listcomp>|  s"    $o$o$o!%=f%E%E$o$o$or=   F)	r*   r+   r&   r1   r]  r^  r_  r`  rc  r9   s    `r<   r+   zLlama4VisionEncoder.__init__y  sf    m$o$o$o$ouU[UmOnOn$o$o$opp&+#r=   Nr>   r  r   r  output_hidden_statesreturn_dictr?   c                 X   ||n| j         j        }||n| j         j        }||n| j         j        }|rdnd}|rdnd}| j        D ]/}	|r||fz   } |	||||          }
|r||
d         fz   }|
d         }0|r||fz   }|st          d |||fD                       S t          |||          S )ad  
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        Nr2  )r  r   r  r  r!   r   c              3      K   | ]}||V  	d S r^   r2  rW  vs     r<   	<genexpr>z.Llama4VisionEncoder.forward.<locals>.<genexpr>  s(      eeqWXWdWdWdWdWdeer=   rl  r>   rT  )r&   r  r  use_return_dictr`  r   r   )r:   r>   r  r   r  r  r  encoder_statesall_attentionsencoder_layerlayer_outputss              r<   rL   zLlama4VisionEncoder.forward  s9   > 2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B]3=0:d![ 	- 	-M# C!/=2B!B)M*-"3!	  M ! F!/=3C2E!E)!,MM 	?+}.>>N 	fee]NN$Seeeeee+>Vd
 
 
 	
r=   NNNN)rM   rN   rO   r"  r   r+   r3   rP   r   r6  r   r   r   rL   rQ   rR   s   @r<   r  r  p  s         1       26,0/3&*?
 ?
|?
 ,?
 !.	?

 $D>?
 'tn?
 d^?
 
uo%	&?
 ?
 ?
 ?
 ?
 ?
 ?
 ?
r=   r  c                   B     e Zd Z fdZdej        dej        fdZ xZS )Llama4UnfoldConvolutionc                 N   t                                                       |j        }t          |t                    r||f}t
          j                            ||j                  | _        t          j	        |j
        |d         z  |d         z  |j        d          | _        d S )N)kernel_sizestrider   r!   FrW   )r*   r+   r  r   r  r3   r1   UnfoldunfoldrY   num_channelsr/   linear)r:   r&   r  r;   s      r<   r+   z Llama4UnfoldConvolution.__init__  s    'k3'' 	5&4Khoo+fFWoXXi+a.0;q>A
 
 
r=   r>   r?   c                     |                      |          }|                    ddd          }|                     |          }|S )Nr   r)   r!   )r  r  r  r  s     r<   rL   zLlama4UnfoldConvolution.forward  sA    M22%--aA66M22r=   r  rR   s   @r<   r  r    s^        

 

 

 

 

U\ el        r=   r  c                   $     e Zd Z fdZd Z xZS )Llama4VisionRotaryEmbeddingc                 ,   t                                                       |j        |j        z  }t	          j        |dz  t          j                                      |dz  d          }t	          j        ||d d         gd          }d|d<   ||z  }||z  }|j	        |j
        z  dz  }d|j        t	          j        d|d          d |dz                                           |z  z  z  }|dz   d	         |d d d d f         z                      dd
          }|dz   d	         |d d d d f         z                      dd
          }	t	          j        ||	gd
                                                                          dd d df         }
|
                    |                    d
dd          dk     d          }
t	          j        t	          j        t	          j        |
          t	          j        |
          gd
                    }|| _        d S )Nr)   )r   r!   r   rB   r   )rA   rA   r  ).NrA   .)r*   r+   
image_sizer  r3   ro  int32r   catr/   r   
rope_thetarq   repeat_interleaver   masked_fillr   stackcossinr  )r:   r&   idximg_idxfrequencies_xfrequencies_yfreq_dim	rope_freqfreqs_xfreqs_yr   ru  r;   s               r<   r+   z$Llama4VisionRotaryEmbedding.__init__  s   6#44,sAvU[999AA#q&!LL)Wgbqbk2:::#3%)CCqH6,a11M1MN_QY]^Q^N_1`1f1f1h1hks1stu	!A%y1IdD!!!m4LL__`agi_jj!A%y1IdD!!!m4LL__`agi_jj	7G,"555;;==HHJJ3PSPSRSPS8T!!'//"a";";a"?CC(ei6F6F	RWHXHX5Y_a)b)b)bcc r=   c                 @    | j                             |j                  S r^   )r  r   r   r  s     r<   rL   z#Llama4VisionRotaryEmbedding.forward  s    } 4555r=   r`   rR   s   @r<   r  r    sG        ! ! ! ! !"6 6 6 6 6 6 6r=   r  c                        e Zd ZU dZdgZeed<   def fdZd Z	 	 	 	 dde	j
        dee	j
                 d	ee         d
ee         dee         deeee	j
        df         f         fdZ xZS )rE  vision_modelr  r&   c                 (   t                                          |           |j        | _        |j        | _        |j        | _        |j        | _        | j        | j        z  dz  dz   | _        |j        dz  | _        t          |          | _	        t          j        | j        t          j        | j                  z            | _        t          j        | j        t          j        | j        | j                  z            | _        t!          |          | _        t          j        | j                  | _        t          j        | j                  | _        t+          |          | _        t/          |          | _        |                                  d S )Nr)   r!   r   )r*   r+   r  r  r/   r  r  rG  r  patch_embeddingr1   r2   r3   randnrF  rH  r  rotary_embeddingrC  layernorm_prelayernorm_postr  rS  r  vision_adapterrd  r9   s     r<   r+   zLlama4VisionModel.__init__  sD       + +!-"/ Ot>1DqH'-
6v>>!|DJTEU9V9V,VWW(*TZ%+dN^`d`pBqBq5q(r(r% ;F C C  \$*:;; l4+;<< )00
9&AAr=   c                     | j         S )zg
        This function is used to fetch the first embedding layer to activate grads on inputs.
        )r  ru   s    r<   get_input_embeddingsz&Llama4VisionModel.get_input_embeddings
  s     ##r=   Npixel_valuesr   r  r  r  r?   .c                    ||n| j         j        }||n| j         j        }||n| j         j        }|j        \  }}}}	d}
d}|                     |          }|j        \  }}}|                    ||
z  |z  ||          }| j                            |j        d         d|j        d                   }t          j
        ||gd          }|dz  }|                    ||
z  |||          }| j                            |j        |j                  }||z   }|                     |          }|                    |d|          }|                     |          }|                     |d|||          }|j        }|                     |          }|ddddddf         }|                     |          }|r|j        nd}|r	|d         }nd}|st/          d	 |||fD                       S t1          |||
          S )a  

        Example:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, MllamaVisionModel

        >>> checkpoint = "meta-llama/Llama-3.2-11B-Vision"
        >>> model = MllamaVisionModel.from_pretrained(checkpoint)
        >>> processor = AutoProcessor.from_pretrained(checkpoint)

        >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> inputs = processor(images=image, return_tensors="pt")

        >>> output = model(**inputs)

        >>> print(output.last_hidden_state.shape)
        torch.Size([1, 1, 4, 1025, 7680])
        ```
        Nr!   r   rA   rB   r   r   )r   r  r  r  r)   c              3      K   | ]}||V  	d S r^   r2  r  s     r<   r  z,Llama4VisionModel.forward.<locals>.<genexpr>i  s(      __qQRQ^Q^Q^Q^Q^__r=   r  )r&   r  r  r  rE   r  r   rF  r   r3   r  rH  r   r   r   r  rD   r  rS  rl  r  r  r>   r   r   )r:   r  r   r  r  r  batch_size_times_num_tilesr  r  r  num_concurrent_media
num_chunksr  r5  r  r   rF  positional_embeddingr  r   r>   rT  s                         r<   rL   zLlama4VisionModel.forward  s   > 2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B] COBT?"L&% 
++L99%1%7";
 $++&)==
JKYc
 
 .55l6H6KQP\PbcePfggy,!@aHHHq $++&)==z;Xb
 
  $<??lFXamat?uu#&::)),77#(()CRTT((66!5/  
 
 /**<88#AAAssAAAI. **<880DN,,$ 	JJJ 	`__\=*$M______*'!
 
 
 	
r=   r  )rM   rN   rO   rx  rw  r   r   r+   r  r3   rP   r   r6  r   r   r   rL   rQ   rR   s   @r<   rE  rE    s        &341      2$ $ $ 26,0/3&*_
 _
l_
 !._
 $D>	_

 'tn_
 d^_
 
elC&7 88	9_
 _
 _
 _
 _
 _
 _
 _
r=   rE  c            '           e Zd ZU ddgZi ZdZeed<   def fdZd Z	d Z
d Zd	 Zd
 Zd Zdej        defdZdej        dej        dej        fdZe edd          	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d&deej                 deej                 deej                 deej                 dee         deej                 deeeee         f                  dee         deej                 dee         dee         dee         dee         d eej                 d!eeej        f         d"ee         d#ee e!f         f"d$                        Z"	 	 	 	 	 	 d'd%Z# xZ$S )(Llama4ForConditionalGenerationr%  r   r&   c                 ^   t                                          |           t          |j                  | _        t          |          | _        t          |j                  | _	        |j        j
        | _
        | j        j        | j        j        nd| _        |                                  d S )NrA   )r*   r+   rE  r  r  r  multi_modal_projectorr{  r=  r|  r[  r&   rZ  rd  r9   s     r<   r+   z'Llama4ForConditionalGeneration.__init__x  s       -f.BCC%>v%F%F"/0BCC ,78<8P8\DK44bdr=   c                 4    | j                                         S r^   )r|  r  ru   s    r<   r  z3Llama4ForConditionalGeneration.get_input_embeddings  s    "77999r=   c                 :    | j                             |           d S r^   )r|  set_input_embeddings)r:   r   s     r<   r)  z3Llama4ForConditionalGeneration.set_input_embeddings  s    0077777r=   c                 4    | j                                         S r^   )r|  get_output_embeddingsru   s    r<   r+  z4Llama4ForConditionalGeneration.get_output_embeddings  s    "88:::r=   c                 :    | j                             |           d S r^   )r|  set_output_embeddings)r:   new_embeddingss     r<   r-  z4Llama4ForConditionalGeneration.set_output_embeddings  s    11.AAAAAr=   c                 :    | j                             |           d S r^   )r|  set_decoder)r:   decoders     r<   r0  z*Llama4ForConditionalGeneration.set_decoder  s    ''00000r=   c                 4    | j                                         S r^   )r|  get_decoderru   s    r<   r3  z*Llama4ForConditionalGeneration.get_decoder  s    "..000r=   r  vision_feature_select_strategyc                     |dvrt          d| j                   d |                                D             } | j        |fddi|}|j        }|S )aj  
        Obtains image last hidden states from the vision tower and apply al projection.

        Args:
            pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`)
               The tensors corresponding to the input images.
            vision_feature_select_strategy (`str`):
                The feature selection strategy used to select the vision feature from the vision backbone.
                Can be one of `"default"` or `"full"`
        Returns:
            image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`).
        )r   fullz$Unexpected select feature strategy: c                     i | ]
\  }}|||S r^   r2  )rW  kr  s      r<   
<dictcomp>zELlama4ForConditionalGeneration.get_image_features.<locals>.<dictcomp>  s    CCC41aQ]!Q]]]r=   r  F)rm  r4  itemsr  rl  )r:   r  r4  r   image_outputsr  s         r<   get_image_featuresz1Llama4ForConditionalGeneration.get_image_features  ss    $ *1DDDiDDgiijjjCC6<<>>CCC)),]]U]V\]]$6r=   re  rf  r  c                 <   |e| |                                  t          j        | j        j        t          j        |j                            k    }|                    d          }n|| j        j        k    }|                                }|	                    d          
                    |                              |j                  }||                                         |                                k    r t          d| d|j        d                    |S )z
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
        equal to the length of multimodal features. If the lengths are different, an error is raised.
        Nr  rA   z6Image features and image tokens do not match: tokens: z, features r   )r  r3   tensorr&   image_token_idlongr   allr   rp  	expand_asr   numelrm  rE   )r:   re  rf  r  special_image_maskn_image_tokenss         r<   get_placeholder_maskz3Llama4ForConditionalGeneration.get_placeholder_mask  s    !.2M$2K2K2M2MT[7uzR_Rfggg3 3 " "4!7!7!;!;!*dk.H!H+//11/99"==GGVVYYZgZnoo+,22448L8L8N8NNN}}}drdxyzd{}}   "!r=   vision_feature_layerr  )r  Nr   r   r   r  r  r0  r  r  r  r  r  r   r?   c                 
   ||n| j         j        }||n| j         j        }||n| j         j        }||n| j         j        j        }|du |duz  rt          d          ||t          d          | |                                 |          }||                     ||          }|	                    d|
                    d                    }|                     |                              |j        |j                  }|                     |||          }|                    ||          } | j        d|||||
|||||d
|}|d         }d}|	k||dd|j        d	         d	z
   df                             |j                  }|d
ddddf         |                    |j                  dk                                             }|	d
d	df         |                    |	j                  dk                                             }n?|d
ddddf                                         }|	d
d	df                                         }t)          j                    } ||	                    d|
                    d                    |	                    d                              |j                            }|s|f|d	d         z   }||f|z   n|S t-          |||j        |j        |j        ||nd          S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, LlavaForConditionalGeneration

        >>> model = LlavaForConditionalGeneration.from_pretrained("llava-hf/llava-1.5-7b-hf")
        >>> processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")

        >>> prompt = "USER: <image>\nWhat's the content of the image? ASSISTANT:"
        >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, text=prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(**inputs, max_new_tokens=15)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "USER:  \nWhat's the content of the image? ASSISTANT: The image features a busy city street with a stop sign prominently displayed"
        ```Nrh  zdYou cannot specify both pixel_values and inputs_embeds at the same time, and must specify either one)r  r4  rA   )rf  r  )
r   r   r  rf  r0  r  r  r  r  r  r   r!   .)r  r  r  r>   rT  r  r2  )r&   r  r  r  r  r4  rm  r  r<  rD   r  r&  r   r   r   rF  masked_scatterr|  rE   r   r1   CrossEntropyLossr  r  r>   rT  )r:   re  r  r   r   r  rf  rG  r4  r  r0  r  r  r  r  r  r   r  vision_flatprojected_vision_flatrD  r  r  r  shift_attention_maskshift_logitsshift_labelsloss_fctr   s                                r<   rL   z&Llama4ForConditionalGeneration.forward  s   b 2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B] .9 +**I 	' -t";< 	[YZZZ#(Av    7D5577	BBM#!44)/M 5  N
 )--b.2E2Eb2I2IJJK$($>$>{$K$K$N$N$m&9% %! "&!:!:G\ "; " " *889KMbccM%$% 
)%+'/!5#))
 
 
 
 ) (6aaa6<?Q;N9O9Q9Q6Q'R'U'UV\Vc'd'd$%c3B3k23G3J3J6=3Y3Y]^3^_jjll%c122g/C/F/Fv}/U/UYZ/Z[ffhh%c3B3k2==??%c122g99;;*,,H8!!"l&7&7&;&;<<l>O>OPR>S>S>V>VWcWj>k>k D  	DY,F'+'7D7V##VC+#3!/)2>2JPT
 
 
 	
r=   c           	      Z     | j         j        |f|||||d|}	|d         dk    r||	d<   |	S )N)r  rf  r   r  r  r   r  )r|  prepare_inputs_for_generation)
r:   re  r  rf  r  r   r  r  r   model_inputss
             r<   rR  z<Llama4ForConditionalGeneration.prepare_inputs_for_generationJ  sf     It*H
+')))
 
 
 
 !!! ,8L(r=   )NNNNNNNNNNNNNNr   )NNNNNN)%rM   rN   rO   rw  r  rx  r"   r   r+   r  r)  r+  r-  r0  r3  r3   r7  r   r<  r#  rF  r   r   r   rP   r
   r   r  r  r6  r   r   r   r  rL   rR  rQ   rR   s   @r<   r#  r#  r  s        13MNH	| 	 	 	 	 	 	: : :8 8 8; ; ;B B B1 1 11 1 1' ),   2")":?:K"]b]n" " " ". _+V<<< 15481537+/59@D8<-1$(,0/3&*5934!A
 A
E,-A
 u01A
 !.	A

 u/0A
 "%A
   12A
 'uS$s)^'<=A
 )1A
 )*A
 D>A
 $D>A
 'tnA
 d^A
 !!12A
  c5</0!A
" +,#A
$ 
u22	3%A
 A
 A
 =< ^A
L        r=   r#  )r9  rR  rE  r{  r#  )r   )ar  dataclassesr   typingr   r   r   r3   torch.nnr1   torch.nn.functionalr   r  /transformers.models.llama4.configuration_llama4r   activationsr	   cache_utilsr
   r   
generationr   integrationsr   masking_utilsr   r   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.deprecationr   utils.genericr    configuration_llama4r"   r#   
get_loggerrM   loggerModuler%   rT   rb   rx   rY   r   r   r   rP   r   r   r  r   rq   r   r   r   r%  r9  rR  r{  r  r  r  r  r  r  r  r  r  r  r  r  r  rE  r#  __all__r2  r=   r<   <module>rl     s     ! ! ! ! ! ! , , , , , , , , , ,                 N N N N N N ! ! ! ! ! ! . . . . . . . . ) ) ) ) ) ) 7 7 7 7 7 7 K K K K K K K K B B B B B B 9 9 9 9 9 9 m m m m m m m m m m m m K K K K K K K K F F F F F F F F & & & & & & R R R R R R R R R R R R 0 0 0 0 0 0 / / / / / / @ @ @ @ @ @ @ @ 
	H	%	%    	   B) ) ) ) )BI ) ) )$! ! ! ! !ux ! ! != = = = =	 = = =(, , , , ,29 , , , _--" " " " "BI " " .-"*    	   B	2	2	2 |	2 5<%&		2 	2 	2 	2	UU\ 	U# 	U%, 	U 	U 	U 	U( % %I%<% 
% <	%
 U\*% % % % % %D % %I%<% 
% <	%
 U\*% % % % % %4[) [) [) [) [)") [) [) [)|3 3 3 3 37 3 3 3l #K #K #K #K #KO #K #K #KL `
 `
 `
 `
 `
+ `
 `
 `
FN
 N
 N
 N
 N
- N
 N
 N
b   
< < < < <; < <  <0; ; ; ; ;ux ; ; ;"    	     (
) 
) 
) 
) 
)") 
) 
) 
)!EL ! ! ! ! !:<:	: l: 5<%&	: : : :8) 8) 8) 8) 8)BI 8) 8) 8)v    bi   ) ) ) ) )9 ) ) )XO
 O
 O
 O
 O
") O
 O
 O
d    bi   (6 6 6 6 6") 6 6 6,C
 C
 C
 C
 C
- C
 C
 C
Lt t t t t%:O t t tn  r=   