
     `ig                         d dl Z d dlmZ d dlmZmZmZ d dlZd dlm	Z	 d dl
m	c mZ ddlmZ ddlmZmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZmZ ddl m!Z!m"Z" ddl#m$Z$ ddl%m&Z&m'Z'm(Z( ddl)m*Z* ddl+m,Z, ddl-m.Z.m/Z/m0Z0 d Z1d`dZ2dej3        de4dej3        fdZ5	 dade	j6        dej3        dej3        dej3        deej3                 d e7d!e7d"e$e&         fd#Z8 G d$ d%e	j6                  Z9 ed&           G d' d(e	j6                              Z: G d) d*e	j6                  Z; G d+ d,e          Z< G d- d.e	j6                  Z= G d/ d0e	j6                  Z> G d1 d2e	j6                  Z? G d3 d4e	j6                  Z@ G d5 d6e	j6                  ZA G d7 d8e	j6                  ZB G d9 d:e	j6                  ZC G d; d<e	j6                  ZD G d= d>e	j6                  ZE G d? d@e	j6                  ZF G dA dBe	jG                  ZH G dC dDe	j6                  ZI G dE dFe	j6                  ZJ G dG dHe	j6                  ZK G dI dJe	j6                  ZL G dK dLe	j6                  ZM e'dMN           G dO dPe"                      ZN G dQ dR          ZOe' G dS dTe"                      ZP G dU dVe	j6                  ZQe' G dW dXeP                      ZRe' G dY dZePe                      ZS G d[ d\eP          ZT G d] d^ePe          ZUg d_ZVdS )b    N)cached_property)CallableOptionalUnion   )ACT2FN)CacheDynamicCache)GenerationMixin)use_kernel_forward_from_hub)create_causal_mask)GradientCheckpointingLayer)BaseModelOutputWithPastCausalLMOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuple)deprecate_kwarg)check_model_inputs   )
Emu3ConfigEmu3TextConfigEmu3VQVAEConfigc                     | dd| j         d         dz  f         }| d| j         d         dz  df         }t          j        | |fd          S )z*Rotates half the hidden dims of the input..N   dim)shapetorchcat)xx1x2s      z/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/emu3/modeling_emu3.pyrotate_halfr+   /   s]    	
3"!'"+"""	#B	
3q """	#B9rc2YB''''    c                     |                     |          }|                     |          }| |z  t          |           |z  z   }||z  t          |          |z  z   }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )	unsqueezer+   )qkcossinposition_idsunsqueeze_dimq_embedk_embeds           r*   apply_rotary_pos_embr7   6   sc    ( --
&
&C
--
&
&C3w;q>>C/0G3w;q>>C/0GGr,   hidden_statesn_repreturnc                     | j         \  }}}}|dk    r| S | dddddddddf                             |||||          } |                     |||z  ||          S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r   N)r$   expandreshape)r8   r9   batchnum_key_value_headsslenhead_dims         r*   	repeat_kvrB   Q   s    
 2?1D.Ehzz!!!!QQQaaa"23::5BUW\^bdlmmM  (;e(CT8TTTr,           modulequerykeyvalueattention_maskscalingdropoutkwargsc                 R   t          || j                  }t          || j                  }	t          j        ||                    dd                    |z  }
|$|d d d d d d d |j        d         f         }|
|z   }
t          j                            |
dt          j	                  
                    |j                  }
t          j                            |
|| j                  }
t          j        |
|	          }|                    dd                                          }||
fS )Nr!   r   r    )r#   dtype)ptrainingr   )rB   num_key_value_groupsr%   matmul	transposer$   nn
functionalsoftmaxfloat32torN   rJ   rP   
contiguous)rD   rE   rF   rG   rH   rI   rJ   rK   
key_statesvalue_statesattn_weightscausal_maskattn_outputs                r*   eager_attention_forwardr_   ]   s    3 ;<<JUF$?@@L<z';';Aq'A'ABBWLL!$QQQ111.D
0@0D.D%DE#k1=((2U](SSVVW\WbccL=((6?([[L,|\::K''1--88::K$$r,   c                       e Zd ZdZdedef fdZ eddd          	 	 dd
ej	        de
ej	        ej	        f         deej	                 dee         deej                 dee         de
ej	        ej	        f         fd            Z xZS )Emu3Attention=Multi-headed attention from 'Attention Is All You Need' paperconfig	layer_idxc                    t                                                       || _        || _        t	          |d|j        |j        z            | _        |j        |j        z  | _	        | j        dz  | _
        |j        | _        d| _        t          j        |j        |j        | j        z  |j                  | _        t          j        |j        |j        | j        z  |j                  | _        t          j        |j        |j        | j        z  |j                  | _        t          j        |j        | j        z  |j        |j                  | _        d S )NrA         Tbias)super__init__rc   rd   getattrhidden_sizenum_attention_headsrA   r?   rQ   rI   attention_dropout	is_causalrT   Linearattention_biasq_projk_projv_projo_projselfrc   rd   	__class__s      r*   rj   zEmu3Attention.__init__z   sB   "
F4F&Jd4dee$*$>&B\$\!}d*!'!9i :T] JQWQf
 
 
 i :T] JQWQf
 
 
 i :T] JQWQf
 
 
 i&68JQWQf
 
 
r,   past_key_valuepast_key_values4.58new_nameversionNr8   position_embeddingsrH   cache_positionrK   r:   c                 D   |j         d d         }g |d| j        R }|                     |                              |                              dd          }	|                     |                              |                              dd          }
|                     |                              |                              dd          }|\  }}t          |	|
||          \  }	}
|&|||d}|                    |
|| j	        |          \  }
}t          }| j        j        dk    rt          | j        j                 } || |	|
||f| j        sdn| j        | j        d|\  }} |j        g |dR                                  }|                     |          }||fS )Nr    r   r!   )r2   r1   r   eagerrC   )rJ   rI   )r$   rA   rr   viewrS   rs   rt   r7   updaterd   r_   rc   _attn_implementationr   rP   rn   rI   r=   rY   ru   )rw   r8   r   rH   rz   r   rK   input_shapehidden_shapequery_statesrZ   r[   r1   r2   cache_kwargsattention_interfacer^   r\   s                     r*   forwardzEmu3Attention.forward   s    $)#2#.88b8$-88{{=1166|DDNNqRSTT[[//44\BBLLQPQRR
{{=1166|DDNNqRSTT&S#7jRUWZ#[#[ j&#&snUUL'6'='=j,X\Xfht'u'u$J(?;+w66"9$+:Z"[$7$7	%
  $}HCC$2HL	%
 	%
 	%
 	%
!\ *k);;;;;;FFHHkk+..L((r,   NN)__name__
__module____qualname____doc__r   intrj   r   r%   Tensortupler   r	   
LongTensorr   r   r   __classcell__rx   s   @r*   ra   ra   w   s       GG
z 
c 
 
 
 
 
 
. _%0A6RRR ,059)) ))|)) #5<#=>)) !.	))
 "%)) !!12)) +,)) 
u|U\)	*)) )) )) SR)) )) )) )) ))r,   ra   RMSNormc                   ,     e Zd Zd fd	Zd Zd Z xZS )Emu3RMSNormư>c                     t                                                       t          j        t	          j        |                    | _        || _        dS )z:
        Emu3RMSNorm is equivalent to T5LayerNorm
        N)ri   rj   rT   	Parameterr%   onesweightvariance_epsilon)rw   rl   epsrx   s      r*   rj   zEmu3RMSNorm.__init__   sD     	l5:k#:#:;; #r,   c                    |j         }|                    t          j                  }|                    d                              dd          }|t          j        || j        z             z  }| j        |                    |          z  S )Nr!   r    T)keepdim)	rN   rX   r%   rW   powmeanrsqrtr   r   )rw   r8   input_dtypevariances       r*   r   zEmu3RMSNorm.forward   s|    #)%((77 $$Q'',,R,>>%Ht?T4T(U(UU{]--k::::r,   c                 H    t          | j        j                   d| j         S )Nz, eps=)r   r   r$   r   rw   s    r*   
extra_reprzEmu3RMSNorm.extra_repr   s&    )**II$2GIIIr,   )r   )r   r   r   rj   r   r   r   r   s   @r*   r   r      sb        $ $ $ $ $ $; ; ;J J J J J J Jr,   r   c                   $     e Zd Z fdZd Z xZS )Emu3MLPc                    t                                                       || _        |j        | _        |j        | _        t          j        | j        | j        |j                  | _        t          j        | j        | j        |j                  | _	        t          j        | j        | j        |j                  | _
        t          |j                 | _        d S )Nrg   )ri   rj   rc   rl   intermediate_sizerT   rp   mlp_bias	gate_projup_proj	down_projr   
hidden_actact_fnrw   rc   rx   s     r*   rj   zEmu3MLP.__init__   s    !-!'!94#3T5KRXRabbby!143IPVP_```4#94;KRXRabbbV./r,   c                     |                      |                     |                     |                    |                     |          z            }|S N)r   r   r   r   )rw   r'   r   s      r*   r   zEmu3MLP.forward   sA    NN4;;t~~a/@/@#A#ADLLQROO#STT	r,   r   r   r   rj   r   r   r   s   @r*   r   r      sG        0 0 0 0 0      r,   r   c                   4    e Zd Zdedef fdZ eddd          	 	 	 	 	 	 dd
ej        de	ej                 de	ej
                 de	e         de	e         de	ej
                 de	eej        ej        f                  dee         dej        fd            Z xZS )Emu3DecoderLayerrc   rd   c                 p   t                                                       |j        | _        t          ||          | _        t          |          | _        t          |j        |j                  | _	        t          |j        |j                  | _
        t          j        |j                  | _        d S )N)rc   rd   r   )ri   rj   rl   ra   	self_attnr   mlpr   rms_norm_epsinput_layernormpost_attention_layernormrT   Dropoutrn   rJ   rv   s      r*   rj   zEmu3DecoderLayer.__init__   s    !-&f	JJJ6??*6+=6CVWWW(3F4FFL_(`(`(`%z&":;;r,   ry   rz   r{   r|   NFr8   rH   r3   	use_cacher   r   rK   r:   c                    |}	|                      |          } | j        d|||||||d|\  }}
|	|                     |          z   }|}	|                     |          }|                     |          }|	|                     |          z   }|S )N)r8   rH   r3   rz   r   r   r    )r   r   rJ   r   r   )rw   r8   rH   r3   rz   r   r   r   rK   residual_s              r*   r   zEmu3DecoderLayer.forward   s     !,,];;)4> 	
')%+) 3	
 	
 	
 	
q !4<<#>#>> 55mDD// 4<<#>#>>r,   )NNNFNN)r   r   r   r   r   rj   r   r%   r   r   r   r	   boolr   r   r   r   r   r   s   @r*   r   r      s-       	<z 	<c 	< 	< 	< 	< 	< 	< _%0A6RRR 2637+/$)59KO | !. u/0	
 "% D> !!12 &eEL%,,F&GH +, 
   SR    r,   r   c                   >     e Zd ZdZdef fdZdej        fdZ xZ	S )Emu3VQVAEVectorQuantizera  
    A module for vector quantization using learned embedding vectors.

    This module implements the quantization process similar to te one described in
    the VQ-VAE (Vector Quantized Variational AutoEncoder) paper. It quantizes continuous
    input vectors into discrete codebook vectors, which are learned during training.
    Current implementation improves over previous ones by avoiding costly matrix multiplications
    and allowing for post-hoc remapping of indices.
    rc   c                     t                                                       t          j        |j        |j                  | _        | j        j        j        	                    d|j        z  d|j        z             d S )Ng            ?)
ri   rj   rT   	Embeddingcodebook_size	embed_dim	embeddingr   datauniform_r   s     r*   rj   z!Emu3VQVAEVectorQuantizer.__init__  sf    f&:F<LMM"++D63G,GvOcIcdddddr,   hidden_statec                    |j         \  }}}}}|                    ddddd                                          }|                    d|          }t	          j        |dz  dd          }t	          j        | j        j        dz  d	          }	dt	          j        || j        j        	                    dd                    z  }
||	z   |
z
  }
t	          j
        |
d	          }|                    ||||          }|S )
Nr   r   r      r!   r    T)r#   r   r"   )r$   permuterY   r   r%   sumr   r   rR   rS   argmin)rw   r   
batch_sizetemporalchannelsheightwidthhidden_state_flattenedhidden_state_sumembedding_sum	distancesmin_encoding_indicess               r*   r   z Emu3VQVAEVectorQuantizer.forward!  s   8D8J5
Hh#++Aq!Q::EEGG!-!2!22x!@!@ !9%;Q%>AtTTT	$."7":BBB %;T^=R=\=\]^`a=b=bccc	$}4y@	$|I1===388XvW\]]##r,   )
r   r   r   r   r   rj   r%   r   r   r   r   s   @r*   r   r     sr         e e e e e e e
$EL $ $ $ $ $ $ $ $r,   r   c                   $     e Zd Z fdZd Z xZS )Emu3VQVAEEncoderConvDownsamplec                     t                                                       t          j        ||ddd          | _        d S )Nr   r!   r   kernel_sizestridepaddingri   rj   rT   Conv2dconvrw   in_channelsrx   s     r*   rj   z'Emu3VQVAEEncoderConvDownsample.__init__4  :    Ik;AaYZ[[[			r,   c                 `    t          j        |ddd          }|                     |          }|S )N)r   r   r   r   constantr   )padmoderG   )Fr   r   rw   r8   s     r*   r   z&Emu3VQVAEEncoderConvDownsample.forward8  s2    mJVWXXX		-00r,   r   r   s   @r*   r   r   3  sL        \ \ \ \ \      r,   r   c                   $     e Zd Z fdZd Z xZS )Emu3VQVAEEncoderConvUpsamplec                     t                                                       t          j        ||ddd          | _        d S )Nr   r   r   r   r   s     r*   rj   z%Emu3VQVAEEncoderConvUpsample.__init__@  r   r,   c                 ^    t          j        |dd          }|                     |          }|S )N       @nearestscale_factorr   )r   interpolater   r   s     r*   r   z$Emu3VQVAEEncoderConvUpsample.forwardD  s/    m#IVVV		-00r,   r   r   s   @r*   r   r   ?  sL        \ \ \ \ \      r,   r   c            	       ^     e Zd Zdededee         dee         f fdZdej        fdZ xZ	S )Emu3VQVAEConv3d
in_channelout_channelr   r   c                 \   t                                                       d t          |dd          |dd                    D             }d| _        |d d d         D ] }| xj        |dz  |dz  z   |dz  fz  c_        !| xj        dz  c_        t	          j        ||||          | _        d S )Nc                     g | ]
\  }}||z
  S r   r   ).0
one_kernel
one_strides      r*   
<listcomp>z,Emu3VQVAEConv3d.__init__.<locals>.<listcomp>T  s"    ppp5KZj0pppr,   r   r   r    r!   )r!   r   )r   )ri   rj   zipr   rT   Conv3dr   )rw   r  r  r   r   padding_sizespad_sizerx   s          r*   rj   zEmu3VQVAEConv3d.__init__K  s     	ppsS^_`_a_aSbdjklkmkmdnOoOoppp%ddd+ 	J 	JHLLX]X\98q=IILLLI	
 
 
			r,   r8   c                 d    t          j        || j                  }|                     |          }|S r   )r   r   r   r   r   s     r*   r   zEmu3VQVAEConv3d.forwarda  s,    mT\::		-00r,   )
r   r   r   r   r   rj   r%   r   r   r   r   s   @r*   r   r   J  s        

 
 3Z	

 c

 
 
 
 
 
,U\        r,   r   c                   L     e Zd Zdedef fdZdej        dej        fdZ xZS )Emu3VQVAESpatialNormr   out_channelsc                     t                                                       t          j        |ddd          | _        t          j        ||ddd          | _        t          j        ||ddd          | _        d S )N    r   Tnum_channels
num_groupsr   affiner   r   r   )ri   rj   rT   	GroupNorm
norm_layerr   conv_yconv_brw   r   r  rx   s      r*   rj   zEmu3VQVAESpatialNorm.__init__h  s    
 	,%	
 
 
 i
 
 
 i
 
 
r,   r8   quant_statesc                     t          j        ||j        dd          d          }|                     |          }||                     |          z  |                     |          z   }|S )NrM   r   )sizer   )r   r   r$   r  r  r  )rw   r8   r  s      r*   r   zEmu3VQVAESpatialNorm.forward  sd    }\8KBCC8PW`aaa66%L(A(AADKKP\D]D]]r,   	r   r   r   r   rj   r%   r   r   r   r   s   @r*   r  r  g  su        

 
 
 
 
 
 
8U\         r,   r  c                   >     e Zd Zdedef fdZdej        fdZ xZS )Emu3VQVAETemporalUpsampler  r  c                 x    t                                                       t          ||dd          | _        d S )Nr   r   r   r   r   r   r   r   ri   rj   r   r   rw   r  r  rx   s      r*   rj   z"Emu3VQVAETemporalUpsample.__init__  A    
 	#!	
 
 
			r,   r8   c                 |   |j         \  }}}}}|                    ddddd                                                              |d|          }t	          j        |dd	          }|                    ||||d                              ddddd                                          }|                     |          }|S )
Nr   r   r   r   r!   r    r   r   r   )r$   r   rY   r   r   r   r   )rw   r8   r   r   r   r   r   s          r*   r   z!Emu3VQVAETemporalUpsample.forward  s    8E8K5
Hh%--aAq!<<GGIINNz[]_ghhm#IVVV%**:xPRSS[[\]_`bcefhijjuuww		-00r,   r  r   s   @r*   r!  r!    sl        

 
 
 
 
 
 
U\        r,   r!  c                   >     e Zd Zdedef fdZdej        fdZ xZS )Emu3VQVAETemporalDownsampler  r  c                 x    t                                                       t          ||dd          | _        d S )N)r   r   r   )r!   r   r   r%  r&  r'  s      r*   rj   z$Emu3VQVAETemporalDownsample.__init__  r(  r,   r8   c                 0    |                      |          }|S r   )r   r   s     r*   r   z#Emu3VQVAETemporalDownsample.forward  s    		-00r,   r  r   s   @r*   r+  r+    sl        

 
 
 
 
 
 
U\        r,   r+  c                   (     e Zd Z	 d fd	Zd Z xZS )Emu3VQVAETemporalResnetBlockNc                    t                                                       || _        ||n|| _        t	          j        |          | _        t          ||dd          | _        t	          j        |          | _	        t          ||dd          | _
        | j        | j        k    r t	          j        ||ddd          | _        d S d S )Nr#  r$  r%  r   r   r   )ri   rj   r   r  rT   BatchNorm3dnorm1r   conv1norm2conv2r
  nin_shortcutr  s      r*   rj   z%Emu3VQVAETemporalResnetBlock.__init__  s    
 	&+7+?KK\^K00
$!	
 
 

 ^L11
$!	
 
 

 t000 "	! ! !D 10r,   c                 ^   |}|                      |          }|t          j        |          z  }|                     |          }|                     |          }|t          j        |          z  }|                     |          }| j        | j        k    r|                     |          }||z   S r   )	r2  r%   sigmoidr3  r4  r5  r   r  r6  )rw   r8   r   s      r*   r   z$Emu3VQVAETemporalResnetBlock.forward  s     

=11}555

=11

=11}555

=11t000((22H-''r,   r   r   r   s   @r*   r/  r/    sR              @( ( ( ( ( ( (r,   r/  c                   |     e Zd Z	 	 ddedee         dee         f fdZd	dej        deej                 fdZ xZ	S )
Emu3VQVAEResnetBlockNr   r  quant_channelsc                 $   t                                                       || _        ||n|}|| _        || _        |;t          j        |ddd          | _        t          j        |ddd          | _        n*t          ||          | _        t          ||          | _        t          j
        ||ddd          | _        t          j
        ||ddd          | _        | j        | j        k    r t          j
        ||ddd          | _        d S d S )	Nr  r   Tr  r   r   r   r   )ri   rj   r   r  r;  rT   r  r2  r4  r  r   r3  r5  r6  )rw   r   r  r;  rx   s       r*   rj   zEmu3VQVAEResnetBlock.__init__  s<    	&&2&:{{(,!;2SW`deeeDJ<BTXaefffDJJ-nkJJDJ-nlKKDJY
 
 

 Y
 
 

 t000 "	! ! !D 10r,   r8   c                 Z   | j         dn|f}|} | j        |g|R  }|t          j        |          z  }|                     |          } | j        |g|R  }|t          j        |          z  }|                     |          }| j        | j        k    r| 	                    |          }||z   S Nr   )
r;  r2  r%   r8  r3  r4  r5  r   r  r6  )rw   r8   r;  	norm_argsr   s        r*   r   zEmu3VQVAEResnetBlock.forward  s    -5BBN;L	 "
==9===}555

=11"
==9===}555

=11t000((22H-''r,   r   r   )
r   r   r   r   r   rj   r%   r   r   r   r   s   @r*   r:  r:    s         '+(,	* ** sm* !	* * * * * *X( (U\ (8ELCY ( ( ( ( ( ( ( (r,   r:  c            
            e Zd ZdZdef fdZ	 d	dej        deej                 de	ej        eej                 f         fdZ
 xZS )
Emu3VQVAEAttentionBlockrb   rc   c                    t                                                       || _        |j        | _        |j        | _        | j        | j        z  | _        | j        | j        z  | j        k    r t          d| j         d| j         d          | j        dz  | _	        |j
        | _        d| _        t          j        | j        | j                  | _        t          j        | j        | j                  | _        t          j        | j        | j                  | _        t          j        | j        | j                  | _        d| _        d S )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).rf   Fr   )ri   rj   rc   rl   r   rm   	num_headsrA   
ValueErrorscalern   rJ   ro   rT   rp   rs   rt   rr   out_projrQ   r   s     r*   rj   z Emu3VQVAEAttentionBlock.__init__(  s    +3$.8=4>)T^;;'dn ' 'N' ' '   ]D(
/i??i??i??	$.$.AA %&!!!r,   Nr8   rH   r:   c           
         |j         \  }}}|                     |          }|                     |          }|                     |          }	|                    ||| j        | j                                      dd          }|                    ||| j        | j                                      dd          }|	                    ||| j        | j                                      dd          }	t          }
| j	        j
        dk    rt          | j	        j
                 }
 |
| |||	|| j        | j        | j        sdn| j                  \  }}|                    |||                                          }|                     |          }||fS )z#Input shape: Batch x Time x Channelr   r!   r   rC   )ro   rI   rJ   )r$   rr   rs   rt   r   rC  rA   rS   r_   rc   r   r   ro   rE  rP   rJ   r=   rY   rF  )rw   r8   rH   rK   r   
seq_lengthr   querieskeysvaluesr   r^   r\   s                r*   r   zEmu3VQVAEAttentionBlock.forward?  sy    -:,?)
J	++m,,{{=))]++,,z:t~t}UU__`acdeeyyZOOYYZ[]^__ZT^T]SS]]^_abcc(?;+w66"9$+:Z"[$7$7nJ#}>CC$,	%
 	%
 	%
!\ "))*j)LLWWYYmmK00L((r,   r   )r   r   r   r   r   rj   r%   r   r   r   r   r   r   s   @r*   rA  rA  %  s        GG& & & & & & &4 26$) $)|$) !.$)
 
u|Xel33	4$) $) $) $) $) $) $) $)r,   rA  c                   *     e Zd ZdZ fdZddZ xZS )Emu3VQVAEGroupNormz
    Same as the torch GroupNorm with the only difference that this ones accepts
    an optional kwarg `quant_states` which is not used. This class makes it easier to
    use SpatialNorm or GroupNorm without conditionals
    c                 :     t                      j        di | d S r>  )ri   rj   )rw   rK   rx   s     r*   rj   zEmu3VQVAEGroupNorm.__init__m  s&    ""6"""""r,   Nc                 Z    t          j        || j        | j        | j        | j                  S r   )r   
group_normr  r   rh   r   )rw   inputr  s      r*   r   zEmu3VQVAEGroupNorm.forwardp  s"    |E4?DKDHUUUr,   r   )r   r   r   r   rj   r   r   r   s   @r*   rM  rM  f  s^         # # # # #V V V V V V V Vr,   rM  c                   R     e Zd Zd fd	Zddej        deej                 fdZ xZS )Emu3VQVAEMiddleBlockNc                 ,   t                                                       t          |||          | _        t	          |          | _        |t          |ddd          | _        nt          ||          | _        t          |||          | _	        d S )Nr   r  r;  r  r   Tr  )
ri   rj   r:  block_1rA  attn_1rM  	attn_normr  block_2)rw   rc   r   r;  rx   s       r*   rj   zEmu3VQVAEMiddleBlock.__init__u  s    +#$)
 
 

 .f55!/[UW]ajnoooDNN1.+NNDN+#$)
 
 
r,   r8   r  c                    |                      ||          }|}|                     ||          }|j        \  }}}}|                    ||||z                                dd          }|                     |          d         }|                    ||||                              dddd          }||z   }|                     ||          }|S )Nr   r!   r   r   )	rV  rX  r$   r   rS   rW  r=   r   rY  )rw   r8   r  r   r   r   r   r   s           r*   r   zEmu3VQVAEMiddleBlock.forward  s    ]LAA }lCC.;.A+
Hfe%**:x%PPZZ[\^_``M2215%--j&%RRZZ[\^_abdeff =0]LAAr,   r   )	r   r   r   rj   r%   FloatTensorr   r   r   r   s   @r*   rS  rS  t  so        
 
 
 
 
 
(
 
U%6 
huO`Fa 
 
 
 
 
 
 
 
r,   rS  c                   4     e Zd Z fdZdej        fdZ xZS )Emu3VQVAEDownBlockc           
         t                                                       t          |j                  | _        |j        | _        |j        }|j        }dt          |          z   }|| _        t          j
                    | _        t          | j                  D ]P}t          j
                    }t          j
                    }t          j
                    }|||         z  }	|||         z  }
t          | j                  D ]}|                    t          |	|
                     |
}	|j        V||j        v rM|                    t!          |                     |                    t          j        |	ddd                     t          j                    }||_        ||_        ||_        || j        dz
  k    rt-          |	          |_        | j                            |           Rd S )Nr   r   r  r  r   Tr  r   )ri   rj   lenchannel_multipliernum_resolutionsnum_res_blocksbase_channelsr   in_channel_multiplierrT   
ModuleListdownrangeappendr:  attn_resolutionsrA  r  Moduleblockattn
attn_normsr   
downsample)rw   rc   re  rb  rf  i_levelrm  rn  ro  block_in	block_outi_blockrh  rx   s                r*   rj   zEmu3VQVAEDownBlock.__init__  s   "6#<==$3,#6 $u-?'@'@ @%:"MOO	T122 	# 	#GMOOE=??DJ$'<W'EEH%(:7(CCI !455 
q 
q($,%.     %*67fF];];]KK 7 ? ?@@@%%blUW]ajn&o&o&oppp9;;DDJDI(DO$.222"@"J"JIT""""1	# 	#r,   r8   c                 P   t          | j                  D ]\  }}t          | j                  D ]} |j        |         |          }t          |j                  dk    r|} |j        |         |          }|j        \  }}}}	|	                    ||||	z            
                    dd          } |j        |         |          d         }|                    |||	|                              dddd          }||z   }|| j        dz
  k    r|                    |          }|S )Nr   r   r!   r   )	enumeraterh  ri  rd  rm  ra  rn  ro  r$   r   rS   r=   r   rc  rp  )
rw   r8   rq  blocksrt  r   r   r   r   r   s
             r*   r   zEmu3VQVAEDownBlock.forward  sF   (33 	A 	AOGV !455 = = 5W 5m D Dv{##a'',H$>F$5g$>}$M$MM:G:M7J&%$1$6$6z8VV[^$\$\$f$fghjk$l$lM$8FK$8$G$G$JM$1$9$9*feU]$^$^$f$fghjkmnpq$r$rM$,}$<M$.222 & 1 1- @ @r,   r   r   r   rj   r%   r[  r   r   r   s   @r*   r]  r]    sW        ## ## ## ## ##JU%6        r,   r]  c                   B     e Zd Z fdZdej        dej        fdZ xZS )Emu3VQVAEUpBlockc           	         t                                                       t          |j                  | _        |j        | _        |j        }|j        |j        d         z  }t          j	                    | _
        t          t          | j                            D ]=}t          j	                    }t          j	                    }t          j	                    }|j        |j        |         z  }t          | j        dz             D ]w}	|                    t          |||                     |}||j        v rE|                    t!          |                     |                    t#          ||                     xt          j                    }
||
_        ||
_        ||
_        |dk    rt-          |          |
_        | j
                            d|
           ?d S )Nr    r   rU  r   )ri   rj   ra  rb  rc  rd  r   re  rT   rg  upreversedri  rj  r:  rk  rA  r  rl  rm  rn  ro  r   upsampleinsert)rw   rc   r;  rr  rq  rm  rn  ro  rs  rt  r|  rx   s              r*   rj   zEmu3VQVAEUpBlock.__init__  s   "6#<==$3)'&*CB*GG-//d&: ; ;<< 	" 	"GMOOE=??DJ,v/H/QQI !4q!899 V V($,%.'5     %f555KK 7 ? ?@@@%%&:>8&T&TUUUBBHBG&BM!||:8DDGNN1b!!!!3	" 	"r,   r8   r  c                    t          | j        d d d                   D ]!\  }}t          | j        dz             D ]} |j        |         ||          }t          |j                  dk    r|} |j        |         ||          }|j        \  }}}	}
|	                    |||	|
z            
                    dd          } |j        |         |          d         }|                    ||	|
|                              dddd          }||z   }|t          | j                  dz
  k    r|                    |          }#|S )Nr    r   r   r!   r   )rv  r|  ri  rd  rm  ra  rn  ro  r$   r   rS   r=   r   r~  )rw   r8   r  rq  rw  rt  r   r   r   r   r   s              r*   r   zEmu3VQVAEUpBlock.forward  sZ   (277 	? 	?OGV !4q!899 = = 5W 5m\ R Rv{##a'',H$>F$5g$>}l$[$[M:G:M7J&%$1$6$6z8VV[^$\$\$f$fghjk$l$lM$8FK$8$G$G$JM$1$9$9*feU]$^$^$f$fghjkmnpq$r$rM$,}$<M#dg,,*** & > >r,   rx  r   s   @r*   rz  rz    sa        #" #" #" #" #"JU%6 eFW        r,   rz  c                   4     e Zd Z fdZdej        fdZ xZS )Emu3VQVAEEncoderc                    t                                                       |j        }|j        }|j        }|j        }|j        }|rd|z  n|}||d         z  }t          j        	                    ||ddd          | _
        t          |          | _        t          ||          | _        t          j                            d|dd	          | _        t          j        	                    ||ddd          | _        t%          t'          j        |j                            }	t          j                    | _        t          j                    | _        t3          |	          D ],}
t5          ||          }| j                            |           -t3          |j                  D ]-}t;          ||
          }| j                            |           .d S )Nr!   r    r   r   r   r  r   T)r  r  r   r  r`  )ri   rj   re  r   double_latentlatent_channelsrb  r%   rT   r   conv_inr]  
down_blockrS  middle_blockr  norm_outconv_outr   mathlog2temporal_downsample_factorrg  	time_convtime_res_stackri  r+  rj  rd  r/  )rw   rc   re  r   r  r  rb  r  rr  temporal_down_blocksir   r   time_res_convrx   s                 r*   rj   zEmu3VQVAEEncoder.__init__  s   ,(, 0#6.;Pq?** #5b#99x{MqYZdeff,V440BB**bxUYbf*gg ( 
 
  #49V-N#O#OPP moo+,, 	( 	(A.|\JJDN!!$''''v,-- 	6 	6A8()  M &&}5555	6 	6r,   pixel_valuesc                 t   |j         d         } |j        dg|j         dd          R  }|                     |          }|                     |          }|                     |          }|                     |          }|t          j        |          z  }|                     |          } |j        d|g|j         dd          R  }|	                    ddddd          }| j
        D ]$} ||          }|t          j        |          z  }%| j        D ]} ||          }|	                    ddddd          }|S )Nr   r    r!   r   r   r   )r$   r=   r  r  r  r  r%   r8  r  r   r  r  )rw   r  temporal_dimr8   r   layers         r*   r   zEmu3VQVAEEncoder.forward3  sf   #)!,+|+BH1CABB1GHHH \2266))-88 m44}555m44--b,YATUVUWUWAXYYY%--aAq!<< N 	: 	:D D//MU]=999MM( 	1 	1E!E-00MM%--aAq!<<r,   )r   r   r   rj   r%   r   r   r   r   s   @r*   r  r    sW        %6 %6 %6 %6 %6NE$4        r,   r  c                   H     e Zd Zdef fdZdej        dej        fdZ xZS )Emu3VQVAEDecoderrc   c                    t                                                       |j        }|j        |j        d         z  }t          j                    | _        t          |j	                  D ]7}t          |j        |j                  }| j                            |           8t          t          j        |j                            }t          j                    | _        t          |          D ]6}t%          |j        |j                  }| j                            |           7t          j        |j        |ddd          | _        t+          |||          | _        t/          |          | _        |j        |j        d         z  }t3          ||          | _        t          j        ||j        ddd          | _        d S )Nr    r`  r   r   r   )r;  r   )ri   rj   r   re  rb  rT   rg  r  ri  rd  r/  r  rj  r   r  r  r  r  r!  r   r  rS  r  rz  up_blockr  r  r  r  )
rw   rc   r;  rr  r   r  temp_upsample_block_numr  r   rx   s
            r*   rj   zEmu3VQVAEDecoder.__init__R  s   )'&*CB*GG moov,-- 	6 	6A8"2AW  M &&}5555"%di0Q&R&R"S"S.// 	( 	(A,V-CVE[\\DN!!$''''y"
 
 
 1R`aaa(00'&*CA*FF,^XFF	
 
 
r,   r8   r  c                    t          j        ||fd          }|                    ddddd          }| j        D ]} ||          }| j        D ]$} ||          }|t          j        |          z  }%|                    ddddd          }t          j        |dd          \  }} |j        dg|j        dd          R  } |j        dg|j        dd          R  }| 	                    |          }| 
                    ||          }|                     ||          }|                     ||          }|t          j        |          z  }|                     |          }|S )Nr   r"   r!   r   r   r   r    )r%   r&   r   r  r  r8  chunkr=   r$   r  r  r  r  r  )rw   r8   r  hidden_quant_statesr  s        r*   r   zEmu3VQVAEDecoder.forwardy  s   #i(E1MMM199!Q1aHH ( 	= 	=E"'%(;"<"<^ 	F 	FE"'%(;"<"<5=1D#E#EE199!Q1aHH&+k2Eqa&P&P&P#|--bK=3Fqrr3JKKK+|+BH1CABB1GHHH]33 ))-FFm\BBm\BB}555m44r,   )	r   r   r   r   rj   r%   r   r   r   r   s   @r*   r  r  Q  sk        %
 %
 %
 %
 %
 %
 %
NU\         r,   r  aR  
    The VQ-VAE model used in Emu3 for encoding/decoding images into discrete tokens.
    This model follows the "Make-a-scene: Scene-based text-to-image generation with human priors" paper from
    [ Oran Gafni, Adam Polyak, Oron Ashual, Shelly Sheynin, Devi Parikh, and Yaniv
    Taigman](https://huggingface.co/papers/2203.13131).
    )custom_introc                        e Zd ZU eed<   dZdZdZdZdZ	dZ
g dZd Zdef fdZdej        dej        fd	Zd
ej        fdZ xZS )	Emu3VQVAErc   
emuvideovqr  T)r/  rA  r:  r   c                    t          |t          j        t          j        f          rt          j                            |j        dd           |j        gt          j                            |j                  \  }}dt          j
        |          z  }t          j                            |j        | |           d S d S t          |t          j                  rt          j                            |j        t          j
        d                     |j        ot          j                            |j                  \  }}|dk    rdt          j
        |          z  nd}t          j                            |j        | |           d S d S t          |t          j        t          j        t          j        f          rLt          j                            |j        d           t          j                            |j        d	           d S t          |t          j                  rP|j        j                                         |j        -|j        j        |j                                                  d S d S d S )
Nfan_outrelu)r   nonlinearityr      )ar   r   rC   )
isinstancerT   r   r
  initkaiming_normal_r   rh   _calculate_fan_in_and_fan_outr  sqrtr   rp   kaiming_uniform_BatchNorm2dr1  r  	constant_r   r   normal_padding_idxzero_)rw   rD   fan_inr   bounds        r*   _init_weightszEmu3VQVAE._init_weights  s   fry")455 	?G##FM	PV#WWW{&GAA&-PP	DIf---  ufe<<<<< '& 	** 	?G$$V]dill$CCC{&GAA&-PP	17!DIf----  ufe<<<<< '&  NOO 	?GfmS111Gfk3/////-- 	?M&&(((!-"6#56<<>>>>>	? 	?--r,   c                 $   t                                          |           || _        t          |          | _        t          |          | _        t          |          | _        dt          |j
                  dz
  z  | _        t          |j        |j        dd          | _        t          |j        |j        dd          | _        dt          |j
                  dz
  z  | _        |                                  |                                  d S )Nr!   r   )r   r   r   r$  r%  )ri   rj   rc   r  encoderr  decoderr   quantizera  rb  vision_spatial_factorr   r  r   
quant_convpost_quant_convspatial_scale_factoreval	post_initr   s     r*   rj   zEmu3VQVAE.__init__  s       '//'//088%&3v/H+I+IA+M%N")"F$4)T]
 
 
  /f4)T] 
  
  
 %&#f.G*H*H1*L$M!		r,   image_sizesc                     |j         dk    }|rE j        j        }|j        \  }}}}|                    d                              d|ddd          }n|j        \  }}}}}                     |          }	|	                    ddddd          }	                     |	          }	|	                    ddddd          }	 	                    |	          }
|r|

                    d          n|
} fdt          ||          D             }|S )Nr   r   r   r!   r   c           	          g | ]I\  }}|d t          |d         j        z            d t          |d         j        z            f         JS )Nr   r   )r   r  )r  single_imager  rw   s      r*   r  z$Emu3VQVAE.encode.<locals>.<listcomp>  sm     
 
 
"d D3tAw)CCDDDFqDQRGVZVpLpHqHqFqqr
 
 
r,   )ndimrc   r  r$   r.   repeatr  r   r  r  squeezer	  )rw   r  r  is_imager   r   r   r   r   r8   codesimage_tokenss   `           r*   encodezEmu3VQVAE.encode  s4   $) 	O{=H2>2D/J&%'11!44;;AxAqQQLL<H<N9J(FE\22 &--aAq!<<66 &--aAq!<<m,,+3>u}}Q'''
 
 
 
&),&D&D
 
 

 r,   r8   c                    |j         dk    }|r|                    d          }|j        \  }}}}| j                            |                                          }|j        d         }|                    |||||                              ddddd                                          }| 	                    |          }	|                    ddddd          }|	                    ddddd          }	| 
                    |	|          }
|
                    ||| j        j        z  | j        j        || j        z  || j        z            }
|r|
d d df         n|
S )Nr   r   r    r   r   r!   )r  r.   r$   r  r   flattenr   r   rY   r  r  r=   rc   r  r  r  )rw   r8   r  r   r   r   r   quantr   
post_quantvideos              r*   decodezEmu3VQVAE.decode  s_    %* 	7)33A66M.;.A+
Hfe''(=(=(?(?@@;r?

:xIIQQRSUVXY[\^_``kkmm))%00
aAq!,,''1aA66
Z//t{==K$T..D--
 
 '1uQQQT{{E1r,   )r   r   r   r   __annotations__base_model_prefixmain_input_name_supports_sdpa_supports_flash_attn_supports_flex_attn_supports_attention_backend_no_split_modulesr  rj   r%   r   r  r  r   r   s   @r*   r  r    s          $$ON"&  ? ? ?*      *5< el    82EL 2 2 2 2 2 2 2 2r,   r  c                       e Zd ZdZd Zed             Zed             Zed             Zed             Z	ed             Z
ed             Zd	eej                 d
ej        fdZd	ej        d
ej        fdZdS )Emu3ImageVocabularyMappingzM
    A class for mapping discrete image tokens from VQGAN to BPE tokens.
    c                 |    || _         |                    d          | _        |                    d          | _        d S )Nz<|extra_200|>z<image>)	vocab_mapgeteol_token_idimage_token_id)rw   r  s     r*   rj   z#Emu3ImageVocabularyMapping.__init__  s7    "%MM/::'mmI66r,   c                 b    t          d | j                                        D                       S )Nc                 B    g | ]\  }}|                     d           |S z<|visual token
startswithr  namevals      r*   r  z;Emu3ImageVocabularyMapping.image_tokens.<locals>.<listcomp>  s.    hhhytSdooVfFgFghshhhr,   sortedr  itemsr   s    r*   r  z'Emu3ImageVocabularyMapping.image_tokens  s-    hhDN,@,@,B,Bhhhiiir,   c                 b    t          d | j                                        D                       S )Nc                 B    g | ]\  }}|                     d           |S r  r  r  s      r*   r  z?Emu3ImageVocabularyMapping.image_tokens_str.<locals>.<listcomp>  s.    iii	ctWgGhGhitiiir,   r  r   s    r*   image_tokens_strz+Emu3ImageVocabularyMapping.image_tokens_str  s-    iiT^-A-A-C-Ciiijjjr,   c                 *      fd j         D             S )Nc                 V    i | ]%}t          |d d                   j        |         &S )irM   )r   r  )r  tokenrw   s     r*   
<dictcomp>z6Emu3ImageVocabularyMapping.img2bpe.<locals>.<dictcomp>"  s2    \\\UE"R%L!!4>%#8\\\r,   )r  r   s   `r*   img2bpez"Emu3ImageVocabularyMapping.img2bpe   s     \\\\dF[\\\\r,   c                 H    d | j                                         D             S )Nc                     i | ]\  }}||	S r   r   )r  r0   vs      r*   r  z6Emu3ImageVocabularyMapping.bpe2img.<locals>.<dictcomp>&  s    666A1666r,   )r  r  r   s    r*   bpe2imgz"Emu3ImageVocabularyMapping.bpe2img$  s$    66!3!3!5!56666r,   c                     t          j        t          | j                                                  dz   t           j                  }| j                                        D ]
\  }}|||<   |S Nr   rN   )r%   zerosmaxr  rJ  r   r  rw   mappingr0   r  s       r*   bpe2img_mapping_tensorz1Emu3ImageVocabularyMapping.bpe2img_mapping_tensor(  d    +c$,"3"3"5"566:%)LLLL&&(( 	 	DAqGAJJr,   c                     t          j        t          | j                                                  dz   t           j                  }| j                                        D ]
\  }}|||<   |S r  )r%   r  r  r  rJ  r   r  r  s       r*   img2bpe_mapping_tensorz1Emu3ImageVocabularyMapping.img2bpe_mapping_tensor/  r   r,   	img_batchr:   c                    |j         }t          j        |j        d         dft          j                  | j        z  }| j        |                    d                   }t          j        ||gd          }|                    |          S )Nr   r   r  cpur    r"   )	devicer%   r   r$   r   r  r  rX   r&   )rw   r  r  eol_row
img_tokenss        r*   convert_img2bpez*Emu3ImageVocabularyMapping.convert_img2bpe6  sx    !*ioa0!4EIFFFIZZ0e1D1DE
Y
G4"===
}}V$$$r,   c                     |j         }|dd df         }| j        |                    d                   }|                    |          S )N.r    r  )r  r  rX   )rw   r  r  r  s       r*   convert_bpe2imgz*Emu3ImageVocabularyMapping.convert_bpe2img=  sG    !c3B3h'	0e1D1DE
}}V$$$r,   N)r   r   r   r   rj   r   r  r  r  r  r  r  listr%   r   r	  r  r   r,   r*   r  r    s)        7 7 7
 j j _j k k _k ] ] _] 7 7 _7   _   _%el); % % % % %% %%, % % % % % %r,   r  c                   H    e Zd ZU eed<   dZdZdgZddgZdZ	dZ
dZdZdZdZdS )	Emu3PreTrainedModelrc   modelTr   rz   r]   FN)r   r   r   r   r  r  supports_gradient_checkpointingr  _skip_keys_device_placementr  r  _can_compile_fullgraph!_supports_param_buffer_assignmentr  r  r   r,   r*   r  r  D  se         &*# $5m"DN!(-%"&r,   r  c                   |     e Zd ZU ej        ed<   ddef fdZ ej                    e	d                         Z
 xZS )Emu3RotaryEmbeddinginv_freqNrc   c                    t                                                       t          |d          rSt          |j        t
                    r9|j                            d|j                            d                    | _        nd| _        |j        | _	        |j        | _
        || _        t          | j                 | _        |                     | j        |          \  }| _        |                     d|d           | j        | _        d S )Nrope_scaling	rope_typetypedefaultr  F)
persistent)ri   rj   hasattrr  r  dictr  r  max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenrc   r   rope_init_fnattention_scalingregister_bufferr  original_inv_freq)rw   rc   r  r  rx   s       r*   rj   zEmu3RotaryEmbedding.__init__Y  s    6>** 	'z&:Mt/T/T 	'#044[&BUBYBYZ`BaBabbDNN&DN"("@$*$B!/?+/+<+<T[&+Q+Q($(ZeDDD!%r,   c                 X   | j         d d d d f                                                             |j        d         dd                              |j                  }|d d d d d f                                         }t          |j        j        t                    r|j        j        dk    r|j        j        nd}t          j
        |d          5  |                                |                                z                      dd          }t          j        ||fd	          }|                                | j        z  }|                                | j        z  }	d d d            n# 1 swxY w Y   |                    |j        
          |	                    |j        
          fS )Nr   r    r   mpsr  F)device_typeenabledr!   r"   r  )r  floatr<   r$   rX   r  r  r  strr%   autocastrS   r&   r1   r#  r2   rN   )
rw   r'   r3   inv_freq_expandedposition_ids_expandedr(  freqsembr1   r2   s
             r*   r   zEmu3RotaryEmbedding.forwardj  s    !M$4-8>>@@GGHZ[\H]_acdeehhijiqrr ,QQQaaaZ 8 > > @ @'1!(-'E'Ek!(-[`J`J`ahmmfk^UCCC 	5 	5&,,..1F1L1L1N1NNYYZ[]^__E)UEN333C''))d44C''))d44C		5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 vvAGv$$cff17f&;&;;;s   BE++E/2E/r   )r   r   r   r%   r   r  r   rj   no_gradr   r   r   r   s   @r*   r  r  V  s         l/ /z / / / / / /" U]__< <  _< < < < <r,   r  c                       e Zd ZeedZdef fdZee		 	 	 	 	 	 	 dde
ej                 de
ej                 de
ej                 de
e         d	e
ej                 d
e
ej                 de
e         dee         defd                        Z xZS )Emu3TextModel)r8   
attentionsrc   c                    t                                                     j        | _        j        | _        t          j        j        j        | j                  | _        t          j	        fdt          j                  D                       | _        t          j        j                  | _        t!                    | _        d| _        |                                  d S )Nc                 0    g | ]}t          |          S r   )r   )r  rd   rc   s     r*   r  z*Emu3TextModel.__init__.<locals>.<listcomp>  s$    bbbYfi00bbbr,   r   rc   F)ri   rj   pad_token_idr  
vocab_sizerT   r   rl   embed_tokensrg  ri  num_hidden_layerslayersr   r   normr  
rotary_embgradient_checkpointingr  r   s    `r*   rj   zEmu3TextModel.__init__  s       !. +L):F<NPTP`aambbbb%H`BaBabbb
 
   28KLLL	-V<<<&+# 	r,   N	input_idsrH   r3   rz   inputs_embedsr   r   rK   r:   c           
      N   |d u |d uz  rt          d          ||                     |          }|r|t          | j                  }|B||                                nd}	t          j        |	|	|j        d         z   |j                  }||	                    d          }t          | j        |||||          }
|}|                     ||          }| j        d | j        j                 D ]} ||f|
||||d|}|                     |          }t          ||          S )	Nz:You must specify exactly one of input_ids or inputs_embedsr7  r   r   )r  )rc   input_embedsrH   r   rz   r3   )rH   r3   rz   r   r   )last_hidden_staterz   )rD  r:  r
   rc   get_seq_lengthr%   aranger$   r  r.   r   r>  r<  r;  r=  r   )rw   r@  rH   r3   rz   rA  r   r   rK   past_seen_tokensr]   r8   r   decoder_layers                 r*   r   zEmu3TextModel.forward  s    -t";< 	[YZZZ *.*;*;I*F*FM 	?0*$+>>>O!CRC^==???de+0< "2]5H5K"KTaTh, , ,N )33A66L(;&))+%
 
 
 &"oom\JJ![)H4;+H)HI 		 		M)M*) /-$7   MM 		-00&++
 
 
 	
r,   )NNNNNNN)r   r   r   r   ra   _can_record_outputsr   rj   r   r   r   r%   r   r   r	   r[  r   r   r   r   r   r   r   s   @r*   r3  r3  z  s-        *# 
z         151537+/5959$(8
 8
E,-8
 !.8
 u/0	8

 "%8
   128
 !!128
 D>8
 +,8
 
!8
 8
 8
 ^ 8
 8
 8
 8
 8
r,   r3  c                   r    e Zd ZU dgZddiZddgdgfiZeed<    fdZe	e
	 	 	 	 	 	 	 	 	 dd
eej                 deej                 deej                 dee         deej                 deej                 dee         deej                 deeej        f         dee         defd                        Z xZS )Emu3ForCausalLMlm_head.weightlm_headcolwise_repr8   logitsrc   c                     t                                          |           t          |          | _        |j        | _        t          j        |j        |j        d          | _        | 	                                 d S NFrg   )
ri   rj   r3  r  r9  rT   rp   rl   rM  r  r   s     r*   rj   zEmu3ForCausalLM.__init__  sj       "6**
 +y!3V5FUSSS 	r,   Nr   r@  rH   r3   rz   rA  labelsr   r   logits_to_keeprK   r:   c
                 R    | j         d|||||||d|
}|j        }t          |	t                    rt	          |	 d          n|	}|                     |dd|ddf                   }d}| | j        d||| j        j        d|
}t          |||j
        |j        |j                  S )a  
        Example:

        ```python
        >>> from transformers import Emu3Processor, Emu3ForConditionalGeneration
        >>> import torch
        >>> import requests
        >>> from PIL import Image

        >>> model = Emu3ForCausalLM.from_pretrained("BAAI/Emu3-Chat-hf", dtype=torch.bfloat16)
        >>> processor = Emu3Processor.from_pretrained("BAAI/Emu3-Chat-hf")

        >>> inputs = processor(text=["Can you write me a poem about winter."], return_tensors="pt").to(model.device)

        >>> generated_ids = model.generate(**inputs, max_new_tokens=100, do_sample=False)
        >>> processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        ```r@  rH   r3   rz   rA  r   r   NrO  rR  r9  lossrO  rz   r8   r4  r   )r  rD  r  r   slicerM  loss_functionrc   r9  r   rz   r8   r4  )rw   r@  rH   r3   rz   rA  rR  r   r   rS  rK   outputsr8   slice_indicesrO  rX  s                   r*   r   zEmu3ForCausalLM.forward  s    @ ,64: 	,
)%+')	,
 	,
 	,
 	,
  18B>SV8W8Wk~ot444]kmAAA}aaa,?@AA%4%pVFt{OeppioppD%#3!/)
 
 
 	
r,   )	NNNNNNNNr   )r   r   r   _tied_weights_keys_tp_plan_pp_planr   r  rj   r   r   r   r%   r   r   r	   r[  r   r   r   r   r   r   r   r   r   s   @r*   rK  rK    so        *+=)H_-z:;H      151537+/59-1$(59348
 8
E,-8
 !.8
 u/0	8

 "%8
   128
 )*8
 D>8
 !!128
 c5</08
 +,8
 
 8
 8
 8
 ^ 8
 8
 8
 8
 8
r,   rK  c                   &    e Zd ZddiZ fdZd Zd Zd Zd Zde	j
        d	e	j        fd
Zde	j
        d	e	j        fdZe	j        de	j        dedefd            Zde	j        de	j
        de	j
        fdZee	 	 	 	 	 	 	 	 	 ddee	j                 dee	j
                 d	ee	j                 dee	j                 dee	j                 dee         dee	j
                 dee         dee	j                 dee         deeef         fd                        Z xZS )	Emu3Modelztext_model.model
text_modelc                    t                                          |           t                              |j                  | _        t          |j                  | _        t          |j
                  | _        |                                  d S r   )ri   rj   r3  _from_configtext_configrb  r  	vq_configvqmodelr  vocabulary_mapvocabulary_mappingr  r   s     r*   rj   zEmu3Model.__init__  sp       '44V5GHH !122"<V=R"S"S 	r,   c                 4    | j                                         S r   )rb  get_input_embeddingsr   s    r*   rk  zEmu3Model.get_input_embeddings'  s    33555r,   c                 :    | j                             |           d S r   )rb  set_input_embeddingsrw   rG   s     r*   rm  zEmu3Model.set_input_embeddings*  s    ,,U33333r,   c                     || _         d S r   rb  rw   r  s     r*   set_decoderzEmu3Model.set_decoder-  s    !r,   c                     | j         S r   rp  r   s    r*   get_decoderzEmu3Model.get_decoder0  s
    r,   r  r  c                       j                             ||          } fd|D             }t          j        |          }|S )a  
        Tokenizes images into discrete tokens with VQGAN module. Converts
        obtained image tokens into BPE tokens and wraps with "boi" and "eoi"
        special tokens.

        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
                The tensors corresponding to the input images.
            image_sizes (`torch.LongTensor` of shape `(batch_size, 2)`):
                The sizes of the images in the batch, being (height, width) for each image.
        c                 h    g | ].}j                             |                                          /S r   )ri  r	  r  )r  tokensrw   s     r*   r  z.Emu3Model.get_image_tokens.<locals>.<listcomp>@  s7    uuuY_42BB6JJRRTTuuur,   )rg  r  r%   r&   )rw   r  r  image_tokens_listbpe_tokens_list
bpe_tokenss   `     r*   get_image_tokenszEmu3Model.get_image_tokens3  sL     !L//kJJuuuuctuuuY//
r,   c                                            ||          } fd|D             }                                  |          }t          j        ||          }|S )a7  
        Tokenizes images into discrete tokens with VQGAN module and embeds
        them with text embeddings layer

        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)):
                The tensors corresponding to the input images.
        c                 Z    g | ]'\  }}|j         j        z  |j         j        z  d z   z  (S r_  )rg  r  )r  r   r   rw   s      r*   r  z0Emu3Model.get_image_features.<locals>.<listcomp>N  sL     
 
 
 t|99et|Gi>ilm>mn
 
 
r,   )r{  rk  r%   split)rw   r  r  r  split_sizesimage_featuress   `     r*   get_image_featureszEmu3Model.get_image_featuresD  sv     ,,\;GG
 
 
 
!,
 
 
 52244\BB^[AAr,   r  r   r   c                     |ddddf                              d||dz             }| j                            |          }| j                            |          }|S )a  
        Decodes generated image tokens from language model to continuous pixel values
        with VQGAN module via upsampling.

        Args:
            image_tokens (`torch.LongTensor` of shape `(batch_size, num_of_tokens)`):
                The tensors corresponding to the input images.
            height (`int`):
                Height of the generated image before upsampling.
            width (`int`):
                Width of the generated image before upsampling.
        Nr    r   )r   ri  r  rg  r  )rw   r  r   r   	sequencesimages         r*   decode_image_tokenszEmu3Model.decode_image_tokensV  s`     !CRC(--b&%!)DD	.>>yII##L11r,   r@  rA  r  c                 \   |e| |                                  t          j        | j        j        t          j        |j                            k    }|                    d          }n|| j        j        k    }|                                }|	                    d          
                    |                              |j                  }|j        d         |j        d         z  }||                                         |                                k    rt          d| d|           |S )z
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
        equal to the length of multimodal features. If the lengths are different, an error is raised.
        N)rN   r  r    r   r   z6Image features and image tokens do not match: tokens: z, features )rk  r%   tensorri  r  longr  allr   r.   	expand_asrX   r$   numelrD  )rw   r@  rA  r  special_image_maskn_image_tokensn_image_featuress          r*   get_placeholder_maskzEmu3Model.get_placeholder_maski  s1    !.2M$2K2K2M2MT4C5:^k^rsss3 3 " "4!7!7!;!;!*d.E.T!T+//11/99"==GGVVYYZgZnoo)/2^5I!5LL+,22448L8L8N8NNNvvvdtvv   "!r,   NrH   r3   rz   r   r   rK   r:   c
           
      T   |du |duz  rt          d          | |                                 |          }|Z|                     ||          }t          j        |d          }|                     |||          }|                    ||          } | j        d||||||	d|
}|S )ap  
        image_sizes (`torch.LongTensor` of shape `(batch_size, 2)`):
            The sizes of the images in the batch, being (height, width) for each image. Image sizes can be obtained using
            [`AutoImageProcessor`]. See [`Emu3ImageProcessor.__call__`] for details ([]`Emu3Processor`] uses
            [`Emu3ImageProcessor`] for processing images).
        NzaYou cannot specify both input_ids and inputs_embeds at the same time, and must specify either oner   r"   )rA  r  )rH   r3   rz   rA  r   r   r   )rD  rk  r  r%   r&   r  masked_scatterrb  )rw   r@  r  r  rH   r3   rz   rA  r   r   rK   image_embedsr  r[  s                 r*   r   zEmu3Model.forward  s    * -t";< 	s    7D5577	BBM#22<MML 9\q999L!%!:!:| "; " " *889K\ZZM "$/ 
)%+')
 
 
 
 r,   )	NNNNNNNNN)r   r   r   _checkpoint_conversion_mappingrj   rk  rm  rr  rt  r%   r[  r   r{  r  r1  r   r  r  r   r   r   r   r	   r   r   r   r   r   r   r   r   r   s   @r*   ra  ra    sG       &8,%G"    6 6 64 4 4" " "  U-> UM]    "u/@ uO_    $ ]0@ # VY    ]$")":?:K"]b]n" " " "0  1548.21537+/59$(59. .E,-. u01. el+	.
 !.. u/0. "%.   12. D>. !!12. +,. 
u,,	-. . . ^ . . . . .r,   ra  c                   6    e Zd ZdZdgZddddZ fdZd Zd	 Zd
e	j
        fdZd Zd Zed             Zed             Zed             Zd Zee	 	 	 	 	 	 	 	 	 	 	 d#deej                 deej                 deej                 deej                 deej                 dee         deej                 dee         deej                 deej                 deeej        f         dee         d
ee e!f         fd                         Z"	 	 	 	 	 	 	 d$ fd"	Z# xZ$S )%Emu3ForConditionalGeneration rL  zmodel.text_modelzmodel.vqmodelrM  )z^text_model.modelz^vqmodelz^text_model.lm_headc                     t                                          |           t          |          | _        t	          j        |j        j        |j        j        d          | _	        | 
                                 d S rQ  )ri   rj   ra  r  rT   rp   re  rl   r9  rM  r  r   s     r*   rj   z%Emu3ForConditionalGeneration.__init__  se       v&&
y!3!?ASA^ejkkkr,   c                 4    | j                                         S r   )r  rk  r   s    r*   rk  z1Emu3ForConditionalGeneration.get_input_embeddings  s    z..000r,   c                 :    | j                             |           d S r   )r  rm  rn  s     r*   rm  z1Emu3ForConditionalGeneration.set_input_embeddings  s    
''.....r,   r:   c                     | j         S r   )rM  r   s    r*   get_output_embeddingsz2Emu3ForConditionalGeneration.get_output_embeddings  s
    |r,   c                 :    | j                             |           d S r   )r  rr  rq  s     r*   rr  z(Emu3ForConditionalGeneration.set_decoder  s    
w'''''r,   c                 4    | j                                         S r   )r  rt  r   s    r*   rt  z(Emu3ForConditionalGeneration.get_decoder  s    z%%'''r,   c                     | j         j        S r   )r  rb  r   s    r*   rb  z'Emu3ForConditionalGeneration.text_model  s    z$$r,   c                     | j         j        S r   )r  rg  r   s    r*   rg  z$Emu3ForConditionalGeneration.vqmodel  s    z!!r,   c                     | j         j        S r   )r  ri  r   s    r*   ri  z/Emu3ForConditionalGeneration.vocabulary_mapping  s    z,,r,   c                 &     | j         j        di |S r>  )r  r  )rw   rK   s     r*   r  z0Emu3ForConditionalGeneration.decode_image_tokens  s    -tz-77777r,   Nr   r@  r  r  rH   r3   rz   rA  r   r   rR  rS  rK   c                 ^    | j         d|||||||	d|}|d         }t          |t                    rt          | d          n|}|                     |dd|ddf                   }d}|
  | j        d||
| j        j        j        d|}t          |||j
        |j        |j                  S )an  
        image_sizes (`torch.LongTensor` of shape `(batch_size, 2)`):
            The sizes of the images in the batch, being (height, width) for each image. Image sizes can be obtained using
            [`AutoImageProcessor`]. See [`Emu3ImageProcessor.__call__`] for details ([]`Emu3Processor`] uses
            [`Emu3ImageProcessor`] for processing images).
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import Emu3Processor, Emu3ForConditionalGeneration
        >>> import torch
        >>> import requests
        >>> from PIL import Image

        >>> model = Emu3ForConditionalGeneration.from_pretrained("BAAI/Emu3-Chat-hf", dtype=torch.bfloat16)
        >>> processor = Emu3Processor.from_pretrained("BAAI/Emu3-Chat-hf")

        >>> conversation = [
        ...     {
        ...     "role": "system",
        ...     "content": [
        ...         {"type": "text", "text": "You are a helpful assistant."},
        ...         ],
        ...     },
        ...     {
        ...     "role": "user",
        ...     "content": [
        ...         {"type": "image"},
        ...         {"type": "text", "text": "Please describe the image."},
        ...         ],
        ...     },
        ... ]

        >>> prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
        >>> image = Image.open(requests.get("https://www.ilankelman.org/stopsigns/australia.jpg", stream=True).raw)

        >>> inputs = processor(images=[image], text=[prompt], return_tensors="pt").to(model.device, torch.bfloat16)

        >>> generated_ids = model.generate(**inputs, max_new_tokens=100, do_sample=False)
        >>> processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        ```rU  r   NrV  rW  r   )r  r  r   rY  rM  rZ  rc   re  r9  r   rz   r8   r4  )rw   r@  r  r  rH   r3   rz   rA  r   r   rR  rS  rK   r[  r8   r\  rO  rX  s                     r*   r   z$Emu3ForConditionalGeneration.forward  s	   | $* 	
)%+')	
 	
 	
 	
  
8B>SV8W8Wk~ot444]kmAAA}aaa,?@AA%4% f9P9[ _e D &#3!/)
 
 
 	
r,   Tc	                 n     t                      j        |f|||||||d|	}
|d         dk    rd |
d<   |
S )N)rz   rH   rA  r   r3   r  r   r   r  )ri   prepare_inputs_for_generation)rw   r@  rz   rH   rA  r   r3   r   r  rK   model_inputsrx   s              r*   r  z:Emu3ForConditionalGeneration.prepare_inputs_for_generation?  sk     =uww<

+)')%%

 

 

 

 !!!+/L(r,   )NNNNNNNNNNr   )NNNNNTN)%r   r   r   r  r]  r  rj   rk  rm  rT   rl  r  rr  rt  propertyrb  rg  ri  r  r   r   r   r%   r   r[  r   r	   r   r   r   r   r   r   r   r   r  r   r   s   @r*   r  r    s       *+/#(& &"    1 1 1/ / /ry    ( ( (( ( ( % % X% " " X" - - X-8 8 8  1548.21537+/59$(59-134X
 X
E,-X
 u01X
 el+	X

 !.X
 u/0X
 "%X
   12X
 D>X
 !!12X
 )*X
 c5</0X
 +,X
 
u,,	-X
 X
 X
 ^ X
z          r,   r  )r  rK  r3  r  r  ra  )Nr   )rC   )Wr  	functoolsr   typingr   r   r   r%   torch.nnrT   torch.nn.functionalrU   r   activationsr   cache_utilsr	   r
   
generationr   integrationsr   masking_utilsr   modeling_layersr   modeling_outputsr   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   utils.deprecationr   utils.genericr   configuration_emu3r   r   r   r+   r7   r   r   rB   rl  r*  r_   ra   r   r   r   r   r   r   r   r  r!  r+  r/  r:  rA  r  rM  rS  r]  rz  r  r  r  r  r  r  r3  rK  ra  r  __all__r   r,   r*   <module>r     s  .  % % % % % % , , , , , , , , , ,                 ! ! ! ! ! ! . . . . . . . . ) ) ) ) ) ) 7 7 7 7 7 7 / / / / / / 9 9 9 9 9 9 O O O O O O O O K K K K K K K K F F F F F F F F & & & & & & I I I I I I I I I I 0 0 0 0 0 0 / / / / / / K K K K K K K K K K( ( (   6	UU\ 	U# 	U%, 	U 	U 	U 	U& % %I%<% 
% <	%
 U\*% % % '(% % % %4D) D) D) D) D)BI D) D) D)N Y''J J J J J") J J ('J(    bi    + + + + +1 + + +\$ $ $ $ $ry $ $ $D	 	 	 	 	RY 	 	 	    29       bi   :! ! ! ! !29 ! ! !H    	   .    ")   &.( .( .( .( .(29 .( .( .(b<( <( <( <( <(29 <( <( <(~>) >) >) >) >)bi >) >) >)BV V V V V V V V    29   D8 8 8 8 8 8 8 8v7 7 7 7 7ry 7 7 7tC C C C Cry C C CLC C C C Cry C C CL   l2 l2 l2 l2 l2 l2 l2 l2^3% 3% 3% 3% 3% 3% 3% 3%l ' ' ' ' '/ ' ' '"!< !< !< !< !<") !< !< !<H P
 P
 P
 P
 P
' P
 P
 P
f I
 I
 I
 I
 I
)? I
 I
 I
XV V V V V# V V Vrh h h h h#6 h h hV  r,   