
     `i                        d dl Z d dlmZ d dlmZmZ d dlZd dlmZ d dl	mc m
Z ddlmZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZmZmZ ddlmZ ddlmZmZ ddlm Z m!Z!m"Z"m#Z#m$Z$ ddl%m&Z& ddl'm(Z(m)Z)m*Z*  ej+        e,          Z- G d de           Z. G d de!          Z/ G d dej0                  Z1 G d de          Z2 G d dej0                  Z3 G d dej0                  Z4 G d dej0                  Z5 G d  d!ej0                  Z6 G d" d#ej0                  Z7 G d$ d%ej0                  Z8 G d& d'ej0                  Z9 G d( d)e&          Z: G d* d+ej;                  Z< G d, d-ej0                  Z= G d. d/ej0                  Z> G d0 d1ej0                  Z? G d2 d3ej0                  Z@ G d4 d5ej0                  ZA ed67           G d8 d9e                      ZB G d: d;          ZC G d< d=eeB          ZD G d> d?e#eD          ZE G d@ dAe"eDe          ZF G dB dCeD          ZG G dD dEeDe          ZHg dFZIdS )G    N)cached_property)OptionalUnion   )Cache)GenerationMixin)CausalLMOutputWithPast)PreTrainedModel)Unpack)auto_docstringcan_return_tuplelogging)deprecate_kwarg   )ChameleonPreTrainedModel#ChameleonVQVAEEncoderConvDownsample)LlamaAttentionLlamaDecoderLayerLlamaForCausalLM
LlamaModelTransformersKwargs)SiglipAttention   )
Emu3ConfigEmu3TextConfigEmu3VQVAEConfigc                       e Zd ZdS )Emu3AttentionN__name__
__module____qualname__     y/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/emu3/modular_emu3.pyr   r   ,           Dr$   r   c                   4    e Zd Zdedef fdZ eddd          	 	 	 	 	 	 dd
ej        de	ej                 de	ej
                 de	e         de	e         de	ej
                 de	eej        ej        f                  dee         dej        fd            Z xZS )Emu3DecoderLayerconfig	layer_idxc                     t                                          ||           t          j        |j                  | _        d S N)super__init__nnDropoutattention_dropoutdropout)selfr)   r*   	__class__s      r%   r.   zEmu3DecoderLayer.__init__2   s5    +++z&":;;r$   past_key_valuepast_key_valuesz4.58)new_nameversionNFhidden_statesattention_maskposition_ids	use_cachecache_positionposition_embeddingskwargsreturnc                    |}	|                      |          } | j        d|||||||d|\  }}
|	|                     |          z   }|}	|                     |          }|                     |          }|	|                     |          z   }|S )N)r9   r:   r;   r6   r<   r=   r>   r#   )input_layernorm	self_attnr2   post_attention_layernormmlp)r3   r9   r:   r;   r6   r<   r=   r>   r?   residual_s              r%   forwardzEmu3DecoderLayer.forward6   s     !,,];;)4> 	
')%+) 3	
 	
 	
 	
q !4<<#>#>> 55mDD// 4<<#>#>>r$   )NNNFNN)r    r!   r"   r   intr.   r   torchTensorr   
LongTensorr   booltupler   r   rH   __classcell__r4   s   @r%   r(   r(   1   s-       <z <c < < < < < < _%0A6RRR 2637+/$)59KO | !. u/0	
 "% D> !!12 &eEL%,,F&GH +, 
   SR    r$   r(   c                   >     e Zd ZdZdef fdZdej        fdZ xZ	S )Emu3VQVAEVectorQuantizera  
    A module for vector quantization using learned embedding vectors.

    This module implements the quantization process similar to te one described in
    the VQ-VAE (Vector Quantized Variational AutoEncoder) paper. It quantizes continuous
    input vectors into discrete codebook vectors, which are learned during training.
    Current implementation improves over previous ones by avoiding costly matrix multiplications
    and allowing for post-hoc remapping of indices.
    r)   c                     t                                                       t          j        |j        |j                  | _        | j        j        j        	                    d|j        z  d|j        z             d S )Ng            ?)
r-   r.   r/   	Embeddingcodebook_size	embed_dim	embeddingweightdatauniform_r3   r)   r4   s     r%   r.   z!Emu3VQVAEVectorQuantizer.__init__c   sf    f&:F<LMM"++D63G,GvOcIcdddddr$   hidden_statec                    |j         \  }}}}}|                    ddddd                                          }|                    d|          }t	          j        |dz  dd          }t	          j        | j        j        dz  d	          }	dt	          j        || j        j        	                    dd                    z  }
||	z   |
z
  }
t	          j
        |
d	          }|                    ||||          }|S )
Nr   r   r      r   T)dimkeepdimra   )shapepermute
contiguousviewrJ   sumrX   rY   matmul	transposeargmin)r3   r]   
batch_sizetemporalchannelsheightwidthhidden_state_flattenedhidden_state_sumembedding_sum	distancesmin_encoding_indicess               r%   rH   z Emu3VQVAEVectorQuantizer.forwardh   s   8D8J5
Hh#++Aq!Q::EEGG!-!2!22x!@!@ !9%;Q%>AtTTT	$."7":BBB %;T^=R=\=\]^`a=b=bccc	$}4y@	$|I1===388XvW\]]##r$   )
r    r!   r"   __doc__r   r.   rJ   rK   rH   rO   rP   s   @r%   rR   rR   X   sr         e e e e e e e
$EL $ $ $ $ $ $ $ $r$   rR   c                       e Zd ZdS )Emu3VQVAEEncoderConvDownsampleNr   r#   r$   r%   rx   rx   z   r&   r$   rx   c                   $     e Zd Z fdZd Z xZS )Emu3VQVAEEncoderConvUpsamplec                     t                                                       t          j        ||ddd          | _        d S )Nr   r   kernel_sizestridepadding)r-   r.   r/   Conv2dconv)r3   in_channelsr4   s     r%   r.   z%Emu3VQVAEEncoderConvUpsample.__init__   s:    Ik;AaYZ[[[			r$   c                 ^    t          j        |dd          }|                     |          }|S )N       @nearestscale_factormode)Finterpolater   r3   r9   s     r%   rH   z$Emu3VQVAEEncoderConvUpsample.forward   s/    m#IVVV		-00r$   r    r!   r"   r.   rH   rO   rP   s   @r%   rz   rz   ~   sL        \ \ \ \ \      r$   rz   c            	       ^     e Zd Zdededee         dee         f fdZdej        fdZ xZ	S )Emu3VQVAEConv3d
in_channelout_channelr}   r~   c                 \   t                                                       d t          |dd          |dd                    D             }d| _        |d d d         D ] }| xj        |dz  |dz  z   |dz  fz  c_        !| xj        dz  c_        t	          j        ||||          | _        d S )Nc                     g | ]
\  }}||z
  S r#   r#   ).0
one_kernel
one_strides      r%   
<listcomp>z,Emu3VQVAEConv3d.__init__.<locals>.<listcomp>   s"    ppp5KZj0pppr$   r   r#   r`   r   )r   r   )r~   )r-   r.   zipr   r/   Conv3dr   )r3   r   r   r}   r~   padding_sizespad_sizer4   s          r%   r.   zEmu3VQVAEConv3d.__init__   s     	ppsS^_`_a_aSbdjklkmkmdnOoOoppp%ddd+ 	J 	JHLLX]X\98q=IILLLI	
 
 
			r$   r9   c                 d    t          j        || j                  }|                     |          }|S r,   )r   padr   r   r   s     r%   rH   zEmu3VQVAEConv3d.forward   s,    mT\::		-00r$   )
r    r!   r"   rI   rN   r.   rJ   rK   rH   rO   rP   s   @r%   r   r      s        

 
 3Z	

 c

 
 
 
 
 
,U\        r$   r   c                   L     e Zd Zdedef fdZdej        dej        fdZ xZS )Emu3VQVAESpatialNormr   out_channelsc                     t                                                       t          j        |ddd          | _        t          j        ||ddd          | _        t          j        ||ddd          | _        d S )N    ư>Tnum_channels
num_groupsepsaffiner   r   r|   )r-   r.   r/   	GroupNorm
norm_layerr   conv_yconv_br3   r   r   r4   s      r%   r.   zEmu3VQVAESpatialNorm.__init__   s    
 	,%	
 
 
 i
 
 
 i
 
 
r$   r9   quant_statesc                     t          j        ||j        dd          d          }|                     |          }||                     |          z  |                     |          z   }|S )Nr   )sizer   )r   r   rd   r   r   r   )r3   r9   r   s      r%   rH   zEmu3VQVAESpatialNorm.forward   sd    }\8KBCC8PW`aaa66%L(A(AADKKP\D]D]]r$   	r    r!   r"   rI   r.   rJ   rK   rH   rO   rP   s   @r%   r   r      su        

 
 
 
 
 
 
8U\         r$   r   c                   >     e Zd Zdedef fdZdej        fdZ xZS )Emu3VQVAETemporalUpsampler   r   c                 x    t                                                       t          ||dd          | _        d S )Nr   r   r   r   r   r   r}   r~   r-   r.   r   r   r3   r   r   r4   s      r%   r.   z"Emu3VQVAETemporalUpsample.__init__   A    
 	#!	
 
 
			r$   r9   c                 |   |j         \  }}}}}|                    ddddd                                                              |d|          }t	          j        |dd	          }|                    ||||d                              ddddd                                          }|                     |          }|S )
Nr   r   r   r_   r   r`   r   r   r   )rd   re   rf   rg   r   r   r   )r3   r9   rl   rn   rm   ro   rp   s          r%   rH   z!Emu3VQVAETemporalUpsample.forward   s    8E8K5
Hh%--aAq!<<GGIINNz[]_ghhm#IVVV%**:xPRSS[[\]_`bcefhijjuuww		-00r$   r   rP   s   @r%   r   r      sl        

 
 
 
 
 
 
U\        r$   r   c                   >     e Zd Zdedef fdZdej        fdZ xZS )Emu3VQVAETemporalDownsampler   r   c                 x    t                                                       t          ||dd          | _        d S )N)r_   r   r   )r   r   r   r   r   r   s      r%   r.   z$Emu3VQVAETemporalDownsample.__init__   r   r$   r9   c                 0    |                      |          }|S r,   )r   r   s     r%   rH   z#Emu3VQVAETemporalDownsample.forward   s    		-00r$   r   rP   s   @r%   r   r      sl        

 
 
 
 
 
 
U\        r$   r   c                   (     e Zd Z	 d fd	Zd Z xZS )Emu3VQVAETemporalResnetBlockNc                    t                                                       || _        ||n|| _        t	          j        |          | _        t          ||dd          | _        t	          j        |          | _	        t          ||dd          | _
        | j        | j        k    r t	          j        ||ddd          | _        d S d S )Nr   r   r   r   r   r|   )r-   r.   r   r   r/   BatchNorm3dnorm1r   conv1norm2conv2r   nin_shortcutr   s      r%   r.   z%Emu3VQVAETemporalResnetBlock.__init__   s    
 	&+7+?KK\^K00
$!	
 
 

 ^L11
$!	
 
 

 t000 "	! ! !D 10r$   c                 ^   |}|                      |          }|t          j        |          z  }|                     |          }|                     |          }|t          j        |          z  }|                     |          }| j        | j        k    r|                     |          }||z   S r,   )	r   rJ   sigmoidr   r   r   r   r   r   )r3   r9   rF   s      r%   rH   z$Emu3VQVAETemporalResnetBlock.forward  s     

=11}555

=11

=11}555

=11t000((22H-''r$   r,   r   rP   s   @r%   r   r      sR              @( ( ( ( ( ( (r$   r   c                   |     e Zd Z	 	 ddedee         dee         f fdZd	dej        deej                 fdZ xZ	S )
Emu3VQVAEResnetBlockNr   r   quant_channelsc                 $   t                                                       || _        ||n|}|| _        || _        |;t          j        |ddd          | _        t          j        |ddd          | _        n*t          ||          | _        t          ||          | _        t          j
        ||ddd          | _        t          j
        ||ddd          | _        | j        | j        k    r t          j
        ||ddd          | _        d S d S )	Nr   r   Tr   r   r   r|   r   )r-   r.   r   r   r   r/   r   r   r   r   r   r   r   r   )r3   r   r   r   r4   s       r%   r.   zEmu3VQVAEResnetBlock.__init__&  s<    	&&2&:{{(,!;2SW`deeeDJ<BTXaefffDJJ-nkJJDJ-nlKKDJY
 
 

 Y
 
 

 t000 "	! ! !D 10r$   r9   c                 Z   | j         dn|f}|} | j        |g|R  }|t          j        |          z  }|                     |          } | j        |g|R  }|t          j        |          z  }|                     |          }| j        | j        k    r| 	                    |          }||z   S Nr#   )
r   r   rJ   r   r   r   r   r   r   r   )r3   r9   r   	norm_argsrF   s        r%   rH   zEmu3VQVAEResnetBlock.forwardR  s    -5BBN;L	 "
==9===}555

=11"
==9===}555

=11t000((22H-''r$   )NNr,   )
r    r!   r"   rI   r   r.   rJ   rK   rH   rO   rP   s   @r%   r   r   %  s         '+(,	* ** sm* !	* * * * * *X( (U\ (8ELCY ( ( ( ( ( ( ( (r$   r   c                   $     e Zd Zdef fdZ xZS )Emu3VQVAEAttentionBlockr)   c                 X    t                                          |           d| _        d S )Nr   )r-   r.   num_key_value_groupsr\   s     r%   r.   z Emu3VQVAEAttentionBlock.__init__e  s+        %&!!!r$   )r    r!   r"   r   r.   rO   rP   s   @r%   r   r   d  sD        & & & & & & & & & & &r$   r   c                   *     e Zd ZdZ fdZddZ xZS )Emu3VQVAEGroupNormz
    Same as the torch GroupNorm with the only difference that this ones accepts
    an optional kwarg `quant_states` which is not used. This class makes it easier to
    use SpatialNorm or GroupNorm without conditionals
    c                 :     t                      j        di | d S r   )r-   r.   )r3   r?   r4   s     r%   r.   zEmu3VQVAEGroupNorm.__init__s  s&    ""6"""""r$   Nc                 Z    t          j        || j        | j        | j        | j                  S r,   )r   
group_normr   rY   biasr   )r3   inputr   s      r%   rH   zEmu3VQVAEGroupNorm.forwardv  s"    |E4?DKDHUUUr$   r,   )r    r!   r"   rv   r.   rH   rO   rP   s   @r%   r   r   l  s^         # # # # #V V V V V V V Vr$   r   c                   R     e Zd Zd fd	Zddej        deej                 fdZ xZS )Emu3VQVAEMiddleBlockNc                 ,   t                                                       t          |||          | _        t	          |          | _        |t          |ddd          | _        nt          ||          | _        t          |||          | _	        d S )Nr   r   r   r   r   Tr   )
r-   r.   r   block_1r   attn_1r   	attn_normr   block_2)r3   r)   r   r   r4   s       r%   r.   zEmu3VQVAEMiddleBlock.__init__{  s    +#$)
 
 

 .f55!/[UW]ajnoooDNN1.+NNDN+#$)
 
 
r$   r9   r   c                    |                      ||          }|}|                     ||          }|j        \  }}}}|                    ||||z                                dd          }|                     |          d         }|                    ||||                              dddd          }||z   }|                     ||          }|S )Nr   r   r   r   )	r   r   rd   rg   rj   r   reshapere   r   )r3   r9   r   rF   rl   rn   ro   rp   s           r%   rH   zEmu3VQVAEMiddleBlock.forward  s    ]LAA }lCC.;.A+
Hfe%**:x%PPZZ[\^_``M2215%--j&%RRZZ[\^_abdeff =0]LAAr$   r,   )	r    r!   r"   r.   rJ   FloatTensorr   rH   rO   rP   s   @r%   r   r   z  so        
 
 
 
 
 
(
 
U%6 
huO`Fa 
 
 
 
 
 
 
 
r$   r   c                   4     e Zd Z fdZdej        fdZ xZS )Emu3VQVAEDownBlockc           
         t                                                       t          |j                  | _        |j        | _        |j        }|j        }dt          |          z   }|| _        t          j
                    | _        t          | j                  D ]P}t          j
                    }t          j
                    }t          j
                    }|||         z  }	|||         z  }
t          | j                  D ]}|                    t          |	|
                     |
}	|j        V||j        v rM|                    t!          |                     |                    t          j        |	ddd                     t          j                    }||_        ||_        ||_        || j        dz
  k    rt-          |	          |_        | j                            |           Rd S )Nr   r   r   r   r   Tr   r   )r-   r.   lenchannel_multipliernum_resolutionsnum_res_blocksbase_channelsrN   in_channel_multiplierr/   
ModuleListdownrangeappendr   attn_resolutionsr   r   Moduleblockattn
attn_normsrx   
downsample)r3   r)   r   r   r   i_levelr   r   r   block_in	block_outi_blockr   r4   s                r%   r.   zEmu3VQVAEDownBlock.__init__  s   "6#<==$3,#6 $u-?'@'@ @%:"MOO	T122 	# 	#GMOOE=??DJ$'<W'EEH%(:7(CCI !455 
q 
q($,%.     %*67fF];];]KK 7 ? ?@@@%%blUW]ajn&o&o&oppp9;;DDJDI(DO$.222"@"J"JIT""""1	# 	#r$   r9   c                 P   t          | j                  D ]\  }}t          | j                  D ]} |j        |         |          }t          |j                  dk    r|} |j        |         |          }|j        \  }}}}	|	                    ||||	z            
                    dd          } |j        |         |          d         }|                    |||	|                              dddd          }||z   }|| j        dz
  k    r|                    |          }|S )Nr   r   r   r   )	enumerater   r   r   r   r   r   r   rd   rg   rj   r   re   r   r   )
r3   r9   r   blocksr   rF   rl   rn   ro   rp   s
             r%   rH   zEmu3VQVAEDownBlock.forward  sF   (33 	A 	AOGV !455 = = 5W 5m D Dv{##a'',H$>F$5g$>}$M$MM:G:M7J&%$1$6$6z8VV[^$\$\$f$fghjk$l$lM$8FK$8$G$G$JM$1$9$9*feU]$^$^$f$fghjkmnpq$r$rM$,}$<M$.222 & 1 1- @ @r$   r    r!   r"   r.   rJ   r   rH   rO   rP   s   @r%   r   r     sW        ## ## ## ## ##JU%6        r$   r   c                   B     e Zd Z fdZdej        dej        fdZ xZS )Emu3VQVAEUpBlockc           	         t                                                       t          |j                  | _        |j        | _        |j        }|j        |j        d         z  }t          j	                    | _
        t          t          | j                            D ]=}t          j	                    }t          j	                    }t          j	                    }|j        |j        |         z  }t          | j        dz             D ]w}	|                    t          |||                     |}||j        v rE|                    t!          |                     |                    t#          ||                     xt          j                    }
||
_        ||
_        ||
_        |dk    rt-          |          |
_        | j
                            d|
           ?d S )Nr`   r   r   r   )r-   r.   r   r   r   r   rW   r   r/   r   upreversedr   r   r   r   r   r   r   r   r   r   rz   upsampleinsert)r3   r)   r   r   r   r   r   r   r   r   r  r4   s              r%   r.   zEmu3VQVAEUpBlock.__init__  s   "6#<==$3)'&*CB*GG-//d&: ; ;<< 	" 	"GMOOE=??DJ,v/H/QQI !4q!899 V V($,%.'5     %f555KK 7 ? ?@@@%%&:>8&T&TUUUBBHBG&BM!||:8DDGNN1b!!!!3	" 	"r$   r9   r   c                    t          | j        d d d                   D ]!\  }}t          | j        dz             D ]} |j        |         ||          }t          |j                  dk    r|} |j        |         ||          }|j        \  }}}	}
|	                    |||	|
z            
                    dd          } |j        |         |          d         }|                    ||	|
|                              dddd          }||z   }|t          | j                  dz
  k    r|                    |          }#|S )Nr`   r   r   r   r   )r  r  r   r   r   r   r   r   rd   rg   rj   r   re   r
  )r3   r9   r   r   r  r   rF   rl   rn   ro   rp   s              r%   rH   zEmu3VQVAEUpBlock.forward  sZ   (277 	? 	?OGV !4q!899 = = 5W 5m\ R Rv{##a'',H$>F$5g$>}l$[$[M:G:M7J&%$1$6$6z8VV[^$\$\$f$fghjk$l$lM$8FK$8$G$G$JM$1$9$9*feU]$^$^$f$fghjkmnpq$r$rM$,}$<M#dg,,*** & > >r$   r  rP   s   @r%   r  r    sa        #" #" #" #" #"JU%6 eFW        r$   r  c                   4     e Zd Z fdZdej        fdZ xZS )Emu3VQVAEEncoderc                    t                                                       |j        }|j        }|j        }|j        }|j        }|rd|z  n|}||d         z  }t          j        	                    ||ddd          | _
        t          |          | _        t          ||          | _        t          j                            d|dd	          | _        t          j        	                    ||ddd          | _        t%          t'          j        |j                            }	t          j                    | _        t          j                    | _        t3          |	          D ],}
t5          ||          }| j                            |           -t3          |j                  D ]-}t;          ||
          }| j                            |           .d S )Nr   r`   r   r   r|   r   r   T)r   r   r   r   r   )r-   r.   r   r   double_latentlatent_channelsr   rJ   r/   r   conv_inr   
down_blockr   middle_blockr   norm_outconv_outrI   mathlog2temporal_downsample_factorr   	time_convtime_res_stackr   r   r   r   r   )r3   r)   r   r   r  r  r   r   r   temporal_down_blocksir   rG   time_res_convr4   s                 r%   r.   zEmu3VQVAEEncoder.__init__  s   ,(, 0#6.;Pq?** #5b#99x{MqYZdeff,V440BB**bxUYbf*gg ( 
 
  #49V-N#O#OPP moo+,, 	( 	(A.|\JJDN!!$''''v,-- 	6 	6A8()  M &&}5555	6 	6r$   pixel_valuesc                 t   |j         d         } |j        dg|j         dd          R  }|                     |          }|                     |          }|                     |          }|                     |          }|t          j        |          z  }|                     |          } |j        d|g|j         dd          R  }|	                    ddddd          }| j
        D ]$} ||          }|t          j        |          z  }%| j        D ]} ||          }|	                    ddddd          }|S )Nr   r`   r   r   r   r_   )rd   r   r  r  r  r  rJ   r   r  re   r  r  )r3   r  temporal_dimr9   r   layers         r%   rH   zEmu3VQVAEEncoder.forward9  sf   #)!,+|+BH1CABB1GHHH \2266))-88 m44}555m44--b,YATUVUWUWAXYYY%--aAq!<< N 	: 	:D D//MU]=999MM( 	1 	1E!E-00MM%--aAq!<<r$   )r    r!   r"   r.   rJ   rL   rH   rO   rP   s   @r%   r  r    sW        %6 %6 %6 %6 %6NE$4        r$   r  c                   H     e Zd Zdef fdZdej        dej        fdZ xZS )Emu3VQVAEDecoderr)   c                    t                                                       |j        }|j        |j        d         z  }t          j                    | _        t          |j	                  D ]7}t          |j        |j                  }| j                            |           8t          t          j        |j                            }t          j                    | _        t          |          D ]6}t%          |j        |j                  }| j                            |           7t          j        |j        |ddd          | _        t+          |||          | _        t/          |          | _        |j        |j        d         z  }t3          ||          | _        t          j        ||j        ddd          | _        d S )Nr`   r   r   r   r|   )r   r   )r-   r.   rW   r   r   r/   r   r  r   r   r   r  r   rI   r  r  r  r  r   r   r  r   r  r  up_blockr   r  r   r  )
r3   r)   r   r   rG   r  temp_upsample_block_numr  r   r4   s
            r%   r.   zEmu3VQVAEDecoder.__init__X  s   )'&*CB*GG moov,-- 	6 	6A8"2AW  M &&}5555"%di0Q&R&R"S"S.// 	( 	(A,V-CVE[\\DN!!$''''y"
 
 
 1R`aaa(00'&*CA*FF,^XFF	
 
 
r$   r9   r   c                    t          j        ||fd          }|                    ddddd          }| j        D ]} ||          }| j        D ]$} ||          }|t          j        |          z  }%|                    ddddd          }t          j        |dd          \  }} |j        dg|j        dd          R  } |j        dg|j        dd          R  }| 	                    |          }| 
                    ||          }|                     ||          }|                     ||          }|t          j        |          z  }|                     |          }|S )Nr   rc   r   r   r   r_   r`   )rJ   catre   r  r  r   chunkr   rd   r  r  r&  r  r  )r3   r9   r   hidden_quant_statesr"  s        r%   rH   zEmu3VQVAEDecoder.forward  s   #i(E1MMM199!Q1aHH ( 	= 	=E"'%(;"<"<^ 	F 	FE"'%(;"<"<5=1D#E#EE199!Q1aHH&+k2Eqa&P&P&P#|--bK=3Fqrr3JKKK+|+BH1CABB1GHHH]33 ))-FFm\BBm\BB}555m44r$   )	r    r!   r"   r   r.   rJ   rK   rH   rO   rP   s   @r%   r$  r$  W  sk        %
 %
 %
 %
 %
 %
 %
NU\         r$   r$  aR  
    The VQ-VAE model used in Emu3 for encoding/decoding images into discrete tokens.
    This model follows the "Make-a-scene: Scene-based text-to-image generation with human priors" paper from
    [ Oran Gafni, Adam Polyak, Oron Ashual, Shelly Sheynin, Devi Parikh, and Yaniv
    Taigman](https://huggingface.co/papers/2203.13131).
    )custom_introc                        e Zd ZU eed<   dZdZdZdZdZ	dZ
g dZd Zdef fdZdej        dej        fd	Zd
ej        fdZ xZS )	Emu3VQVAEr)   
emuvideovqr  T)r   r   r   rR   c                    t          |t          j        t          j        f          rt          j                            |j        dd           |j        gt          j                            |j                  \  }}dt          j
        |          z  }t          j                            |j        | |           d S d S t          |t          j                  rt          j                            |j        t          j
        d                     |j        ot          j                            |j                  \  }}|dk    rdt          j
        |          z  nd}t          j                            |j        | |           d S d S t          |t          j        t          j        t          j        f          rLt          j                            |j        d           t          j                            |j        d	           d S t          |t          j                  rP|j        j                                         |j        -|j        j        |j                                                  d S d S d S )
Nfan_outrelu)r   nonlinearityr      )ar   rT   g        )
isinstancer/   r   r   initkaiming_normal_rY   r   _calculate_fan_in_and_fan_outr  sqrtr[   Linearkaiming_uniform_BatchNorm2dr   r   	constant_rU   rZ   normal_padding_idxzero_)r3   modulefan_inrG   bounds        r%   _init_weightszEmu3VQVAE._init_weights  s   fry")455 	?G##FM	PV#WWW{&GAA&-PP	DIf---  ufe<<<<< '& 	** 	?G$$V]dill$CCC{&GAA&-PP	17!DIf----  ufe<<<<< '&  NOO 	?GfmS111Gfk3/////-- 	?M&&(((!-"6#56<<>>>>>	? 	?--r$   c                 $   t                                          |           || _        t          |          | _        t          |          | _        t          |          | _        dt          |j
                  dz
  z  | _        t          |j        |j        dd          | _        t          |j        |j        dd          | _        dt          |j
                  dz
  z  | _        |                                  |                                  d S )Nr   r   )r   r   r   r   r   )r-   r.   r)   r  encoderr$  decoderrR   quantizer   r   vision_spatial_factorr   r  rW   
quant_convpost_quant_convspatial_scale_factoreval	post_initr\   s     r%   r.   zEmu3VQVAE.__init__  s       '//'//088%&3v/H+I+IA+M%N")"F$4)T]
 
 
  /f4)T] 
  
  
 %&#f.G*H*H1*L$M!		r$   image_sizesc                     |j         dk    }|rE j        j        }|j        \  }}}}|                    d                              d|ddd          }n|j        \  }}}}}                     |          }	|	                    ddddd          }	                     |	          }	|	                    ddddd          }	 	                    |	          }
|r|

                    d          n|
} fdt          ||          D             }|S )Nr_   r   r   r   r   c           	          g | ]I\  }}|d t          |d         j        z            d t          |d         j        z            f         JS )Nr   r   )rI   rJ  )r   single_imager   r3   s      r%   r   z$Emu3VQVAE.encode.<locals>.<listcomp>  sm     
 
 
"d D3tAw)CCDDDFqDQRGVZVpLpHqHqFqqr
 
 
r$   )ndimr)   r  rd   	unsqueezerepeatrG  re   rK  rI  squeezer   )r3   r  rP  is_imagerm   rl   rn   ro   rp   r9   codesimage_tokenss   `           r%   encodezEmu3VQVAE.encode  s4   $) 	O{=H2>2D/J&%'11!44;;AxAqQQLL<H<N9J(FE\22 &--aAq!<<66 &--aAq!<<m,,+3>u}}Q'''
 
 
 
&),&D&D
 
 

 r$   r9   c                    |j         dk    }|r|                    d          }|j        \  }}}}| j                            |                                          }|j        d         }|                    |||||                              ddddd                                          }| 	                    |          }	|                    ddddd          }|	                    ddddd          }	| 
                    |	|          }
|
                    ||| j        j        z  | j        j        || j        z  || j        z            }
|r|
d d df         n|
S )Nr   r   r`   r   r_   r   )rT  rU  rd   rI  rX   flattenrg   re   rf   rL  rH  r   r)   r  r   rM  )r3   r9   rX  rl   rm   ro   rp   quantrn   
post_quantvideos              r%   decodezEmu3VQVAE.decode  s_    %* 	7)33A66M.;.A+
Hfe''(=(=(?(?@@;r?

:xIIQQRSUVXY[\^_``kkmm))%00
aAq!,,''1aA66
Z//t{==K$T..D--
 
 '1uQQQT{{E1r$   )r    r!   r"   r   __annotations__base_model_prefixmain_input_name_supports_sdpa_supports_flash_attn_supports_flex_attn_supports_attention_backend_no_split_modulesrE  r.   rJ   rK   r[  ra  rO   rP   s   @r%   r.  r.    s          $$ON"&  ? ? ?*      *5< el    82EL 2 2 2 2 2 2 2 2r$   r.  c                       e Zd ZdZd Zed             Zed             Zed             Zed             Z	ed             Z
ed             Zd	eej                 d
ej        fdZd	ej        d
ej        fdZdS )Emu3ImageVocabularyMappingzM
    A class for mapping discrete image tokens from VQGAN to BPE tokens.
    c                 |    || _         |                    d          | _        |                    d          | _        d S )Nz<|extra_200|>z<image>)	vocab_mapgeteol_token_idimage_token_id)r3   rm  s     r%   r.   z#Emu3ImageVocabularyMapping.__init__  s7    "%MM/::'mmI66r$   c                 b    t          d | j                                        D                       S )Nc                 B    g | ]\  }}|                     d           |S z<|visual token
startswithr   namevals      r%   r   z;Emu3ImageVocabularyMapping.image_tokens.<locals>.<listcomp>   s.    hhhytSdooVfFgFghshhhr$   sortedrm  itemsr3   s    r%   rZ  z'Emu3ImageVocabularyMapping.image_tokens  s-    hhDN,@,@,B,Bhhhiiir$   c                 b    t          d | j                                        D                       S )Nc                 B    g | ]\  }}|                     d           |S rs  rt  rv  s      r%   r   z?Emu3ImageVocabularyMapping.image_tokens_str.<locals>.<listcomp>$  s.    iii	ctWgGhGhitiiir$   ry  r|  s    r%   image_tokens_strz+Emu3ImageVocabularyMapping.image_tokens_str"  s-    iiT^-A-A-C-Ciiijjjr$   c                 *      fd j         D             S )Nc                 V    i | ]%}t          |d d                   j        |         &S )ir   )rI   rm  )r   tokenr3   s     r%   
<dictcomp>z6Emu3ImageVocabularyMapping.img2bpe.<locals>.<dictcomp>(  s2    \\\UE"R%L!!4>%#8\\\r$   )r  r|  s   `r%   img2bpez"Emu3ImageVocabularyMapping.img2bpe&  s     \\\\dF[\\\\r$   c                 H    d | j                                         D             S )Nc                     i | ]\  }}||	S r#   r#   )r   kvs      r%   r  z6Emu3ImageVocabularyMapping.bpe2img.<locals>.<dictcomp>,  s    666A1666r$   )r  r{  r|  s    r%   bpe2imgz"Emu3ImageVocabularyMapping.bpe2img*  s$    66!3!3!5!56666r$   c                     t          j        t          | j                                                  dz   t           j                  }| j                                        D ]
\  }}|||<   |S Nr   dtype)rJ   zerosmaxr  keysrI   r{  r3   mappingr  r  s       r%   bpe2img_mapping_tensorz1Emu3ImageVocabularyMapping.bpe2img_mapping_tensor.  d    +c$,"3"3"5"566:%)LLLL&&(( 	 	DAqGAJJr$   c                     t          j        t          | j                                                  dz   t           j                  }| j                                        D ]
\  }}|||<   |S r  )rJ   r  r  r  r  rI   r{  r  s       r%   img2bpe_mapping_tensorz1Emu3ImageVocabularyMapping.img2bpe_mapping_tensor5  r  r$   	img_batchr@   c                    |j         }t          j        |j        d         dft          j                  | j        z  }| j        |                    d                   }t          j        ||gd          }|                    |          S )Nr   r   r  cpur`   rc   )	devicerJ   onesrd   rI   ro  r  tor)  )r3   r  r  eol_row
img_tokenss        r%   convert_img2bpez*Emu3ImageVocabularyMapping.convert_img2bpe<  sx    !*ioa0!4EIFFFIZZ0e1D1DE
Y
G4"===
}}V$$$r$   c                     |j         }|dd df         }| j        |                    d                   }|                    |          S )N.r`   r  )r  r  r  )r3   r  r  r  s       r%   convert_bpe2imgz*Emu3ImageVocabularyMapping.convert_bpe2imgC  sG    !c3B3h'	0e1D1DE
}}V$$$r$   N)r    r!   r"   rv   r.   r   rZ  r  r  r  r  r  listrJ   rK   r  r  r#   r$   r%   rk  rk    s)        7 7 7
 j j _j k k _k ] ] _] 7 7 _7   _   _%el); % % % % %% %%, % % % % % %r$   rk  c                       e Zd ZdgZdZdZdS )Emu3PreTrainedModelr(   TN)r    r!   r"   ri  rg  rh  r#   r$   r%   r  r  J  s)         "&r$   r  c                   .     e Zd ZeedZdef fdZ xZS )Emu3TextModel)r9   
attentionsr)   c                     t                                                     t          j        fdt	          j                  D                       | _        d S )Nc                 0    g | ]}t          |          S r#   )r(   )r   r*   r)   s     r%   r   z*Emu3TextModel.__init__.<locals>.<listcomp>[  s$    bbbYfi00bbbr$   )r-   r.   r/   r   r   num_hidden_layerslayersr\   s    `r%   r.   zEmu3TextModel.__init__X  sU       mbbbb%H`BaBabbb
 
r$   )	r    r!   r"   r(   r   _can_record_outputsr   r.   rO   rP   s   @r%   r  r  R  sW        )# 

z 
 
 
 
 
 
 
 
 
 
r$   r  c                   4     e Zd ZU eed<    fdZ fdZ xZS )Emu3ForCausalLMr)   c                 r    t                                          |           t          |          | _        d S r,   )r-   r.   r  modelr\   s     r%   r.   zEmu3ForCausalLM.__init__b  s.       "6**


r$   c                  H    t                                                       dS )a  
        Example:

        ```python
        >>> from transformers import Emu3Processor, Emu3ForConditionalGeneration
        >>> import torch
        >>> import requests
        >>> from PIL import Image

        >>> model = Emu3ForCausalLM.from_pretrained("BAAI/Emu3-Chat-hf", dtype=torch.bfloat16)
        >>> processor = Emu3Processor.from_pretrained("BAAI/Emu3-Chat-hf")

        >>> inputs = processor(text=["Can you write me a poem about winter."], return_tensors="pt").to(model.device)

        >>> generated_ids = model.generate(**inputs, max_new_tokens=100, do_sample=False)
        >>> processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        ```N)r-   rH   )super_kwargsr4   s    r%   rH   zEmu3ForCausalLM.forwardf  s    $ 	r$   )r    r!   r"   r   rb  r.   rH   rO   rP   s   @r%   r  r  _  s_         + + + + +        r$   r  c                   &    e Zd ZddiZ fdZd Zd Zd Zd Zde	j
        d	e	j        fd
Zde	j
        d	e	j        fdZe	j        de	j        dedefd            Zde	j        de	j
        de	j
        fdZee	 	 	 	 	 	 	 	 	 ddee	j                 dee	j
                 d	ee	j                 dee	j                 dee	j                 dee         dee	j
                 dee         dee	j                 dee         deeef         fd                        Z xZS )	Emu3Modelztext_model.model
text_modelc                    t                                          |           t                              |j                  | _        t          |j                  | _        t          |j
                  | _        |                                  d S r,   )r-   r.   r  _from_configtext_configr  r.  	vq_configvqmodelrk  vocabulary_mapvocabulary_mappingrO  r\   s     r%   r.   zEmu3Model.__init__~  sp       '44V5GHH !122"<V=R"S"S 	r$   c                 4    | j                                         S r,   )r  get_input_embeddingsr|  s    r%   r  zEmu3Model.get_input_embeddings  s    33555r$   c                 :    | j                             |           d S r,   )r  set_input_embeddingsr3   values     r%   r  zEmu3Model.set_input_embeddings  s    ,,U33333r$   c                     || _         d S r,   r  r3   rH  s     r%   set_decoderzEmu3Model.set_decoder  s    !r$   c                     | j         S r,   r  r|  s    r%   get_decoderzEmu3Model.get_decoder  s
    r$   r  rP  c                       j                             ||          } fd|D             }t          j        |          }|S )a  
        Tokenizes images into discrete tokens with VQGAN module. Converts
        obtained image tokens into BPE tokens and wraps with "boi" and "eoi"
        special tokens.

        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
                The tensors corresponding to the input images.
            image_sizes (`torch.LongTensor` of shape `(batch_size, 2)`):
                The sizes of the images in the batch, being (height, width) for each image.
        c                 h    g | ].}j                             |                                          /S r#   )r  r  r]  )r   tokensr3   s     r%   r   z.Emu3Model.get_image_tokens.<locals>.<listcomp>  s7    uuuY_42BB6JJRRTTuuur$   )r  r[  rJ   r)  )r3   r  rP  image_tokens_listbpe_tokens_list
bpe_tokenss   `     r%   get_image_tokenszEmu3Model.get_image_tokens  sL     !L//kJJuuuuctuuuY//
r$   c                                            ||          } fd|D             }                                  |          }t          j        ||          }|S )a7  
        Tokenizes images into discrete tokens with VQGAN module and embeds
        them with text embeddings layer

        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)):
                The tensors corresponding to the input images.
        c                 Z    g | ]'\  }}|j         j        z  |j         j        z  d z   z  (S r   )r  rJ  )r   ro   rp   r3   s      r%   r   z0Emu3Model.get_image_features.<locals>.<listcomp>  sL     
 
 
 t|99et|Gi>ilm>mn
 
 
r$   )r  r  rJ   split)r3   r  rP  rZ  split_sizesimage_featuress   `     r%   get_image_featureszEmu3Model.get_image_features  sv     ,,\;GG
 
 
 
!,
 
 
 52244\BB^[AAr$   rZ  ro   rp   c                     |ddddf                              d||dz             }| j                            |          }| j                            |          }|S )a  
        Decodes generated image tokens from language model to continuous pixel values
        with VQGAN module via upsampling.

        Args:
            image_tokens (`torch.LongTensor` of shape `(batch_size, num_of_tokens)`):
                The tensors corresponding to the input images.
            height (`int`):
                Height of the generated image before upsampling.
            width (`int`):
                Width of the generated image before upsampling.
        Nr`   r   )rg   r  r  r  ra  )r3   rZ  ro   rp   	sequencesimages         r%   decode_image_tokenszEmu3Model.decode_image_tokens  s`     !CRC(--b&%!)DD	.>>yII##L11r$   	input_idsinputs_embedsr  c                 \   |e| |                                  t          j        | j        j        t          j        |j                            k    }|                    d          }n|| j        j        k    }|                                }|	                    d          
                    |                              |j                  }|j        d         |j        d         z  }||                                         |                                k    rt          d| d|           |S )z
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
        equal to the length of multimodal features. If the lengths are different, an error is raised.
        N)r  r  r`   r   r   z6Image features and image tokens do not match: tokens: z, features )r  rJ   tensorr  rp  longr  allrh   rU  	expand_asr  rd   numel
ValueError)r3   r  r  r  special_image_maskn_image_tokensn_image_featuress          r%   get_placeholder_maskzEmu3Model.get_placeholder_mask  s1    !.2M$2K2K2M2MT4C5:^k^rsss3 3 " "4!7!7!;!;!*d.E.T!T+//11/99"==GGVVYYZgZnoo)/2^5I!5LL+,22448L8L8N8NNNvvvdtvv   "!r$   Nr:   r;   r6   r<   r=   r?   r@   c
           
      T   |du |duz  rt          d          | |                                 |          }|Z|                     ||          }t          j        |d          }|                     |||          }|                    ||          } | j        d||||||	d|
}|S )ap  
        image_sizes (`torch.LongTensor` of shape `(batch_size, 2)`):
            The sizes of the images in the batch, being (height, width) for each image. Image sizes can be obtained using
            [`AutoImageProcessor`]. See [`Emu3ImageProcessor.__call__`] for details ([]`Emu3Processor`] uses
            [`Emu3ImageProcessor`] for processing images).
        NzaYou cannot specify both input_ids and inputs_embeds at the same time, and must specify either oner   rc   )r  r  )r:   r;   r6   r  r<   r=   r#   )r  r  r  rJ   r)  r  masked_scatterr  )r3   r  r  rP  r:   r;   r6   r  r<   r=   r?   image_embedsr  outputss                 r%   rH   zEmu3Model.forward  s    * -t";< 	s    7D5577	BBM#22<MML 9\q999L!%!:!:| "; " " *889K\ZZM "$/ 
)%+')
 
 
 
 r$   )	NNNNNNNNN)r    r!   r"   _checkpoint_conversion_mappingr.   r  r  r  r  rJ   r   rL   r  r  no_gradrI   r  r  r   r   r   rK   r   rM   r   r   r   rN   r	   rH   rO   rP   s   @r%   r  r  {  sG       &8,%G"    6 6 64 4 4" " "  U-> UM]    "u/@ uO_    $ ]0@ # VY    ]$")":?:K"]b]n" " " "0  1548.21537+/59$(59. .E,-. u01. el+	.
 !.. u/0. "%.   12. D>. !!12. +,. 
u,,	-. . . ^ . . . . .r$   r  c                   6    e Zd ZdZdgZddddZ fdZd Zd	 Zd
e	j
        fdZd Zd Zed             Zed             Zed             Zd Zee	 	 	 	 	 	 	 	 	 	 	 d#deej                 deej                 deej                 deej                 deej                 dee         deej                 dee         deej                 deej                 deeej        f         dee         d
ee e!f         fd                         Z"	 	 	 	 	 	 	 d$ fd"	Z# xZ$S )%Emu3ForConditionalGeneration zlm_head.weightzmodel.text_modelzmodel.vqmodellm_head)z^text_model.modelz^vqmodelz^text_model.lm_headc                     t                                          |           t          |          | _        t	          j        |j        j        |j        j        d          | _	        | 
                                 d S )NF)r   )r-   r.   r  r  r/   r;  r  hidden_size
vocab_sizer  rO  r\   s     r%   r.   z%Emu3ForConditionalGeneration.__init__  se       v&&
y!3!?ASA^ejkkkr$   c                 4    | j                                         S r,   )r  r  r|  s    r%   r  z1Emu3ForConditionalGeneration.get_input_embeddings$  s    z..000r$   c                 :    | j                             |           d S r,   )r  r  r  s     r%   r  z1Emu3ForConditionalGeneration.set_input_embeddings'  s    
''.....r$   r@   c                     | j         S r,   )r  r|  s    r%   get_output_embeddingsz2Emu3ForConditionalGeneration.get_output_embeddings*  s
    |r$   c                 :    | j                             |           d S r,   )r  r  r  s     r%   r  z(Emu3ForConditionalGeneration.set_decoder-  s    
w'''''r$   c                 4    | j                                         S r,   )r  r  r|  s    r%   r  z(Emu3ForConditionalGeneration.get_decoder0  s    z%%'''r$   c                     | j         j        S r,   )r  r  r|  s    r%   r  z'Emu3ForConditionalGeneration.text_model4  s    z$$r$   c                     | j         j        S r,   )r  r  r|  s    r%   r  z$Emu3ForConditionalGeneration.vqmodel8  s    z!!r$   c                     | j         j        S r,   )r  r  r|  s    r%   r  z/Emu3ForConditionalGeneration.vocabulary_mapping<  s    z,,r$   c                 &     | j         j        di |S r   )r  r  )r3   r?   s     r%   r  z0Emu3ForConditionalGeneration.decode_image_tokens@  s    -tz-77777r$   Nr   r  r  rP  r:   r;   r6   r  r<   r=   labelslogits_to_keepr?   c                 ^    | j         d|||||||	d|}|d         }t          |t                    rt          | d          n|}|                     |dd|ddf                   }d}|
  | j        d||
| j        j        j        d|}t          |||j
        |j        |j                  S )an  
        image_sizes (`torch.LongTensor` of shape `(batch_size, 2)`):
            The sizes of the images in the batch, being (height, width) for each image. Image sizes can be obtained using
            [`AutoImageProcessor`]. See [`Emu3ImageProcessor.__call__`] for details ([]`Emu3Processor`] uses
            [`Emu3ImageProcessor`] for processing images).
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import Emu3Processor, Emu3ForConditionalGeneration
        >>> import torch
        >>> import requests
        >>> from PIL import Image

        >>> model = Emu3ForConditionalGeneration.from_pretrained("BAAI/Emu3-Chat-hf", dtype=torch.bfloat16)
        >>> processor = Emu3Processor.from_pretrained("BAAI/Emu3-Chat-hf")

        >>> conversation = [
        ...     {
        ...     "role": "system",
        ...     "content": [
        ...         {"type": "text", "text": "You are a helpful assistant."},
        ...         ],
        ...     },
        ...     {
        ...     "role": "user",
        ...     "content": [
        ...         {"type": "image"},
        ...         {"type": "text", "text": "Please describe the image."},
        ...         ],
        ...     },
        ... ]

        >>> prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
        >>> image = Image.open(requests.get("https://www.ilankelman.org/stopsigns/australia.jpg", stream=True).raw)

        >>> inputs = processor(images=[image], text=[prompt], return_tensors="pt").to(model.device, torch.bfloat16)

        >>> generated_ids = model.generate(**inputs, max_new_tokens=100, do_sample=False)
        >>> processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        ```)r  r:   r;   r6   r  r<   r=   r   N)logitsr  r  )lossr  r6   r9   r  r#   )r  r6  rI   slicer  loss_functionr)   r  r  r	   r6   r9   r  )r3   r  r  rP  r:   r;   r6   r  r<   r=   r  r   r?   r  r9   slice_indicesr  r  s                     r%   rH   z$Emu3ForConditionalGeneration.forwardC  s	   | $* 	
)%+')	
 	
 	
 	
  
8B>SV8W8Wk~ot444]kmAAA}aaa,?@AA%4% f9P9[ _e D &#3!/)
 
 
 	
r$   Tc	                 n     t                      j        |f|||||||d|	}
|d         dk    rd |
d<   |
S )N)r6   r:   r  r=   r;   r  r<   r   r  )r-   prepare_inputs_for_generation)r3   r  r6   r:   r  r=   r;   r<   r  r?   model_inputsr4   s              r%   r  z:Emu3ForConditionalGeneration.prepare_inputs_for_generation  sk     =uww<

+)')%%

 

 

 

 !!!+/L(r$   )NNNNNNNNNNr   )NNNNNTN)%r    r!   r"   rc  _tied_weights_keysr  r.   r  r  r/   r   r  r  r  propertyr  r  r  r  r   r   r   rJ   rL   r   rK   r   rM   r   rI   r   r   rN   r	   rH   r  rO   rP   s   @r%   r  r    s       *+/#(& &"    1 1 1/ / /ry    ( ( (( ( ( % % X% " " X" - - X-8 8 8  1548.21537+/59$(59-134X
 X
E,-X
 u01X
 el+	X

 !.X
 u/0X
 "%X
   12X
 D>X
 !!12X
 )*X
 c5</0X
 +,X
 
u,,	-X
 X
 X
 ^ X
z          r$   r  )r  r  r  r  r.  r  )Jr  	functoolsr   typingr   r   rJ   torch.nnr/   torch.nn.functional
functionalr   cache_utilsr   
generationr   modeling_outputsr	   modeling_utilsr
   processing_utilsr   utilsr   r   r   utils.deprecationr   chameleon.modeling_chameleonr   r   llama.modeling_llamar   r   r   r   r   siglip.modeling_siglipr   configuration_emu3r   r   r   
get_loggerr    loggerr   r(   r   rR   rx   rz   r   r   r   r   r   r   r   r   r   r   r   r  r  r$  r.  rk  r  r  r  r  r  __all__r#   r$   r%   <module>r     sQ  "  % % % % % % " " " " " " " "                             ) ) ) ) ) ) 6 6 6 6 6 6 - - - - - - & & & & & & > > > > > > > > > > 0 0 0 0 0 0        w v v v v v v v v v v v v v 4 4 4 4 4 4 K K K K K K K K K K 
	H	%	%	 	 	 	 	N 	 	 	
$ $ $ $ $( $ $ $N$ $ $ $ $ry $ $ $D	 	 	 	 	%H 	 	 	    29       bi   :! ! ! ! !29 ! ! !H    	   .    ")   &.( .( .( .( .(29 .( .( .(b<( <( <( <( <(29 <( <( <(~& & & & &o & & &V V V V V V V V    29   D8 8 8 8 8 8 8 8v7 7 7 7 7ry 7 7 7tC C C C Cry C C CLC C C C Cry C C CL   l2 l2 l2 l2 l2 l2 l2 l2^3% 3% 3% 3% 3% 3% 3% 3%l' ' ' ' '2I ' ' '

 

 

 

 

J 3 

 

 

    &(;_   8V V V V V# V V Vrh h h h h#6 h h hV  r$   