
     `iK                     6   d dl mZmZmZ d dlZd dlmZ ddlmZ ddlm	Z	m
Z
mZ ddlmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZmZmZmZ ddlmZmZ ddlm Z m!Z! ddl"m#Z# ddl$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z* ddl+m,Z, ddl-m.Z.m/Z/m0Z0 ddl1m2Z2  e(            rddl3m4Z4  e*j5        e6          Z7e& G d de!                      Z8 G d dej9                  Z: G d dej9                  Z; ed           G d dej9                              Z< G d d ej9                  Z=d! Z>dEd"Z?d#ej@        d$eAd%ej@        fd&ZB	 dFd(ej9        d)ej@        d*ej@        d+ej@        d,eej@                 d-eCd.eCd/e#e%         fd0ZD G d1 d2ej9                  ZE G d3 d4ej9                  ZF G d5 d6e          ZG G d7 d8e8          ZH G d9 d:e          ZI G d; d<e8          ZJ e&d=>           G d? d@e8                      ZK e&dA>           G dB dCe8e2                      ZLg dDZMdS )G    )CallableOptionalUnionN)nn   )ACT2FN)CacheDynamicCacheEncoderDecoderCache)use_kernel_forward_from_hub)create_causal_mask)_prepare_4d_attention_mask#_prepare_4d_attention_mask_for_sdpa)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutput)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tupleis_torch_flex_attn_availableis_torchdynamo_compilinglogging)deprecate_kwarg   )	DiaConfigDiaDecoderConfigDiaEncoderConfig)DiaGenerationMixin)make_flex_block_causal_maskc                   >    e Zd ZU eed<   dZdZdZdZdZ	dZ
dZddgZdS )DiaPreTrainedModelconfigmodelT	input_idsDiaEncoderLayerDiaDecoderLayerN)__name__
__module____qualname__r#   __annotations__base_model_prefixsupports_gradient_checkpointing_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraphmain_input_name_no_split_modules     x/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/dia/modeling_dia.pyr)   r)   ?   sR         &*#N!!O*,=>r<   r)   c                   L     e Zd ZdZdef fdZdej        dej        fdZ xZ	S )DiaMultiChannelEmbeddinga  In order to efficiently compute the audio embedding from the 9 different channels,
    we vectorize the embedding process by using a single embedding layer and an offset.
    Example:
    - num_embeds = 4
    - vocab_size = 8
    - num_channels = 3
    We would have offsets = [0, 8, 16]
    If audio_codes = [0, 1, 2, 3], [1, 3, 4, 7], [5, 6, 7, 8],
    then tokens = audio_codes + offsets
                = [0, 1, 2, 3, 9, 11, 12, 15, 21, 22, 23, 24]
    This allows us to use a single embedding layer for all channels.
    r*   c                 Z   t                                                       t          j        |j        |j        z  |j                  | _        |j        | _        |j        | _        t          j	        |j        t          j
                  |j        z  }|                     d|d           d S )NdtypeoffsetsF
persistent)super__init__r   	Embedding
vocab_sizenum_channelshidden_sizeembedtorcharangelongregister_buffer)selfr*   rC   	__class__s      r=   rG   z!DiaMultiChannelEmbedding.__init__Z   s    \&"3f6I"I6K]^^
!-"/,v2%*EEEHYYYEBBBBBr<   audio_codesreturnc                 $   || j                             |j                  z                       d          }|                     |                              |j        d         |j        d         d| j                  }|                    d          S )Nr"   r      dim)	rC   todevicesqueezerL   viewshaperK   sum)rQ   rS   tokensembedss       r=   forwardz DiaMultiChannelEmbedding.forwardb   sw    0B C CCLLQOOF##((a+:KA:NPRTXTdeezzaz   r<   )
r/   r0   r1   __doc__r$   rG   rM   Tensorrb   __classcell__rR   s   @r=   r?   r?   L   s|         C/ C C C C C C!5< !EL ! ! ! ! ! ! ! !r<   r?   c                   B     e Zd Z fdZdej        dej        fdZ xZS )DiaMLPc                 "   t                                                       || _        t          j        |j        d|j        z  d          | _        t          j        |j        |j        d          | _        t          |j
                 | _        d S )NrW   Fbias)rF   rG   r*   r   LinearrK   intermediate_sizegate_up_proj	down_projr   
hidden_actactivation_fnrQ   r*   rR   s     r=   rG   zDiaMLP.__init__i   sz    If&8!f>V:V]bccc6#;V=OV[\\\#F$56r<   hidden_statesrT   c                     |                      |          }|                    dd          \  }}||                     |          z  }|                     |          S )NrW   rV   rX   )rn   chunkrq   ro   )rQ   rs   	up_statesgates       r=   rb   zDiaMLP.forwardq   sX    %%m44	#//!/44i 2 24 8 88	~~i(((r<   )r/   r0   r1   rG   rM   FloatTensorrb   re   rf   s   @r=   rh   rh   h   s`        7 7 7 7 7)U%6 )5;L ) ) ) ) ) ) ) )r<   rh   RMSNormc                   ,     e Zd Zd fd	Zd Zd Z xZS )
DiaRMSNormư>c                     t                                                       t          j        t	          j        |                    | _        || _        dS )z9
        DiaRMSNorm is equivalent to T5LayerNorm
        N)rF   rG   r   	ParameterrM   onesweightvariance_epsilon)rQ   rK   epsrR   s      r=   rG   zDiaRMSNorm.__init__|   sD     	l5:k#:#:;; #r<   c                    |j         }|                    t          j                  }|                    d                              dd          }|t          j        || j        z             z  }| j        |                    |          z  S )NrW   rV   T)keepdim)	rB   rZ   rM   float32powmeanrsqrtr   r   )rQ   rs   input_dtypevariances       r=   rb   zDiaRMSNorm.forward   s|    #)%((77 $$Q'',,R,>>%Ht?T4T(U(UU{]--k::::r<   c                 H    t          | j        j                   d| j         S )Nz, eps=)tupler   r^   r   rQ   s    r=   
extra_reprzDiaRMSNorm.extra_repr   s&    )**II$2GIIIr<   )r|   )r/   r0   r1   rG   rb   r   re   rf   s   @r=   r{   r{   z   sb        $ $ $ $ $ $; ; ;J J J J J J Jr<   r{   c                   |     e Zd ZU ej        ed<   ddef fdZ ej                    e	d                         Z
 xZS )DiaRotaryEmbeddinginv_freqNr*   c                    t                                                       t          |d          rSt          |j        t
                    r9|j                            d|j                            d                    | _        nd| _        |j        | _	        |j        | _
        || _        t          | j                 | _        |                     | j        |          \  }| _        |                     d|d           | j        | _        d S )Nrope_scaling	rope_typetypedefaultr   FrD   )rF   rG   hasattr
isinstancer   dictgetr   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenr*   r   rope_init_fnattention_scalingrP   r   original_inv_freq)rQ   r*   r[   r   rR   s       r=   rG   zDiaRotaryEmbedding.__init__   s    6>** 	'z&:Mt/T/T 	'#044[&BUBYBYZ`BaBabbDNN&DN"("@$*$B!/?+/+<+<T[&+Q+Q($(ZeDDD!%r<   c                 X   | j         d d d d f                                                             |j        d         dd                              |j                  }|d d d d d f                                         }t          |j        j        t                    r|j        j        dk    r|j        j        nd}t          j
        |d          5  |                                |                                z                      dd          }t          j        ||fd	          }|                                | j        z  }|                                | j        z  }	d d d            n# 1 swxY w Y   |                    |j        
          |	                    |j        
          fS )Nr   rV   r"   mpscpuF)device_typeenabledrW   rX   rA   )r   floatexpandr^   rZ   r[   r   r   strrM   autocast	transposecatcosr   sinrB   )
rQ   xposition_idsinv_freq_expandedposition_ids_expandedr   freqsembr   r   s
             r=   rb   zDiaRotaryEmbedding.forward   s    !M$4-8>>@@GGHZ[\H]_acdeehhijiqrr ,QQQaaaZ 8 > > @ @'1!(-'E'Ek!(-[`J`J`ahmmfk^UCCC 	5 	5&,,..1F1L1L1N1NNYYZ[]^__E)UEN333C''))d44C''))d44C		5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 vvAGv$$cff17f&;&;;;s   BE++E/2E/N)r/   r0   r1   rM   rd   r2   r#   rG   no_gradr   rb   re   rf   s   @r=   r   r      s         l/ /y / / / / / /" U]__< <  _< < < < <r<   r   c                     | dd| j         d         dz  f         }| d| j         d         dz  df         }t          j        | |fd          S )z*Rotates half the hidden dims of the input..NrV   rW   rX   )r^   rM   r   )r   x1x2s      r=   rotate_halfr      s]    	
3"!'"+"""	#B	
3q """	#B9rc2YB''''r<   c                     |                     |          }|                     |          }| |z  t          |           |z  z   }||z  t          |          |z  z   }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )	unsqueezer   )qkr   r   r   unsqueeze_dimq_embedk_embeds           r=   apply_rotary_pos_embr      sc    ( --
&
&C
--
&
&C3w;q>>C/0G3w;q>>C/0GGr<   rs   n_reprT   c                     | j         \  }}}}|dk    r| S | dddddddddf                             |||||          } |                     |||z  ||          S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r"   N)r^   r   reshape)rs   r   batchnum_key_value_headsslenhead_dims         r=   	repeat_kvr      s    
 2?1D.Ehzz!!!!QQQaaa"23::5BUW\^bdlmmM  (;e(CT8TTTr<           modulequerykeyvalueattention_maskscalingdropoutkwargsc                 R   t          || j                  }t          || j                  }	t          j        ||                    dd                    |z  }
|$|d d d d d d d |j        d         f         }|
|z   }
t          j                            |
dt          j	                  
                    |j                  }
t          j                            |
|| j                  }
t          j        |
|	          }|                    dd                                          }||
fS )NrW   r   rV   )rY   rB   )ptrainingr"   )r   num_key_value_groupsrM   matmulr   r^   r   
functionalsoftmaxr   rZ   rB   r   r   
contiguous)r   r   r   r   r   r   r   r   
key_statesvalue_statesattn_weightscausal_maskattn_outputs                r=   eager_attention_forwardr      s    3 ;<<JUF$?@@L<z';';Aq'A'ABBWLL!$QQQ111.D
0@0D.D%DE#k1=((2U](SSVVW\WbccL=((6?([[L,|\::K''1--88::K$$r<   c                   *    e Zd ZdZddeeef         dedef fdZ	 e
ddd	
          	 	 ddej        deej        ej        f         deej                 dee         deej                 dee         deej        ej        f         fd            Z xZS )DiaSelfAttention=Multi-headed attention from 'Attention Is All You Need' paperFr*   	layer_idx	is_causalc                    t                                                       || _        || _        |j        | _        | j        j        | _        | j        j        p| j        | _        | j        | j        z  | _        t          |d|j        | j        z            | _
        d| _        d| _        || _        t          j        | j        | j        | j
        z  d          | _        t          j        | j        | j        | j
        z  d          | _        t          j        | j        | j        | j
        z  d          | _        t          j        | j        | j
        z  | j        d          | _        d S )Nr   r"   r   Frj   )rF   rG   r*   r   rK   num_attention_heads	num_headsr   r   getattrr   r   attention_dropoutr   r   rl   q_projk_projv_projo_proj)rQ   r*   r   r   rR   s       r=   rG   zDiaSelfAttention.__init__   s1   "!-8#';#B#Tdn $(Nd6N$N!
F4F$.4XYY!$"i 0$.4=2PW\]]]i 0$2JT]2Zafgggi 0$2JT]2Zafgggi >@PW\]]]r<   past_key_valuepast_key_valuesz4.58)new_nameversionNrs   position_embeddingsr   cache_positionr   rT   c                 D   |j         d d         }g |d| j        R }|                     |                              |                              dd          }	|                     |                              |                              dd          }
|                     |                              |                              dd          }|\  }}t          |	|
||          \  }	}
|&|||d}|                    |
|| j	        |          \  }
}t          }| j        j        dk    rt          | j        j                 } || |	|
||f| j        sdn| j        | j        d|\  }} |j        g |dR                                  }|                     |          }||fS )NrV   r"   rW   )r   r   r   eagerr   )r   r   )r^   r   r   r]   r   r   r   r   updater   r   r*   _attn_implementationr   r   r   r   r   r   r   )rQ   rs   r   r   r   r   r   input_shapehidden_shapequery_statesr   r   r   r   cache_kwargsattention_interfacer   r   s                     r=   rb   zDiaSelfAttention.forward  s    $)#2#.88b8$-88{{=1166|DDNNqRSTT[[//44\BBLLQPQRR
{{=1166|DDNNqRSTT&S#7jRUWZ#[#[ j&#&snUUL'6'='=j,X\Xfht'u'u$J(?;+w66"9$+:Z"[$7$7	%
  $}HCC$2HL	%
 	%
 	%
 	%
!\ *k);;;;;;FFHHkk+..L((r<   )FNN)r/   r0   r1   rc   r   r%   r$   intboolrG   r!   rM   rd   r   r   r	   
LongTensorr   r   rb   re   rf   s   @r=   r   r      s2       GG^ ^u%57G%GH ^UX ^ei ^ ^ ^ ^ ^ ^$ _%0A6RRR ,059)) ))|)) #5<#=>)) !.	))
 "%)) !!12)) +,)) 
u|U\)	*)) )) )) SR)) )) )) )) ))r<   r   c                        e Zd ZdZdedef fdZ	 	 ddej        dej        de	ej                 d	e	e
         d
ee         deej        e	ej                 f         fdZ xZS )DiaCrossAttentionr   r*   r   c                    t                                                       || _        || _        |j        | _        |j        | _        | j        j        | _        | j        j        | _	        | j        | j	        z  | _
        |j        | _        d| _        d| _        d| _        t!          j        | j        | j        | j        z  d          | _        t!          j        | j        | j	        | j        z  d          | _        t!          j        | j        | j	        | j        z  d          | _        t!          j        | j        | j        z  | j        d          | _        d S )Nr"   r   Frj   )rF   rG   r*   r   rK   cross_hidden_sizecross_num_attention_headsr   cross_num_key_value_headsr   r   cross_head_dimr   r   r   r   r   rl   r   r   r   r   rQ   r*   r   rR   s      r=   rG   zDiaCrossAttention.__init__@  s&   "!-!'!9>#';#H $(Nd6N$N!-!$i 0$.4=2PW\]]]i 68PSWS`8`glmmmi 68PSWS`8`glmmmi >@PW\]]]r<   Nrs   cross_attention_statesr   r   r   rT   c                    |j         d d         }g |d| j        R }g |j         d d         d| j        R }|                     |                              |                              dd          }	||j                            | j                  nd}
|;|
r9|j        j	        | j                 j
        }|j        j	        | j                 j        }n|                     |                              |                              dd          }|                     |                              |                              dd          }|3|j                            ||| j                  \  }}d|j        | j        <   t          }| j        j        dk    rt$          | j        j                 } || |	|||fd| j        i|\  }}|                    g |dR                                           }|                     |          }||fS )NrV   r"   rW   FTr   r   )r^   r   r   r]   r   
is_updatedr   r   cross_attention_cachelayerskeysvaluesr   r   r   r   r*   r   r   r   r   r   r   )rQ   rs   r  r   r   r   r   r   cross_shaper   r  r   r   r   r   r   s                   r=   rb   zDiaCrossAttention.forwardS  s4    $)#2#.88b8$-88M.4SbS9M2Mt}MM{{=1166|DDNNqRSTTGVGb_/33DNCCChm
&:&(>EdnUZJ*@GW^LL%;<<AA+NNXXYZ\]^^J;;'=>>CCKPPZZ[\^_``L*+:+P+W+W N, ,(
L >B*4>:(?;+w66"9$+:Z"[$7$7%
 %
 L%
 %
 %
!\ "))*<K*<*<*<==HHJJkk+..L((r<   r   )r/   r0   r1   rc   r$   r   rG   rM   rd   r   r   r   r   r   rb   re   rf   s   @r=   r  r  =  s        GG^/ ^C ^ ^ ^ ^ ^ ^. 269=1) 1)|1) !&1) !.	1)
 ""561) -.1) 
u|Xel33	41) 1) 1) 1) 1) 1) 1) 1)r<   r  c                        e Zd Zdedef fdZ	 	 ddej        dee	ej        ej        f                  deej                 de
e         d	e	ej        eej                 f         f
d
Z xZS )r-   r*   r   c                    t                                                       t          |j        |j                  | _        t          ||d          | _        t          |j        |j                  | _        t          |          | _
        d S )Nr   Fr   )rF   rG   r{   rK   norm_epspre_sa_normr   self_attentionpost_sa_normrh   mlpr
  s      r=   rG   zDiaEncoderLayer.__init__  su    %f&8foNNN.vyERRR&v'9vOOO&>>r<   Nrs   r   r   r   rT   c                     |}|                      |          } | j        |f||d|\  }}||z   }|}|                     |          }|                     |          }	||	z   }||fS )Nr   r   )r  r  r  r  )
rQ   rs   r   r   r   residualnormed_statesself_attn_outputself_attn_weightsmlp_outs
             r=   rb   zDiaEncoderLayer.forward  s     !((77.Ad.A/
 3)/
 /
 	/
 /
++ !#33 ))-88((=)) 7*///r<   r   )r/   r0   r1   r%   r   rG   rM   rd   r   r   r   r   rb   re   rf   s   @r=   r-   r-     s        "/ "C " " " " " " LP15	0 0|0 &eEL%,,F&GH0 !.	0
 -.0 
u|Xel33	40 0 0 0 0 0 0 0r<   r-   c                        e Zd Zdef fdZee	 	 	 ddej        de	ej                 de	e
         de	e
         d	ee         d
eeef         fd                        Zdeej        df         dej        fdZ xZS )
DiaEncoderr*   c                 x   t                                                     | _        t          j        j        j                  | _        t          j        fdt          j
                  D                       | _        t          j        j                  | _        t                    | _        d S )Nc                 0    g | ]}t          |          S r;   )r-   .0r   r*   s     r=   
<listcomp>z'DiaEncoder.__init__.<locals>.<listcomp>  #    aaaI_VY//aaar<   r  )rF   rG   r*   r   rH   rI   rK   	embedding
ModuleListrangenum_hidden_layersr  r{   r  normr   rotary_embeddingsrr   s    `r=   rG   zDiaEncoder.__init__  s       f&79KLLmaaaavG_A`A`aaa
 
 v1vGGG	!3F!;!;r<   NFr,   r   output_attentionsoutput_hidden_statesr   rT   c                    |                      |          }t          j        |j        d         |j                  d d d f         }|                     ||          }|                     ||          }|rdnd }	|rdnd }
| j        D ],}|r|	|fz   }	 ||f||d|}|d         }|r|
|d         fz   }
-|                     |          }|r|	|fz  }	t          ||	|
          S )NrV   r[   r;   r  r   r"   last_hidden_staters   
attentions)
r+  rM   rN   r^   r[   r0  _update_full_maskr  r/  r   )rQ   r,   r   r1  r2  r   rs   r   r   encoder_statesall_attentionsencoder_layerlayer_outputss                r=   rb   zDiaEncoder.forward  sZ    y11
 |IOB$7	@PQQQRVXYXYXYRYZ"44]LQQ//
 

  4=0:d![ 	F 	FM# C!/=2B!B)M$7-  	 M *!,M  F!/=3C2E!E		-00 	/}..N+>Vd
 
 
 	
r<   inputs_embedsc                 *   || j         j        dk    r	d|v r|nd }nw| j         j        dk    rt          ||j                  }nQ| j         j        dk    r,t	          |t
          j                  rt          |d          }nt          ||j                  }|S )Nflash_attention_2r   sdpaflex_attentionFr  	r*   r   r   rB   r   rM   rd   r'   r   )rQ   r   r=  s      r=   r8  zDiaEncoder._update_full_mask  s    
 %{/3FFF343F3FD1V;; "E^UbUh!i!i15EEEnel;; b%@[`%a%a%aN "<NML_!`!`r<   )NFF)r/   r0   r1   r%   rG   r   r   rM   rd   r   r  r   r   r   r   r   rb   r8  re   rf   s   @r=   r$  r$    s       	</ 	< 	< 	< 	< 	< 	<  26,1/4.
 .
<.
 !..
 $D>	.

 'tn.
 -..
 
%	&.
 .
 .
  ^.
belD01 |       r<   r$  c                   D    e Zd Zdedef fdZ	 	 	 	 	 	 ddej        dee	ej        ej        f                  deej                 deej                 d	eej                 d
ee
         deej                 de	ej        eej                 eej                 f         fdZ xZS )r.   r*   r   c                    t                                                       |j        | _        t	          ||d          | _        t          ||          | _        t          |j        |j	                  | _
        t          |j        |j	                  | _        t          |j        |j	                  | _        t          |          | _        d S )NTr  r  )rF   rG   rK   	embed_dimr   r  r  cross_attentionr{   r  r  pre_ca_normpre_mlp_normrh   r  r
  s      r=   rG   zDiaDecoderLayer.__init__  s    +.vyDQQQ0CC%f&8foNNN%f&8foNNN&v'9vOOO&>>r<   Nrs   r   r   encoder_hidden_statesencoder_attention_maskr   r   rT   c                 p   |}	t          |	t                    r|	j        }	|}
|                     |          } | j        ||||	fd|i|\  }}|
|z   }|}
|                     |          } | j        ||f||d|\  }}|
|z   }|}
|                     |          }|                     |          }|
|z   }|||fS )Nr   )r   r   )	r   r   self_attention_cacher  r  rG  rF  rH  r  )rQ   rs   r   r   rI  rJ  r   r   r   self_attn_cacher  r  r   r!  cross_statescross_attn_weightsr"  s                    r=   rb   zDiaDecoderLayer.forward	  s.    *o':;; 	C-BO ((77.Ad.A 	/
 	/
 *	/
 	/
 	/
++ !#33 ((77+?4+?!,
 2+	,
 ,

 ,
 ,
(( !</ ))-88((=)) 7*/1CCCr<   )NNNNNN)r/   r0   r1   r$   r   rG   rM   rd   r   r   r   r  rb   re   rf   s   @r=   r.   r.     s.       "/ "C " " " " " " LP158<9=9=59-D -D|-D &eEL%,,F&GH-D !.	-D
  (5-D !) 6-D ""56-D !!12-D 
u|Xel3Xel5KK	L-D -D -D -D -D -D -D -Dr<   r.   c                       e Zd ZdZdef fdZee	 	 	 	 	 	 	 	 ddej	        de
ej                 de
ej	                 d	e
ej                 d
e
ej                 de
e         de
e         de
e         de
ej                 deeef         fd                        Zd	eej	        df         d
eej	        df         dej        dej	        fdZ xZS )
DiaDecoderz-Transformer Decoder Stack using DenseGeneral.r*   c                 z   t                                                     j        | _        j        | _        t	                    | _        t                    | _        t          j	        fdt          j                  D                       | _        t          j        j                  | _        d S )Nc                 0    g | ]}t          |          S r;   )r.   r'  s     r=   r)  z'DiaDecoder.__init__.<locals>.<listcomp>C  r*  r<   r  )rF   rG   rJ   rI   r?   
embeddingsr   r0  r   r,  r-  r.  r  r{   rK   r  r/  rr   s    `r=   rG   zDiaDecoder.__init__<  s       "/ +26::!3F!;!;maaaavG_A`A`aaa
 
 v1vGGG			r<   NFr,   r   r   rI  rJ  r   r1  r2  r   rT   c
                    |                                 dd         \  }}||                                nd}|	t          j        |||z   |j                  }	||	dddf         }|                     |          }|                     ||          }|/t                      s!||z   }t          j        |||j                  }t          | j
        |||	||          }|                     |||j        dd         |          }|rdnd}|rdnd}|r|dnd}| j        D ]>}|r||fz  } |||||f|||	d|
}|d         }|r||d	         fz   }|||d         fz   }?|                     |          }|r||fz  }t          |||||
          S )a  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length, num_codebooks)`):
            The original `decoder_input_ids` in 3D shape to facilitate more efficient computations.

            [What are input IDs?](../glossary#input-ids)
        NrV   r   r4  )r*   input_embedsr   r   r   r   rW   r;   )rJ  r   r   r"   )r6  r   rs   r7  cross_attentions)sizeget_seq_lengthrM   rN   r[   rT  r0  r   r   r   r*   _update_cross_attn_maskr^   r  r/  r   )rQ   r,   r   r   rI  rJ  r   r1  r2  r   r   
batch_size
seq_lengthpast_key_values_lengthrs   r   mask_seq_lengthall_hidden_statesall_self_attnsall_cross_attentionslayerr<  s                         r=   rb   zDiaDecoder.forwardG  sd   , "+!1!1#2#!6
JETE`!?!?!A!A!Afg!"\&(>(KT]Td  N )$'2L 	22"44]LQQ!*B*D*D!4zAO"Z
OIL\]]]N+;&))+%
 
 
 "&!=!=!"#	"
 "
 #7@BBD0:d&7h<Q<]rrdh[ 	V 	VE# 6!m%55!!E#%		
 (> /-	 	 	 	M *!,M  V!/=3C2E!E(4+?=QRCSBU+U(		-00 	2-!118+++%1
 
 
 	
r<   r   r=  c                 \   ||| j         j        dk    r	d|v r|nd }n| j         j        dk    rt          ||j        |d                   }n`| j         j        dk    r3t	          |t
          j                  rt          ||d         d          }nt          ||j        |d                   }|S )	Nr?  r   r@  rV   )tgt_lenrA  F)query_lengthr   rB  )rQ   rI  rJ  r   r=  s        r=   rZ  z"DiaDecoder._update_cross_attn_mask  s     !,1G1S{/3FFFCDH^C^C^)?)?dh&&1V;; *M*!''O* * *&&
 15EEE4elCC -H.%0_"'. . .* *D*M,?UW* * *& &%r<   )NNNNNFFN)r/   r0   r1   rc   r$   rG   r   r   rM   rd   r   r  rx   r   r  r   r   r   rb   SizerZ  re   rf   s   @r=   rQ  rQ  9  s       77	H/ 	H 	H 	H 	H 	H 	H  4815=A=A9=,1/459Z
 Z
<Z
 u/0Z
 !.	Z

  ((9:Z
 !))9 :Z
 ""56Z
 $D>Z
 'tnZ
 !!12Z
 
8%?	@Z
 Z
 Z
  ^Z
z!&$U\4%78!& !&elD&8 9!& Z	!&
 |!& !& !& !& !& !& !& !&r<   rQ  z[
    The bare Dia model outputting raw hidden-states without any specific head on top.
    )custom_introc                   z    e Zd Zdef fdZd Zee	 	 	 	 	 	 	 	 	 	 	 ddee	j
                 dee	j
                 dee	j
                 dee	j
                 d	ee	j
                 d
eeeef                  dee         dee         dee         dee         dee	j
                 deeef         fd                        Z xZS )DiaModelr*   c                     t                                          |           || _        t          |j                  | _        t          |j                  | _        | 	                                 d S r   )
rF   rG   r*   r$  encoder_configencoderrQ  decoder_configdecoder	post_initrr   s     r=   rG   zDiaModel.__init__  s\       !&"788!&"788r<   c                     | j         S r   )rl  r   s    r=   get_encoderzDiaModel.get_encoder  s
    |r<   Nr,   r   decoder_input_idsdecoder_position_idsdecoder_attention_maskencoder_outputsr   	use_cacher1  r2  r   rT   c                    ||t          d          |	|	n| j        j        }	|
|
n| j        j        }
||n| j        j        }| j        r%| j        r|rt                              d           d}|r8|6t          t          | j                  t          | j                            }| | j        d|||	|
d|}nct          |t                    sNt          |d         t          |          dk    r|d         ndt          |          d	k    r|d	         nd
          }|d         j        d         d| j        j        j        }}}|)t%          j        |d|f| j        j        | j                  }|j        d	k    r+|                    |||                              dd	          } | j        d||||d         |||	|
||d
|}t5          |j        |j        |j        |j        |j        |d         |j        |j                  S )a\  
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size * num_codebooks, target_sequence_length)
        or (batch_size, target_sequence_length, num_codebooks)`, *optional*):
            1. (batch_size * num_codebooks, target_sequence_length): corresponds to the general use case where
            the audio input codebooks are flattened into the batch dimension. This also aligns with the flat-
            tened audio logits which are used to calculate the loss.

            2. (batch_size, sequence_length, num_codebooks): corresponds to the internally used shape of
            Dia to calculate embeddings and subsequent steps more efficiently.

            If no `decoder_input_ids` are provided, it will create a tensor of `bos_token_id` with shape
            `(batch_size, 1, num_codebooks)`. Indices can be obtained using the [`DiaProcessor`]. See
            [`DiaProcessor.__call__`] for more details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`):
            Indices of positions of each input sequence tokens in the position embeddings.
            Used to calculate the position embeddings up to `config.decoder_config.max_position_embeddings`.

            [What are position IDs?](../glossary#position-ids)
        NzXYou should either provide text ids or the cached text encodings. Neither has been found.zZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...F)r*   )r,   r   r1  r2  r   r"   rW   r5  rV   )rX  
fill_valuer[   )
r,   r   r   rI  rJ  r   r1  r2  rv  r   )r6  r   decoder_hidden_statesdecoder_attentionsrW  encoder_last_hidden_staterI  encoder_attentionsr;   ) 
ValueErrorr*   r1  r2  rv  is_gradient_checkpointingr   loggerwarning_oncer   r
   rl  r   r   lenr^   rm  rJ   rM   fullbos_token_idr[   ndimr   r   rn  r   r6  r   rs   r7  rW  )rQ   r,   r   rr  rs  rt  ru  r   rv  r1  r2  r   r   bszseq_lenchannelsdecoder_outputss                    r=   rb   zDiaModel.forward  s   N !8j   2C1N--TXT_Tq$8$D  $+Jj 	 "+!6IIDK<Q	) 	"dm 	" "##p   "	 	v01,dk2R2R2RT`hlhsTtTtTtuuO"*dl #-"3%9	 
  OO O_== 	-"1!"4474H4H14L4Loa00RV14_1E1E1I1I?1--t  O #2!"4":1"=r4;C]CjhW$ %
1h'DK4LUYU`! ! ! !Q&& 1 9 9#x Q Q [ [\]_` a a&$, 
'-1"1!"4#1+/!5)
 
 
 
 "-?+;"1"?.9,=&5a&8"1"?.9	
 	
 	
 		
r<   )NNNNNNNNNNN)r/   r0   r1   r#   rG   rq  r   r   r   rM   r  r   r   r   r   r  r   rb   re   rf   s   @r=   ri  ri    s       y          15598<;?=ACG9=$(,0/359k
 k
E,-k
 !!12k
 $E$45	k

 'u'78k
 !))9 :k
 "%(>"?@k
 ""56k
 D>k
 $D>k
 'tnk
 !!12k
 
u((	)k
 k
 k
  ^k
 k
 k
 k
 k
r<   ri  zl
    The Dia model consisting of a (byte) text encoder and audio decoder with a prediction head on top.
    c                       e Zd ZdZdef fdZd Zd Zee		 	 	 	 	 	 	 	 	 	 	 	 dde
ej                 de
ej                 d	e
ej                 d
e
ej                 de
ej                 de
eeef                  de
e         de
e         de
e         de
e         de
ej                 de
ej                 deeef         fd                        Z xZS )DiaForConditionalGenerationr+   r*   c                 `   t                                          |           || _        t          |          | _        |j        j        | _        |j        j        | _        t          j	        |j        j
        | j        | j        z  d          | _        d| _        |                                  d S )NFrj   ForMaskedLM)rF   rG   r*   ri  r+   rm  rJ   rI   r   rl   rK   logits_dense	loss_typero  rr   s     r=   rG   z$DiaForConditionalGeneration.__init__R  s       f%%
"1> /:I!-0ADO0S[`
 
 
 ' 	r<   c                 4    | j                                         S r   )r+   rq  r   s    r=   rq  z'DiaForConditionalGeneration.get_encodera      z%%'''r<   c                 4    | j                                         S r   )r+   get_decoderr   s    r=   r  z'DiaForConditionalGeneration.get_decoderd  r  r<   Nr,   r   rr  rs  rt  ru  r   rv  r1  r2  labelsr   rT   c                     | j         d	|||||||||	|
|d|}|d         }|j        d         }|                     |                              |d| j        | j        f                              dd                                                              || j        z  d| j                  }d}| | j        d	||| j        d|}t          |||j
        |j        |j        |j        |j        |j        |j        	  	        S )
a  
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size * num_codebooks, target_sequence_length)
        or (batch_size, target_sequence_length, num_codebooks)`, *optional*):
            1. (batch_size * num_codebooks, target_sequence_length): corresponds to the general use case where
            the audio input codebooks are flattened into the batch dimension. This also aligns with the flat-
            tened audio logits which are used to calculate the loss.

            2. (batch_size, sequence_length, num_codebooks): corresponds to the internally used shape of
            Dia to calculate embeddings and subsequent steps more efficiently.

            If no `decoder_input_ids` are provided, it will create a tensor of `bos_token_id` with shape
            `(batch_size, 1, num_codebooks)`. Indices can be obtained using the [`DiaProcessor`]. See
            [`DiaProcessor.__call__`] for more details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`):
            Indices of positions of each input sequence tokens in the position embeddings.
            Used to calculate the position embeddings up to `config.decoder_config.max_position_embeddings`.

            [What are position IDs?](../glossary#position-ids)
        labels (`torch.LongTensor` of shape `(batch_size * num_codebooks,)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in
            `[0, ..., config.decoder_config.vocab_size - 1]` or -100. Tokens with indices set to `-100`
            are ignored (masked).
        )r,   r   rr  rs  rt  ru  r   rv  r1  r2  r   r   rV   r"   rW   N)logitsr  rI   )	lossr  r   ry  rz  rW  r{  rI  r|  r;   )r+   r^   r  r]   rJ   rI   r   r   loss_functionr   r   ry  rz  rW  r{  rI  r|  )rQ   r,   r   rr  rs  rt  ru  r   rv  r1  r2  r  r   r   outputsr6  r[  audio_logitsr  s                      r=   rb   z#DiaForConditionalGeneration.forwardg  sE   X $* 
)/!5#9++/!5)
 
 
 
 $AJ&,Q/
 /00T:r4#4doFGGYq!__Z\\T*t00"doFF 	 %4%o\&UYUdoohnooD#3")"?&9$5&-&G")"?&9

 

 

 
	
r<   )NNNNNNNNNNNN)r/   r0   r1   r3   r#   rG   rq  r  r   r   r   rM   r  r   r   r   r   r  r   rb   re   rf   s   @r=   r  r  J  s         y      ( ( (( ( (  15598<;?=ACG9=$(,0/3-159R
 R
E,-R
 !!12R
 $E$45	R

 'u'78R
 !))9 :R
 "%(>"?@R
 ""56R
 D>R
 $D>R
 'tnR
 )*R
 !!12R
 
uo%	&R
 R
 R
  ^R
 R
 R
 R
 R
r<   r  )ri  r)   r  )Nr"   )r   )Ntypingr   r   r   rM   r   activationsr   cache_utilsr	   r
   r   integrationsr   masking_utilsr   modeling_attn_mask_utilsr   r   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   r    utils.deprecationr!   configuration_diar#   r$   r%   generation_diar&   integrations.flex_attentionr'   
get_loggerr/   r  r)   Moduler?   rh   r{   r   r   r   rd   r   r   r   r   r   r  r-   r$  r.   rQ  ri  r  __all__r;   r<   r=   <module>r     s  , - , , , , , , , , ,        ! ! ! ! ! ! C C C C C C C C C C 7 7 7 7 7 7 / / / / / / g g g g g g g g B B B B B B 9 9 9 9 9 9            L K K K K K K K F F F F F F F F & & & & & &                1 0 0 0 0 0 L L L L L L L L L L . . . . . .  !! KJJJJJJ 
	H	%	% 	? 	? 	? 	? 	? 	? 	? 	?! ! ! ! !ry ! ! !8) ) ) ) )RY ) ) )$ Y''J J J J J J J ('J(!< !< !< !< !< !< !< !<H( ( (   6	UU\ 	U# 	U%, 	U 	U 	U 	U& % %I%<% 
% <	%
 U\*% % % '(% % % %4?) ?) ?) ?) ?)ry ?) ?) ?)DG) G) G) G) G)	 G) G) G)T0 0 0 0 00 0 0 0BS S S S S# S S Sl8D 8D 8D 8D 8D0 8D 8D 8DvN& N& N& N& N&# N& N& N&b   
x
 x
 x
 x
 x
! x
 x
 
x
v   
l
 l
 l
 l
 l
"46H l
 l
 
l
^ L
K
Kr<   