
     `i                     ,   d Z ddlmZmZmZ ddlZddlmZ ddlmZm	Z	 ddl
mZ ddlmZmZ dd	lmZ dd
lmZ ddlmZmZmZmZ ddlmZmZ ddlmZ ddlmZmZm Z m!Z!m"Z" ddl#m$Z$m%Z%m&Z&m'Z' ddl(m)Z) ddl*m+Z+m,Z,m-Z- ddl.m/Z/  e             rddl0m1Z1  e"j2        e3          Z4e G d de                      Z5 G d dej6                  Z7 G d de)          Z8 G d de%          Z9 G d de&          Z: G d  d!e$          Z; G d" d#ej6                  Z< G d$ d%e          Z= G d& d'e5          Z> G d( d)e          Z? G d* d+e5          Z@ ed,-           G d. d/e5                      ZA ed0-           G d1 d2e5e/                      ZBg d3ZCdS )4zPyTorch Dia model.    )CallableOptionalUnionN)nn   )DynamicCacheEncoderDecoderCache)create_causal_mask)_prepare_4d_attention_mask#_prepare_4d_attention_mask_for_sdpa)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)auto_docstringcan_return_tupleis_torch_flex_attn_availableis_torchdynamo_compilinglogging   )LlamaAttentionLlamaRMSNormLlamaRotaryEmbeddingeager_attention_forward)Phi3MLP   )	DiaConfigDiaDecoderConfigDiaEncoderConfig)DiaGenerationMixin)make_flex_block_causal_maskc                   >    e Zd ZU eed<   dZdZdZdZdZ	dZ
dZddgZdS )DiaPreTrainedModelconfigmodelT	input_idsDiaEncoderLayerDiaDecoderLayerN)__name__
__module____qualname__r"   __annotations__base_model_prefixsupports_gradient_checkpointing_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraphmain_input_name_no_split_modules     w/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/dia/modular_dia.pyr(   r(   9   sR         &*#N!!O*,=>r;   r(   c                   L     e Zd ZdZdef fdZdej        dej        fdZ xZ	S )DiaMultiChannelEmbeddinga  In order to efficiently compute the audio embedding from the 9 different channels,
    we vectorize the embedding process by using a single embedding layer and an offset.
    Example:
    - num_embeds = 4
    - vocab_size = 8
    - num_channels = 3
    We would have offsets = [0, 8, 16]
    If audio_codes = [0, 1, 2, 3], [1, 3, 4, 7], [5, 6, 7, 8],
    then tokens = audio_codes + offsets
                = [0, 1, 2, 3, 9, 11, 12, 15, 21, 22, 23, 24]
    This allows us to use a single embedding layer for all channels.
    r)   c                 Z   t                                                       t          j        |j        |j        z  |j                  | _        |j        | _        |j        | _        t          j	        |j        t          j
                  |j        z  }|                     d|d           d S )N)dtypeoffsetsF)
persistent)super__init__r   	Embedding
vocab_sizenum_channelshidden_sizeembedtorcharangelongregister_buffer)selfr)   rA   	__class__s      r<   rD   z!DiaMultiChannelEmbedding.__init__T   s    \&"3f6I"I6K]^^
!-"/,v2%*EEEHYYYEBBBBBr;   audio_codesreturnc                 $   || j                             |j                  z                       d          }|                     |                              |j        d         |j        d         d| j                  }|                    d          S )Nr!   r   r   )dim)	rA   todevicesqueezerI   viewshaperH   sum)rN   rP   tokensembedss       r<   forwardz DiaMultiChannelEmbedding.forward\   sw    0B C CCLLQOOF##((a+:KA:NPRTXTdeezzaz   r;   )
r.   r/   r0   __doc__r#   rD   rJ   Tensorr]   __classcell__rO   s   @r<   r>   r>   F   s|         C/ C C C C C C!5< !EL ! ! ! ! ! ! ! !r;   r>   c                       e Zd ZdS )DiaMLPNr.   r/   r0   r:   r;   r<   rc   rc   b           Dr;   rc   c                       e Zd ZdS )
DiaRMSNormNrd   r:   r;   r<   rg   rg   f   re   r;   rg   c                       e Zd ZdS )DiaRotaryEmbeddingNrd   r:   r;   r<   ri   ri   j   re   r;   ri   c                   8    e Zd ZdZddeeef         dedefdZ	dS )	DiaSelfAttention=Multi-headed attention from 'Attention Is All You Need' paperFr)   	layer_idx	is_causalc                    t           j                            |            || _        || _        |j        | _        | j        j        | _        | j        j        p| j        | _        | j        | j        z  | _	        t          |d|j        | j        z            | _        d| _        d| _        || _        t          j        | j        | j        | j        z  d          | _        t          j        | j        | j        | j        z  d          | _        t          j        | j        | j        | j        z  d          | _        t          j        | j        | j        z  | j        d          | _        d S )Nhead_dimr!           Fbias)r   ModulerD   r)   rm   rH   num_attention_heads	num_headsnum_key_value_headsnum_key_value_groupsgetattrrp   scalingattention_dropoutrn   Linearq_projk_projv_projo_proj)rN   r)   rm   rn   s       r<   rD   zDiaSelfAttention.__init__q   s0   
	4   "!-8#';#B#Tdn $(Nd6N$N!
F4F$.4XYY!$"i 0$.4=2PW\]]]i 0$2JT]2Zafgggi 0$2JT]2Zafgggi >@PW\]]]r;   N)F)
r.   r/   r0   r^   r   r$   r#   intboolrD   r:   r;   r<   rk   rk   n   s`        GG^ ^u%57G%GH ^UX ^ei ^ ^ ^ ^ ^ ^r;   rk   c                        e Zd ZdZdedef fdZ	 	 ddej        dej        de	ej                 d	e	e
         d
ee         deej        e	ej                 f         fdZ xZS )DiaCrossAttentionrl   r)   rm   c                    t                                                       || _        || _        |j        | _        |j        | _        | j        j        | _        | j        j        | _	        | j        | j	        z  | _
        |j        | _        d| _        d| _        d| _        t!          j        | j        | j        | j        z  d          | _        t!          j        | j        | j	        | j        z  d          | _        t!          j        | j        | j	        | j        z  d          | _        t!          j        | j        | j        z  | j        d          | _        d S )Nr!   rq   Frr   )rC   rD   r)   rm   rH   cross_hidden_sizecross_num_attention_headsrv   cross_num_key_value_headsrw   rx   cross_head_dimrp   rz   r{   rn   r   r|   r}   r~   r   r   rN   r)   rm   rO   s      r<   rD   zDiaCrossAttention.__init__   s&   "!-!'!9>#';#H $(Nd6N$N!-!$i 0$.4=2PW\]]]i 68PSWS`8`glmmmi 68PSWS`8`glmmmi >@PW\]]]r;   Nhidden_statescross_attention_statesattention_maskpast_key_valueskwargsrQ   c                    |j         d d         }g |d| j        R }g |j         d d         d| j        R }|                     |                              |                              dd          }	||j                            | j                  nd}
|;|
r9|j        j	        | j                 j
        }|j        j	        | j                 j        }n|                     |                              |                              dd          }|                     |                              |                              dd          }|3|j                            ||| j                  \  }}d|j        | j        <   t          }| j        j        dk    rt$          | j        j                 } || |	|||fd| j        i|\  }}|                    g |dR                                           }|                     |          }||fS )NrS   r!   r   FTeagerrz   )rY   rp   r}   rX   	transpose
is_updatedgetrm   cross_attention_cachelayerskeysvaluesr~   r   updater   r)   _attn_implementationr   rz   reshape
contiguousr   )rN   r   r   r   r   r   input_shapehidden_shapecross_shapequery_statesr   
key_statesvalue_statesattention_interfaceattn_outputattn_weightss                   r<   r]   zDiaCrossAttention.forward   s4    $)#2#.88b8$-88M.4SbS9M2Mt}MM{{=1166|DDNNqRSTTGVGb_/33DNCCChm
&:&(>EdnUZJ*@GW^LL%;<<AA+NNXXYZ\]^^J;;'=>>CCKPPZZ[\^_``L*+:+P+W+W N, ,(
L >B*4>:(?;+w66"9$+:Z"[$7$7%
 %
 L%
 %
 %
!\ "))*<K*<*<*<==HHJJkk+..L((r;   NN)r.   r/   r0   r^   r#   r   rD   rJ   r_   r   r	   r   r   tupler]   r`   ra   s   @r<   r   r      s        GG^/ ^C ^ ^ ^ ^ ^ ^. 269=1) 1)|1) !&1) !.	1)
 ""561) -.1) 
u|Xel33	41) 1) 1) 1) 1) 1) 1) 1)r;   r   c                        e Zd Zdedef fdZ	 	 ddej        dee	ej        ej        f                  deej                 de
e         d	e	ej        eej                 f         f
d
Z xZS )r,   r)   rm   c                    t                                                       t          |j        |j                  | _        t          ||d          | _        t          |j        |j                  | _        t          |          | _
        d S )NepsFrn   )rC   rD   rg   rH   norm_epspre_sa_normrk   self_attentionpost_sa_normrc   mlpr   s      r<   rD   zDiaEncoderLayer.__init__   su    %f&8foNNN.vyERRR&v'9vOOO&>>r;   Nr   position_embeddingsr   r   rQ   c                     |}|                      |          } | j        |f||d|\  }}||z   }|}|                     |          }|                     |          }	||	z   }||fS )Nr   r   )r   r   r   r   )
rN   r   r   r   r   residualnormed_statesself_attn_outputself_attn_weightsmlp_outs
             r<   r]   zDiaEncoderLayer.forward   s     !((77.Ad.A/
 3)/
 /
 	/
 /
++ !#33 ))-88((=)) 7*///r;   r   )r.   r/   r0   r$   r   rD   rJ   r_   r   r   r   r   r]   r`   ra   s   @r<   r,   r,      s        "/ "C " " " " " " LP15	0 0|0 &eEL%,,F&GH0 !.	0
 -.0 
u|Xel33	40 0 0 0 0 0 0 0r;   r,   c                        e Zd Zdef fdZee	 	 	 ddej        de	ej                 de	e
         de	e
         d	ee         d
eeef         fd                        Zdeej        df         dej        fdZ xZS )
DiaEncoderr)   c                 x   t                                                     | _        t          j        j        j                  | _        t          j        fdt          j
                  D                       | _        t          j        j                  | _        t                    | _        d S )Nc                 0    g | ]}t          |          S r:   )r,   .0rm   r)   s     r<   
<listcomp>z'DiaEncoder.__init__.<locals>.<listcomp>   #    aaaI_VY//aaar;   r   )rC   rD   r)   r   rE   rF   rH   	embedding
ModuleListrangenum_hidden_layersr   rg   r   normri   rotary_embeddingsrN   r)   rO   s    `r<   rD   zDiaEncoder.__init__   s       f&79KLLmaaaavG_A`A`aaa
 
 v1vGGG	!3F!;!;r;   NFr+   r   output_attentionsoutput_hidden_statesr   rQ   c                    |                      |          }t          j        |j        d         |j                  d d d f         }|                     ||          }|                     ||          }|rdnd }	|rdnd }
| j        D ],}|r|	|fz   }	 ||f||d|}|d         }|r|
|d         fz   }
-|                     |          }|r|	|fz  }	t          ||	|
          S )NrS   rV   r:   r   r   r!   last_hidden_stater   
attentions)
r   rJ   rK   rY   rV   r   _update_full_maskr   r   r   )rN   r+   r   r   r   r   r   position_idsr   encoder_statesall_attentionsencoder_layerlayer_outputss                r<   r]   zDiaEncoder.forward   sZ    y11
 |IOB$7	@PQQQRVXYXYXYRYZ"44]LQQ//
 

  4=0:d![ 	F 	FM# C!/=2B!B)M$7-  	 M *!,M  F!/=3C2E!E		-00 	/}..N+>Vd
 
 
 	
r;   inputs_embedsc                 *   || j         j        dk    r	d|v r|nd }nw| j         j        dk    rt          ||j                  }nQ| j         j        dk    r,t	          |t
          j                  rt          |d          }nt          ||j                  }|S )Nflash_attention_2r   sdpaflex_attentionFr   	r)   r   r   r@   
isinstancerJ   r_   r&   r   )rN   r   r   s      r<   r   zDiaEncoder._update_full_mask.  s    
 %{/3FFF343F3FD1V;; "E^UbUh!i!i15EEEnel;; b%@[`%a%a%aN "<NML_!`!`r;   )NFF)r.   r/   r0   r$   rD   r   r   rJ   r_   r   r   r   r   r   r   r   r]   r   r`   ra   s   @r<   r   r      s       	</ 	< 	< 	< 	< 	< 	<  26,1/4.
 .
<.
 !..
 $D>	.

 'tn.
 -..
 
%	&.
 .
 .
  ^.
belD01 |       r;   r   c                   D    e Zd Zdedef fdZ	 	 	 	 	 	 ddej        dee	ej        ej        f                  deej                 deej                 d	eej                 d
ee
         deej                 de	ej        eej                 eej                 f         fdZ xZS )r-   r)   rm   c                    t                                                       |j        | _        t	          ||d          | _        t          ||          | _        t          |j        |j	                  | _
        t          |j        |j	                  | _        t          |j        |j	                  | _        t          |          | _        d S )NTr   r   )rC   rD   rH   	embed_dimrk   r   r   cross_attentionrg   r   r   pre_ca_normpre_mlp_normrc   r   r   s      r<   rD   zDiaDecoderLayer.__init__F  s    +.vyDQQQ0CC%f&8foNNN%f&8foNNN&v'9vOOO&>>r;   Nr   r   r   encoder_hidden_statesencoder_attention_maskr   cache_positionrQ   c                 p   |}	t          |	t                    r|	j        }	|}
|                     |          } | j        ||||	fd|i|\  }}|
|z   }|}
|                     |          } | j        ||f||d|\  }}|
|z   }|}
|                     |          }|                     |          }|
|z   }|||fS )Nr   )r   r   )	r   r	   self_attention_cacher   r   r   r   r   r   )rN   r   r   r   r   r   r   r   r   self_attn_cacher   r   r   r   cross_statescross_attn_weightsr   s                    r<   r]   zDiaDecoderLayer.forwardP  s.    *o':;; 	C-BO ((77.Ad.A 	/
 	/
 *	/
 	/
 	/
++ !#33 ((77+?4+?!,
 2+	,
 ,

 ,
 ,
(( !</ ))-88((=)) 7*/1CCCr;   )NNNNNN)r.   r/   r0   r#   r   rD   rJ   r_   r   r   r	   
LongTensorr]   r`   ra   s   @r<   r-   r-   E  s.       "/ "C " " " " " " LP158<9=9=59-D -D|-D &eEL%,,F&GH-D !.	-D
  (5-D !) 6-D ""56-D !!12-D 
u|Xel3Xel5KK	L-D -D -D -D -D -D -D -Dr;   r-   c                       e Zd ZdZdef fdZee	 	 	 	 	 	 	 	 ddej	        de
ej                 de
ej	                 d	e
ej                 d
e
ej                 de
e         de
e         de
e         de
ej                 deeef         fd                        Zd	eej	        df         d
eej	        df         dej        dej	        fdZ xZS )
DiaDecoderz-Transformer Decoder Stack using DenseGeneral.r)   c                 z   t                                                     j        | _        j        | _        t	                    | _        t                    | _        t          j	        fdt          j                  D                       | _        t          j        j                  | _        d S )Nc                 0    g | ]}t          |          S r:   )r-   r   s     r<   r   z'DiaDecoder.__init__.<locals>.<listcomp>  r   r;   r   )rC   rD   rG   rF   r>   
embeddingsri   r   r   r   r   r   r   rg   rH   r   r   r   s    `r<   rD   zDiaDecoder.__init__  s       "/ +26::!3F!;!;maaaavG_A`A`aaa
 
 v1vGGG			r;   NFr+   r   r   r   r   r   r   r   r   rQ   c
                    |                                 dd         \  }}||                                nd}|	t          j        |||z   |j                  }	||	dddf         }|                     |          }|                     ||          }|/t                      s!||z   }t          j        |||j                  }t          | j
        |||	||          }|                     |||j        dd         |          }|rdnd}|rdnd}|r|dnd}| j        D ]>}|r||fz  } |||||f|||	d|
}|d         }|r||d	         fz   }|||d         fz   }?|                     |          }|r||fz  }t          |||||
          S )a  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length, num_codebooks)`):
            The original `decoder_input_ids` in 3D shape to facilitate more efficient computations.

            [What are input IDs?](../glossary#input-ids)
        NrS   r   r   )r)   input_embedsr   r   r   r   r   r:   )r   r   r   r!   )r   r   r   r   cross_attentions)sizeget_seq_lengthrJ   rK   rV   r   r   r   onesr
   r)   _update_cross_attn_maskrY   r   r   r   )rN   r+   r   r   r   r   r   r   r   r   r   
batch_size
seq_lengthpast_key_values_lengthr   r   mask_seq_lengthall_hidden_statesall_self_attnsall_cross_attentionslayerr   s                         r<   r]   zDiaDecoder.forward  sd   , "+!1!1#2#!6
JETE`!?!?!A!A!Afg!"\&(>(KT]Td  N )$'2L 	22"44]LQQ!*B*D*D!4zAO"Z
OIL\]]]N+;&))+%
 
 
 "&!=!=!"#	"
 "
 #7@BBD0:d&7h<Q<]rrdh[ 	V 	VE# 6!m%55!!E#%		
 (> /-	 	 	 	M *!,M  V!/=3C2E!E(4+?=QRCSBU+U(		-00 	2-!118+++%1
 
 
 	
r;   r   r   c                 \   ||| j         j        dk    r	d|v r|nd }n| j         j        dk    rt          ||j        |d                   }n`| j         j        dk    r3t	          |t
          j                  rt          ||d         d          }nt          ||j        |d                   }|S )	Nr   r   r   rS   )tgt_lenr   F)query_lengthrn   r   )rN   r   r   r   r   s        r<   r   z"DiaDecoder._update_cross_attn_mask  s     !,1G1S{/3FFFCDH^C^C^)?)?dh&&1V;; *M*!''O* * *&&
 15EEE4elCC -H.%0_"'. . .* *D*M,?UW* * *& &%r;   )NNNNNFFN)r.   r/   r0   r^   r#   rD   r   r   rJ   r_   r   r   FloatTensorr	   r   r   r   r   r]   Sizer   r`   ra   s   @r<   r   r     s       77	H/ 	H 	H 	H 	H 	H 	H  4815=A=A9=,1/459Z
 Z
<Z
 u/0Z
 !.	Z

  ((9:Z
 !))9 :Z
 ""56Z
 $D>Z
 'tnZ
 !!12Z
 
8%?	@Z
 Z
 Z
  ^Z
z!&$U\4%78!& !&elD&8 9!& Z	!&
 |!& !& !& !& !& !& !& !&r;   r   z[
    The bare Dia model outputting raw hidden-states without any specific head on top.
    )custom_introc                   z    e Zd Zdef fdZd Zee	 	 	 	 	 	 	 	 	 	 	 ddee	j
                 dee	j
                 dee	j
                 dee	j
                 d	ee	j
                 d
eeeef                  dee         dee         dee         dee         dee	j
                 deeef         fd                        Z xZS )DiaModelr)   c                     t                                          |           || _        t          |j                  | _        t          |j                  | _        | 	                                 d S N)
rC   rD   r)   r   encoder_configencoderr   decoder_configdecoder	post_initr   s     r<   rD   zDiaModel.__init__  s\       !&"788!&"788r;   c                     | j         S r	  )r  rN   s    r<   get_encoderzDiaModel.get_encoder  s
    |r;   Nr+   r   decoder_input_idsdecoder_position_idsdecoder_attention_maskencoder_outputsr   	use_cacher   r   r   rQ   c                    ||t          d          |	|	n| j        j        }	|
|
n| j        j        }
||n| j        j        }| j        r%| j        r|rt                              d           d}|r8|6t          t          | j                  t          | j                            }| | j        d|||	|
d|}nct          |t                    sNt          |d         t          |          dk    r|d         ndt          |          d	k    r|d	         nd
          }|d         j        d         d| j        j        j        }}}|)t%          j        |d|f| j        j        | j                  }|j        d	k    r+|                    |||                              dd	          } | j        d||||d         |||	|
||d
|}t5          |j        |j        |j        |j        |j        |d         |j        |j                  S )a\  
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size * num_codebooks, target_sequence_length)
        or (batch_size, target_sequence_length, num_codebooks)`, *optional*):
            1. (batch_size * num_codebooks, target_sequence_length): corresponds to the general use case where
            the audio input codebooks are flattened into the batch dimension. This also aligns with the flat-
            tened audio logits which are used to calculate the loss.

            2. (batch_size, sequence_length, num_codebooks): corresponds to the internally used shape of
            Dia to calculate embeddings and subsequent steps more efficiently.

            If no `decoder_input_ids` are provided, it will create a tensor of `bos_token_id` with shape
            `(batch_size, 1, num_codebooks)`. Indices can be obtained using the [`DiaProcessor`]. See
            [`DiaProcessor.__call__`] for more details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`):
            Indices of positions of each input sequence tokens in the position embeddings.
            Used to calculate the position embeddings up to `config.decoder_config.max_position_embeddings`.

            [What are position IDs?](../glossary#position-ids)
        NzXYou should either provide text ids or the cached text encodings. Neither has been found.zZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...F)r)   )r+   r   r   r   r   r!   r   r   rS   )r   
fill_valuerV   )
r+   r   r   r   r   r   r   r   r  r   )r   r   decoder_hidden_statesdecoder_attentionsr   encoder_last_hidden_stater   encoder_attentionsr:   ) 
ValueErrorr)   r   r   r  is_gradient_checkpointingtrainingloggerwarning_oncer	   r   r  r   r   lenrY   r  rG   rJ   fullbos_token_idrV   ndimr   r   r  r   r   r   r   r   r   )rN   r+   r   r  r  r  r  r   r  r   r   r   r   bszseq_lenchannelsdecoder_outputss                    r<   r]   zDiaModel.forward!  s   N !8j   2C1N--TXT_Tq$8$D  $+Jj 	 "+!6IIDK<Q	) 	"dm 	" "##p   "	 	v01,dk2R2R2RT`hlhsTtTtTtuuO"*dl #-"3%9	 
  OO O_== 	-"1!"4474H4H14L4Loa00RV14_1E1E1I1I?1--t  O #2!"4":1"=r4;C]CjhW$ %
1h'DK4LUYU`! ! ! !Q&& 1 9 9#x Q Q [ [\]_` a a&$, 
'-1"1!"4#1+/!5)
 
 
 
 "-?+;"1"?.9,=&5a&8"1"?.9	
 	
 	
 		
r;   )NNNNNNNNNNN)r.   r/   r0   r"   rD   r  r   r   r   rJ   r   r   r   r   r	   r   r   r]   r`   ra   s   @r<   r  r    s       y          15598<;?=ACG9=$(,0/359k
 k
E,-k
 !!12k
 $E$45	k

 'u'78k
 !))9 :k
 "%(>"?@k
 ""56k
 D>k
 $D>k
 'tnk
 !!12k
 
u((	)k
 k
 k
  ^k
 k
 k
 k
 k
r;   r  zl
    The Dia model consisting of a (byte) text encoder and audio decoder with a prediction head on top.
    c                       e Zd ZdZdef fdZd Zd Zee		 	 	 	 	 	 	 	 	 	 	 	 dde
ej                 de
ej                 d	e
ej                 d
e
ej                 de
ej                 de
eeef                  de
e         de
e         de
e         de
e         de
ej                 de
ej                 deeef         fd                        Z xZS )DiaForConditionalGenerationr*   r)   c                 `   t                                          |           || _        t          |          | _        |j        j        | _        |j        j        | _        t          j	        |j        j
        | j        | j        z  d          | _        d| _        |                                  d S )NFrr   ForMaskedLM)rC   rD   r)   r  r*   r  rG   rF   r   r|   rH   logits_dense	loss_typer  r   s     r<   rD   z$DiaForConditionalGeneration.__init__  s       f%%
"1> /:I!-0ADO0S[`
 
 
 ' 	r;   c                 4    | j                                         S r	  )r*   r  r  s    r<   r  z'DiaForConditionalGeneration.get_encoder      z%%'''r;   c                 4    | j                                         S r	  )r*   get_decoderr  s    r<   r3  z'DiaForConditionalGeneration.get_decoder  r1  r;   Nr+   r   r  r  r  r  r   r  r   r   labelsr   rQ   c                     | j         d	|||||||||	|
|d|}|d         }|j        d         }|                     |                              |d| j        | j        f                              dd                                                              || j        z  d| j                  }d}| | j        d	||| j        d|}t          |||j
        |j        |j        |j        |j        |j        |j        	  	        S )
a  
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size * num_codebooks, target_sequence_length)
        or (batch_size, target_sequence_length, num_codebooks)`, *optional*):
            1. (batch_size * num_codebooks, target_sequence_length): corresponds to the general use case where
            the audio input codebooks are flattened into the batch dimension. This also aligns with the flat-
            tened audio logits which are used to calculate the loss.

            2. (batch_size, sequence_length, num_codebooks): corresponds to the internally used shape of
            Dia to calculate embeddings and subsequent steps more efficiently.

            If no `decoder_input_ids` are provided, it will create a tensor of `bos_token_id` with shape
            `(batch_size, 1, num_codebooks)`. Indices can be obtained using the [`DiaProcessor`]. See
            [`DiaProcessor.__call__`] for more details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`):
            Indices of positions of each input sequence tokens in the position embeddings.
            Used to calculate the position embeddings up to `config.decoder_config.max_position_embeddings`.

            [What are position IDs?](../glossary#position-ids)
        labels (`torch.LongTensor` of shape `(batch_size * num_codebooks,)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in
            `[0, ..., config.decoder_config.vocab_size - 1]` or -100. Tokens with indices set to `-100`
            are ignored (masked).
        )r+   r   r  r  r  r  r   r  r   r   r   r   rS   r!   r   N)logitsr4  rF   )	lossr6  r   r  r  r   r  r   r  r:   )r*   rY   r.  rX   rG   rF   r   r   loss_functionr   r   r  r  r   r  r   r  )rN   r+   r   r  r  r  r  r   r  r   r   r4  r   r   outputsr   r   audio_logitsr7  s                      r<   r]   z#DiaForConditionalGeneration.forward  sE   X $* 
)/!5#9++/!5)
 
 
 
 $AJ&,Q/
 /00T:r4#4doFGGYq!__Z\\T*t00"doFF 	 %4%o\&UYUdoohnooD#3")"?&9$5&-&G")"?&9

 

 

 
	
r;   )NNNNNNNNNNNN)r.   r/   r0   r2   r"   rD   r  r3  r   r   r   rJ   r   r   r   r   r	   r   r   r]   r`   ra   s   @r<   r+  r+    s         y      ( ( (( ( (  15598<;?=ACG9=$(,0/3-159R
 R
E,-R
 !!12R
 $E$45	R

 'u'78R
 !))9 :R
 "%(>"?@R
 ""56R
 D>R
 $D>R
 'tnR
 )*R
 !!12R
 
uo%	&R
 R
 R
  ^R
 R
 R
 R
 R
r;   r+  )r  r(   r+  )Dr^   typingr   r   r   rJ   r   cache_utilsr   r	   masking_utilsr
   modeling_attn_mask_utilsr   r   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   llama.modeling_llamar   r   r   r   phi3.modeling_phi3r    configuration_diar"   r#   r$   generation_diar%   integrations.flex_attentionr&   
get_loggerr.   r   r(   rt   r>   rc   rg   ri   rk   r   r,   r   r-   r   r  r+  __all__r:   r;   r<   <module>rL     s     , , , , , , , , , ,        < < < < < < < < / / / / / /        C B B B B B 9 9 9 9 9 9            G F F F F F F F & & & & & & v v v v v v v v v v v v v v            ) ( ( ( ( ( L L L L L L L L L L . . . . . .  !! KJJJJJJ 
	H	%	% 	? 	? 	? 	? 	? 	? 	? 	?! ! ! ! !ry ! ! !8	 	 	 	 	W 	 	 		 	 	 	 	 	 	 		 	 	 	 	- 	 	 	^ ^ ^ ^ ^~ ^ ^ ^,G) G) G) G) G)	 G) G) G)T0 0 0 0 00 0 0 0BS S S S S# S S Sl8D 8D 8D 8D 8D0 8D 8D 8DvN& N& N& N& N&# N& N& N&b   
x
 x
 x
 x
 x
! x
 x
 
x
v   
l
 l
 l
 l
 l
"46H l
 l
 
l
^ L
K
Kr;   