
     `i                        d dl mZ d dlmZmZmZ d dlZd dlmZ d dl	m
Z
 ddlmZ ddlmZmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZ ddlmZmZ ddlmZm Z  ddl!m"Z" ddl#m$Z$m%Z%m&Z&m'Z'm(Z( ddl)m*Z* ddl+m,Z, ddl-m.Z.m/Z/ ddl0m1Z1  e(j2        e3          Z4e e&d           G d de$                                  Z5 ed           G d dej6                              Z7 G d dej6                  Z8 G d  d!ej6                  Z9d" Z:dHd#Z;d$ej<        d%e=d&ej<        fd'Z>	 dId)ej6        d*ej<        d+ej<        d,ej<        d-eej<                 d.e?d/e?d0e"e%         fd1Z@ G d2 d3ej6                  ZA G d4 d5e          ZB e&d6          e& G d7 d8e                                   ZCe& G d9 d:eC                      ZD G d; d<ej6                  ZE e&d=           G d> d?eCe                      ZF G d@ dAej6                  ZGe& G dB dCeC                      ZH e&dD           G dE dFeCe1                      ZIg dGZJdS )J    )	dataclass)CallableOptionalUnionN)check_model_inputs   )ACT2FN)CacheDynamicCache)GenerationMixin)use_kernel_forward_from_hub)create_causal_mask)GradientCheckpointingLayer)BaseModelOutputWithPastCausalLMOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringcan_return_tuplelogging)deprecate_kwarg   )	AutoModel   )	CsmConfigCsmDepthDecoderConfig)CsmGenerationMixinz:
    Base class for the model autoregressive outputs.
    )custom_introc                      e Zd ZU dZdZeej                 ed<   dZ	eej                 ed<   dZ
ee         ed<   dZeeej        df                  ed<   dZeeej        df                  ed<   dZeej                 ed	<   dZeej                 ed
<   dZee         ed<   dZeeej        df                  ed<   dZeeej        df                  ed<   dZeej                 ed<   dS )CsmOutputWithPasta	  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    depth_decoder_loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction) of the depth decoder model.
    depth_decoder_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the depth decoder (scores for each vocabulary token before SoftMax).
    depth_decoder_past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).
    depth_decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
        one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

        Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
    depth_decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.
    backbone_loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction) of the backbone model.
    Nlosslogitspast_key_values.hidden_states
attentionsdepth_decoder_lossdepth_decoder_logitsdepth_decoder_past_key_valuesdepth_decoder_hidden_statesdepth_decoder_attentionsbackbone_loss)__name__
__module____qualname____doc__r&   r   torchFloatTensor__annotations__r'   r(   r
   r)   tupler*   r+   r,   r-   r.   r/   r0        x/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/csm/modeling_csm.pyr%   r%   2   sK         8 )-D(5$
%,,,*.FHU&'...'+OXe_+++=AM8E%"3S"89:AAA:>Ju0#567>>>6:!23:::8<(5#45<<<59!8E?999KO%0A30F*G!HOOOHLhuU->-C'DELLL15M8E-.55555r:   r%   RMSNormc                   ,     e Zd Zd fd	Zd Zd Z xZS )
CsmRMSNormư>c                     t                                                       t          j        t	          j        |                    | _        || _        dS )z9
        CsmRMSNorm is equivalent to T5LayerNorm
        N)super__init__nn	Parameterr5   onesweightvariance_epsilon)selfhidden_sizeeps	__class__s      r;   rB   zCsmRMSNorm.__init__d   sD     	l5:k#:#:;; #r:   c                    |j         }|                    t          j                  }|                    d                              dd          }|t          j        || j        z             z  }| j        |                    |          z  S )Nr   T)keepdim)	dtypetor5   float32powmeanrsqrtrG   rF   )rH   r)   input_dtypevariances       r;   forwardzCsmRMSNorm.forwardl   s|    #)%((77 $$Q'',,R,>>%Ht?T4T(U(UU{]--k::::r:   c                 H    t          | j        j                   d| j         S )Nz, eps=)r8   rF   shaperG   rH   s    r;   
extra_reprzCsmRMSNorm.extra_reprs   s&    )**II$2GIIIr:   )r?   )r1   r2   r3   rB   rW   r[   __classcell__rK   s   @r;   r>   r>   b   sb        $ $ $ $ $ $; ; ;J J J J J J Jr:   r>   c                   |     e Zd ZU ej        ed<   ddef fdZ ej                    e	d                         Z
 xZS )CsmRotaryEmbeddinginv_freqNconfigc                    t                                                       t          |d          rSt          |j        t
                    r9|j                            d|j                            d                    | _        nd| _        |j        | _	        |j        | _
        || _        t          | j                 | _        |                     | j        |          \  }| _        |                     d|d           | j        | _        d S )Nrope_scaling	rope_typetypedefaultr`   F
persistent)rA   rB   hasattr
isinstancerc   dictgetrd   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenra   r   rope_init_fnattention_scalingregister_bufferr`   original_inv_freq)rH   ra   devicer`   rK   s       r;   rB   zCsmRotaryEmbedding.__init__z   s    6>** 	'z&:Mt/T/T 	'#044[&BUBYBYZ`BaBabbDNN&DN"("@$*$B!/?+/+<+<T[&+Q+Q($(ZeDDD!%r:   c                 X   | j         d d d d f                                                             |j        d         dd                              |j                  }|d d d d d f                                         }t          |j        j        t                    r|j        j        dk    r|j        j        nd}t          j
        |d          5  |                                |                                z                      dd          }t          j        ||fd	          }|                                | j        z  }|                                | j        z  }	d d d            n# 1 swxY w Y   |                    |j        
          |	                    |j        
          fS )Nr   rM   r   mpscpuF)device_typeenabledr   dim)rO   )r`   floatexpandrY   rP   rt   rj   re   strr5   autocast	transposecatcosrq   sinrO   )
rH   xposition_idsinv_freq_expandedposition_ids_expandedrx   freqsembr   r   s
             r;   rW   zCsmRotaryEmbedding.forward   s    !M$4-8>>@@GGHZ[\H]_acdeehhijiqrr ,QQQaaaZ 8 > > @ @'1!(-'E'Ek!(-[`J`J`ahmmfk^UCCC 	5 	5&,,..1F1L1L1N1NNYYZ[]^__E)UEN333C''))d44C''))d44C		5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 vvAGv$$cff17f&;&;;;s   BE++E/2E/N)r1   r2   r3   r5   Tensorr7   r    rB   no_gradr   rW   r\   r]   s   @r;   r_   r_   w   s         l/ /y / / / / / /" U]__< <  _< < < < <r:   r_   c                   $     e Zd Z fdZd Z xZS )CsmMLPc                    t                                                       || _        |j        | _        |j        | _        t          j        | j        | j        |j                  | _        t          j        | j        | j        |j                  | _	        t          j        | j        | j        |j                  | _
        t          |j                 | _        d S )Nbias)rA   rB   ra   rI   intermediate_sizerC   Linearmlp_bias	gate_projup_proj	down_projr	   
hidden_actact_fnrH   ra   rK   s     r;   rB   zCsmMLP.__init__   s    !-!'!94#3T5KRXRabbby!143IPVP_```4#94;KRXRabbbV./r:   c                     |                      |                     |                     |                    |                     |          z            }|S r   )r   r   r   r   )rH   r   r   s      r;   rW   zCsmMLP.forward   sA    NN4;;t~~a/@/@#A#ADLLQROO#STT	r:   r1   r2   r3   rB   rW   r\   r]   s   @r;   r   r      sG        0 0 0 0 0      r:   r   c                     | dd| j         d         dz  f         }| d| j         d         dz  df         }t          j        | |fd          S )z*Rotates half the hidden dims of the input..NrM   r   rz   )rY   r5   r   )r   x1x2s      r;   rotate_halfr      s]    	
3"!'"+"""	#B	
3q """	#B9rc2YB''''r:   c                     |                     |          }|                     |          }| |z  t          |           |z  z   }||z  t          |          |z  z   }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )	unsqueezer   )qkr   r   r   unsqueeze_dimq_embedk_embeds           r;   apply_rotary_pos_embr      sc    ( --
&
&C
--
&
&C3w;q>>C/0G3w;q>>C/0GGr:   r)   n_repreturnc                     | j         \  }}}}|dk    r| S | dddddddddf                             |||||          } |                     |||z  ||          S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r   N)rY   r}   reshape)r)   r   batchnum_key_value_headsslenhead_dims         r;   	repeat_kvr      s    
 2?1D.Ehzz!!!!QQQaaa"23::5BUW\^bdlmmM  (;e(CT8TTTr:           modulequerykeyvalueattention_maskscalingdropoutkwargsc                 R   t          || j                  }t          || j                  }	t          j        ||                    dd                    |z  }
|$|d d d d d d d |j        d         f         }|
|z   }
t          j                            |
dt          j	                  
                    |j                  }
t          j                            |
|| j                  }
t          j        |
|	          }|                    dd                                          }||
fS )Nr   r   rM   )r{   rO   )ptrainingr   )r   num_key_value_groupsr5   matmulr   rY   rC   
functionalsoftmaxrQ   rP   rO   r   r   
contiguous)r   r   r   r   r   r   r   r   
key_statesvalue_statesattn_weightscausal_maskattn_outputs                r;   eager_attention_forwardr      s    3 ;<<JUF$?@@L<z';';Aq'A'ABBWLL!$QQQ111.D
0@0D.D%DE#k1=((2U](SSVVW\WbccL=((6?([[L,|\::K''1--88::K$$r:   c                       e Zd ZdZdedef fdZ eddd          	 	 dd
ej	        de
ej	        ej	        f         deej	                 dee         deej                 dee         de
ej	        ej	        f         fd            Z xZS )CsmAttentionz=Multi-headed attention from 'Attention Is All You Need' paperra   	layer_idxc                    t                                                       || _        || _        t	          |d|j        |j        z            | _        |j        |j        z  | _	        | j        dz  | _
        |j        | _        d| _        t          j        |j        |j        | j        z  |j                  | _        t          j        |j        |j        | j        z  |j                  | _        t          j        |j        |j        | j        z  |j                  | _        t          j        |j        | j        z  |j        |j                  | _        d S )Nr   g      Tr   )rA   rB   ra   r   getattrrI   num_attention_headsr   r   r   r   attention_dropout	is_causalrC   r   attention_biasq_projk_projv_projo_projrH   ra   r   rK   s      r;   rB   zCsmAttention.__init__   sB   "
F4F&Jd4dee$*$>&B\$\!}d*!'!9i :T] JQWQf
 
 
 i :T] JQWQf
 
 
 i :T] JQWQf
 
 
 i&68JQWQf
 
 
r:   past_key_valuer(   4.58new_nameversionNr)   position_embeddingsr   cache_positionr   r   c                 D   |j         d d         }g |d| j        R }|                     |                              |                              dd          }	|                     |                              |                              dd          }
|                     |                              |                              dd          }|\  }}t          |	|
||          \  }	}
|&|||d}|                    |
|| j	        |          \  }
}t          }| j        j        dk    rt          | j        j                 } || |	|
||f| j        sdn| j        | j        d|\  }} |j        g |dR                                  }|                     |          }||fS )NrM   r   r   )r   r   r   eagerr   )r   r   )rY   r   r   viewr   r   r   r   updater   r   ra   _attn_implementationr   r   r   r   r   r   r   )rH   r)   r   r   r(   r   r   input_shapehidden_shapequery_statesr   r   r   r   cache_kwargsattention_interfacer   r   s                     r;   rW   zCsmAttention.forward  s    $)#2#.88b8$-88{{=1166|DDNNqRSTT[[//44\BBLLQPQRR
{{=1166|DDNNqRSTT&S#7jRUWZ#[#[ j&#&snUUL'6'='=j,X\Xfht'u'u$J(?;+w66"9$+:Z"[$7$7	%
  $}HCC$2HL	%
 	%
 	%
 	%
!\ *k);;;;;;FFHHkk+..L((r:   )NN)r1   r2   r3   r4   r    intrB   r   r5   r   r8   r   r
   
LongTensorr   r   rW   r\   r]   s   @r;   r   r      s       GG
y 
S 
 
 
 
 
 
. _%0A6RRR ,059)) ))|)) #5<#=>)) !.	))
 "%)) !!12)) +,)) 
u|U\)	*)) )) )) SR)) )) )) )) ))r:   r   c                   4    e Zd Zdedef fdZ eddd          	 	 	 	 	 	 dd
ej        de	ej                 de	ej
                 de	e         de	e         de	ej
                 de	eej        ej        f                  dee         dej        fd            Z xZS )CsmDecoderLayerra   r   c                 4   t                                                       |j        | _        t          ||          | _        t          |          | _        t          |j        |j                  | _	        t          |j        |j                  | _
        d S )N)ra   r   rJ   )rA   rB   rI   r   	self_attnr   mlpr>   rms_norm_epsinput_layernormpost_attention_layernormr   s      r;   rB   zCsmDecoderLayer.__init__;  s    !-%VyIII&>>)&*<&BUVVV(263E6K^(_(_(_%%%r:   r   r(   r   r   NFr)   r   r   	use_cacher   r   r   r   c                     |}	|                      |          } | j        d|||||||d|\  }}
|	|z   }|}	|                     |          }|                     |          }|	|z   }|S )N)r)   r   r   r(   r   r   r   r9   )r   r   r   r   )rH   r)   r   r   r(   r   r   r   r   residual_s              r;   rW   zCsmDecoderLayer.forwardE  s     !,,];;)4> 	
')%+) 3	
 	
 	
 	
q !=0 !55mDD// =0r:   )NNNFNN)r1   r2   r3   r    r   rB   r   r5   r   r   r   r
   boolr8   r   r   rW   r\   r]   s   @r;   r   r   :  s5       `y `S ` ` ` ` ` ` _%0A6RRR 2637+/$)59KO | !. u/0	
 "% D> !!12 &eEL%,,F&GH +, 
   SR    r:   r   z[
    The bare Csm Model outputting raw hidden-states without any specific head on top.
    c                   X     e Zd ZU eed<   dZdZdgZdgZdZ	dZ
dZdZeedZ fdZ xZS )CsmPreTrainedModelra   modelTr   r(   )r)   r*   c                    t                                          |           t          |t                    rM|j        }t          |dz
            D ]5}|j        j        |                             d| j	        j
                   4d S d S )Nr   r   )rS   std)rA   _init_weightsrj   CsmCodebooksHeadnum_codebooksrangerF   datanormal_ra   initializer_range)rH   r   r   irK   s       r;   r   z CsmPreTrainedModel._init_weights  s    f%%%f.// 	["0M=1,-- [ ["1%--3DK<Y-ZZZZ	[ 	[[ [r:   )r1   r2   r3   r    r7   base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_can_compile_fullgraph_supports_attention_backendr   r   _can_record_outputsr   r\   r]   s   @r;   r   r   h  s          &*#*+#4"5N ""&(" 
[ [ [ [ [ [ [ [ [r:   r   c                   D    e Zd ZU eed<    fdZee	 	 	 	 	 	 	 	 ddee	j
                 dee	j                 dee	j                 dee	j
                 dee         d	ee	j                 d
ee         dee	j
                 dee         deeef         fd                        Z xZS )CsmDepthDecoderModelra   c                 .   t                                                     j        | _        j        | _        t          j        j        j        z  j                  | _	        t          j
        fdt          j                  D                       | _        t          j        j                  | _        t%                    | _        d| _        t          j        j        j        d          | _        |                                  d S )Nc                 0    g | ]}t          |          S r9   r   .0r   ra   s     r;   
<listcomp>z1CsmDepthDecoderModel.__init__.<locals>.<listcomp>  #    aaaI_VY//aaar:   r   ra   Fr   )rA   rB   pad_token_idpadding_idx
vocab_sizerC   	Embeddingr   backbone_hidden_sizeembed_tokens
ModuleListr  num_hidden_layerslayersr>   rI   r   normr_   
rotary_embgradient_checkpointingr   inputs_embeds_projector	post_initr   s    `r;   rB   zCsmDepthDecoderModel.__init__  s       !. +L&*>AR*RU[UpqqmaaaavG_A`A`aaa
 
 v1v7JKKK	,F;;;&+#')y1LfN`gl'm'm'm$ 	r:   N	input_idsbackbone_last_hidden_stater   r   r(   inputs_embedsr   r   r   r   c	                    |:t           j                                        st                              d           d}|du |duz  rt          d          |r|t          | j                  }|^||                                nd}
||j	        d         n|j	        d         }||j
        n|j
        }t          j        |
|
|z   |          }|t          j        |dz
  d          }|| j        z  }|                     ||z             }|d         dk    }|
||dddf<   n:t           j                                        s|rt                              d	           |                     |          }t#          | j        |||||
          }|}|                    d          }|                     ||          }| j        d| j        j                 D ]} ||f||||||d|	}|                     |          }t/          ||r|nd          S )aJ  
        backbone_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, backbone_hidden_size)`, *optional*):
            The last hidden state of the backbone model. Such input is required when the first codebook token (the one generated by the backbone model)
            is provided in the `input_ids` argument.
        NzCustom `position_ids` were provided but will be ignored. CSM depth decoder automatically determines position_ids from `cache_position` and as it requires them to be identical across the batch, the provided position_ids will be ignored.z;You must specify exactly one of input_ids or inputs_embeds.r  r   r   rt   )minzvWhen the first codebook token is provided, `backbone_last_hidden_state` should also be provided for correct inference.ra   input_embedsr   r   r(   r   )r   r   r(   r   r   r   last_hidden_stater(   )r5   compileris_compilingloggerwarning_once
ValueErrorr   ra   get_seq_lengthrY   rt   arangeclampr  r  warningr%  r   r   r#  r!  r   r"  r   )rH   r'  r(  r   r   r(   r)  r   r   r   past_seen_tokensinputs_seq_lengthrt   codebook_idxsoffsetinput_ids_are_first_codebookr   r)   r   decoder_layers                       r;   rW   zCsmDepthDecoderModel.forward  s   & #EN,G,G,I,I#M    L-t";< 	\Z[[[ 	?0*$+>>>O!CRC^==???de:G:S 3A 6 6YbYhijYk-:-F]))IL\F"\*:<LO`<`iopppN !K(:BBBM"T_4F --i&.@AAM+9!+<+A()5&@aaad##~2244 9U NN Q   44]CC(;&))+%
 
 
 & &//22"oom\JJ![)H4;+H)HI 
	 
	M)M	*) /#-$7	 	 	 	MM 		-00&+/8BOOd
 
 
 	
r:   )NNNNNNNN)r1   r2   r3   r!   r7   rB   r   r   r   r5   r   r6   r   r
   r   r   r   r   r8   r   rW   r\   r]   s   @r;   r  r    sM        !!!!       15BF1537+/59$(59R
 R
E,-R
 %-U->$?R
 !.	R

 u/0R
 "%R
   12R
 D>R
 !!12R
 +,R
 
u--	.R
 R
 R
 ^ R
 R
 R
 R
 R
r:   r  c                   &     e Zd Z fdZddZ xZS )r   c                     t                                                       || _        t          j        t          j        | j        dz
  ||                    | _        d S Nr   )rA   rB   r   rC   rD   r5   emptyrF   )rH   rI   r   r  rK   s       r;   rB   zCsmCodebooksHead.__init__  sM    *l5;t/AA/E{T^#_#_``r:   Nc                    |-j         d         }| j        t          j        |                   n|dz
  }| j        |         fdt	          j         d                   D             t          j        d          S )Nr   c           	          g | ]:}t           j                            d d |d d f         |         j                  ;S r   )rC   r   linearT)r  codebook_idxcodebook_weightr)   s     r;   r  z,CsmCodebooksHead.forward.<locals>.<listcomp>  sX     
 
 
 M  qqq,/A!BOT`DaDcdd
 
 
r:   r   rz   )rY   rF   r5   r7  r  stack)rH   r)   r   
seq_lengthr<  rI  s    `   @r;   rW   zCsmCodebooksHead.forward  s    !&,Q/J"k%,z*B*BCOO*Q.M"k-8O
 
 
 
 
 %o&;A&> ? ?
 
 
 Mq999r:   r   r   r]   s   @r;   r   r     sQ        a a a a a
       r:   r   a$  
    The CsmDepthDecoder Model transformer, with a [`CsmCodebooksHead`] on top,
    which can be seen a position-specific language modeling head, allowing to use a different linear layer for each codebook
    (e.g. position 0 is the first codebook and uses the first codebook head, etc.)
    c                   (    e Zd ZdZdZdZ fdZee	 	 	 	 	 	 	 	 	 	 dde	e
j                 de	e
j                 de	e
j                 de	e
j                 de	eeee
j                 f                  d	e	e
j                 d
e	e
j                 de	e         de	e
j                 deee
j        f         dee         deeef         fd                        Z	 	 	 	 dde
j        de	e         de	e
j                 d	e	e
j                 de	e
j                 f
 fdZ xZS )CsmDepthDecoderForCausalLMNc                     t                                          |           t          |          | _        |j        | _        t          |j        |j        |j                  | _        | 	                                 d S r   )
rA   rB   r  r   r  r   rI   r   codebooks_headr&  r   s     r;   rB   z#CsmDepthDecoderForCausalLM.__init__  sj       )&11
 +.v/A6CWY_Yjkk 	r:   r   r'  r(  r   r   r(   r)  labelsr   r   logits_to_keepr   r   c                     | j         d||||||||	d|}|d         }t          |
t                    r)|
dk    rt          dd          }nt          |
 d          }n|
}|                     |dd|ddf         |	|	|         nd          }|                                }d}|:|dddf                                         } | j        d|d| j        j        |d|}t          |||j
        |j        |j                  S )	a  
        backbone_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, backbone_hidden_size)`, *optional*):
            The last hidden state of the backbone model. Such input is required when the first codebook token (the one generated by the backbone model)
            is provided in the `input_ids` argument.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        )r'  r(  r   r   r(   r)  r   r   r   r   N.)r'   rP  r  shift_labels)r&   r'   r(   r)   r*   r9   )r   rj   r   slicerO  r   loss_functionra   r  r   r(   r)   r*   )rH   r'  r(  r   r   r(   r)  rP  r   r   rQ  r   outputsr)   slice_indicesr'   r&   rS  s                     r;   rW   z"CsmDepthDecoderForCausalLM.forward  sr   2 $* 

'A)%+')

 

 

 

  
nc** 	+"" %a %~ot < <*M$$!!!]AAA-.Q_Qk}0M0Mqu
 
 ""$$!#qrr'?5577L%4% dt{7M\h lr D &#3!/)
 
 
 	
r:   c                      t                      j        |||||fi |}|d         d         dk    }|s|                    d           |                    d           |S )Nr   r   r(  r   )rA   prepare_inputs_for_generationpop)
rH   r'  r(   r   r)  r   r   model_inputsis_first_generation_steprK   s
            r;   rY  z8CsmDepthDecoderForCausalLM.prepare_inputs_for_generationc  s     =uww<~
 
Y_
 
 $00@#A!#D#I ' 	;9::: 	(((r:   )
NNNNNNNNNr   NNNN)r1   r2   r3   _tied_weights_keys_tp_plan_pp_planrB   r   r   r   r5   r   r6   r   r   r
   listr   r   r   r   r8   r   rW   rY  r\   r]   s   @r;   rM  rM  
  s        HH      15BF1537KO59-1$(5934@
 @
E,-@
 %-U->$?@
 !.	@

 u/0@
 "%tE4E/F(F"GH@
   12@
 )*@
 D>@
 !!12@
 c5</0@
 +,@
 
u,,	-@
 @
 @
 ^ @
J ,0595959 # "% !!12	
   12 !!12         r:   rM  c                   $     e Zd Z fdZd Z xZS )CsmBackboneModelEmbeddingsc                    t                                                       t          j        |j        |j        z  |j                  | _        |                     dt          j
        |j                  |j        z  d           d S )Naudio_tokens_offsetsFrg   )rA   rB   rC   r  r   r  rI   embed_audio_tokensrr   r5   r7  r   s     r;   rB   z#CsmBackboneModelEmbeddings.__init__{  s    "$,0DvGX0X[a[m"n"n"EL1E$F$FIZ$Zgl 	 	
 	
 	
 	
 	
r:   c                 l    |                      || j        z             }|                    d          }|S )Nr   rz   )rf  re  sum)rH   r'  r.  s      r;   rW   z"CsmBackboneModelEmbeddings.forward  s9    ..y4;T/TUU#''A'..r:   r   r]   s   @r;   rc  rc  z  sG        
 
 
 
 
      r:   rc  c                       e Zd Z fdZee	 	 	 	 	 	 	 ddeej                 deej	                 deej                 dee
         deej                 deej                 d	ee         d
ee         defd                        Z xZS )CsmBackboneModelc                    t                                                     j        | _        j        | _        t                    | _        t          j        fdt          j
                  D                       | _        t          j        j                  | _        t!                    | _        d| _        |                                  d S )Nc                 0    g | ]}t          |          S r9   r  r  s     r;   r  z-CsmBackboneModel.__init__.<locals>.<listcomp>  r  r:   r   r  F)rA   rB   r  r  r  rc  r  rC   r  r  r   r!  r>   rI   r   r"  r_   r#  r$  r&  r   s    `r;   rB   zCsmBackboneModel.__init__  s       !. +6v>>maaaavG_A`A`aaa
 
 v1v7JKKK	,F;;;&+# 	r:   Nr'  r   r   r(   r)  r   r   r   r   c           
      N   |du |duz  rt          d          ||                     |          }|r|t          | j                  }|B||                                nd}	t          j        |	|	|j        d         z   |j                  }||	                    d          }t          | j        |||||          }
|}|                     ||          }| j        d| j        j                 D ]} ||f|
||||d|}|                     |          }t          ||	          S )
a&  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length, num_codebooks) or (batch_size, sequence_length)`):
            1. (batch_size, sequence_length): corresponds to the input sequence prepared with the processor from the text prompt. Such input
            requires `input_values` to be provided so that audio can be encoded in codebook tokens and then merged with the text tokens.

            2. (batch_size, sequence_length, num_codebooks): codebook tokens generated during the autoregressive decoding. Such input is not meant to be used by end users.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        Nz:You must specify exactly one of input_ids or inputs_embedsr  r   r   r+  r-  )r   r   r(   r   r   r/  )r5  r  r   ra   r6  r5   r7  rY   rt   r   r   r#  r!  r   r"  r   )rH   r'  r   r   r(   r)  r   r   r   r:  r   r)   r   r?  s                 r;   rW   zCsmBackboneModel.forward  s   2 -t";< 	[YZZZ *.*;*;I*F*FM 	?0*$+>>>O!CRC^==???de+0< "2]5H5K"KTaTh, , ,N )33A66L(;&))+%
 
 
 &"oom\JJ![)H4;+H)HI 		 		M)M*) /-$7   MM 		-00&++
 
 
 	
r:   )NNNNNNN)r1   r2   r3   rB   r   r   r   r5   r   r   r
   r6   r   r   r   r   rW   r\   r]   s   @r;   rj  rj    s             151537+/5959$(D
 D
E,-D
 !.D
 u/0	D

 "%D
   12D
 !!12D
 D>D
 +,D
 
!D
 D
 D
 ^ D
 D
 D
 D
 D
r:   rj  z
    The Csm model consists of two llama-like auto-regressive transformer models: a backbone model that predicts the first codebook token and a depth decoder that predicts the other codebook tokens.
    c                   
    e Zd ZddgZ fdZd Zd Zd Ze fd            Z	 fdZ
	 	 	 	 dd
eej                 deej                 deej                 deej                 deej                 f
dZ	 	 	 	 dd
ej        dee         deej                 deej                 deej                 f
 fdZee	 	 	 	 	 	 	 	 	 	 	 dd
eej                 deej                 deej                 deej                 deej                 deeeeej                 f                  deej                 deej                 dee         deej                 deeej        f         dee         deeef         fd                        Z xZS )CsmForConditionalGenerationz5backbone_model.embed_tokens.embed_audio_tokens.weightz'depth_decoder.model.embed_tokens.weightc                    t                                          |           |j        | _        t          j        |j        |j        d          | _        t          j        |j        |j                  | _	        t                              |          | _        t                              |j                  | _        t!          j        |j                  | _        |                                  d S )NFr   )rA   rB   r  rC   r   rI   lm_headr  text_vocab_sizeembed_text_tokensrj  _from_configbackbone_modelrM  depth_decoder_configdepth_decoderr   from_configcodec_configcodec_modelr&  r   s     r;   rB   z$CsmForConditionalGeneration.__init__  s        +y!3V5FUSSS!#f.DfFX!Y!Y.;;FCC7DDVE`aa$01DEEr:   c                     | j         j        S r   ru  r  rZ   s    r;   get_input_embeddingsz0CsmForConditionalGeneration.get_input_embeddings  s    "//r:   c                     || j         _        d S r   r|  )rH   r   s     r;   set_input_embeddingsz0CsmForConditionalGeneration.set_input_embeddings  s    +0(((r:   c                     | j         j        r6|                     | j        j        j        | j        j        j                   d S d S r   )ra   tie_codebooks_embeddings_tie_or_clone_weightsru  r  rf  rw  r   rZ   s    r;   _tie_weightsz(CsmForConditionalGeneration._tie_weights  sS    ;/ 	&&#0C"(5    	 	r:   c                    |                     dd          r t                      j        |i |\  }}n t                      j        |i |}dt                    fdt	          |j                                                  D             }t	          |j        j                                      ddi|           |D ]}t          |j        |z              d|v r||fS |S )Noutput_loading_infoFdepth_decoder_c                 V    i | ]%\  }}|                               |d          |&S r   )
startswith)r  attrr   prefix
prefix_lens      r;   
<dictcomp>z?CsmForConditionalGeneration.from_pretrained.<locals>.<dictcomp>  sJ     
 
 
ev&&
u
 
 
r:   _from_model_config)
rl   rA   from_pretrainedlenvarsgeneration_configitemsrw  r   delattr)
clsargsr   r   loading_infodepth_decoder_attrsr  r  r  rK   s
          @@r;   r  z+CsmForConditionalGeneration.from_pretrained  s'   ::+U33 	="9%''"94"J6"J"JE<<+EGG+T<V<<E "[[

 
 
 
 
#E$;<<BBDD
 
 
 	U 233::<PRW;o[n;oppp ( 	< 	<DE+Vd];;;; F**,&&Lr:   c                    d}| j         j                                        }|                    dd            |                                D ]\  }}t          | j        ||z   |            t                      j        |i | d S )Nr  transformers_version)rw  r  to_diff_dictrZ  r  setattrrA   save_pretrained)rH   r  r   r  r  r  r   rK   s          r;   r  z+CsmForConditionalGeneration.save_pretrained  s    !"0BOOQQ 6===.4466 	B 	BKD%D*FTM5AAAA000000r:   Nr'  input_valuesinput_values_cutoffsrP  r   c                    |                      |          }|Lt          j                            |d          }||dk                                             }||dk             }t          j        |                                |j                  	                    t          |          d          }||                    d          k     }t          j                    5  g }t          ||          D ]\  }	}
|
|
dk             }
t          |
j        d         dz
            D ]}|
|         }|
|dz            }|	d||f         }| j                            |                    d                    }|j                            dd          }|                    |d                    t          d |D                       t          j        fd	|D                       }| j                            |          }ddd           n# 1 swxY w Y   | j        j        }||k    }| j                            |          }||         ||<   t          j        dd| j        j        f|j        t
          j        
          | j        j        z  }| j                            |                              d          }|| j        j         k    }|!                    |"                                d          ||<   |v|                    d          !                    dd| j        j                  }||         ||<   |||<   |dk    #                    d          }d||d         |d         ddf<   |}||dS )a  
        Merges the input_ids and input_values to produce a single inputs_embeds tensor:
        1 - Infers the codec model on the input_values to retrieve codebook token.
        2 - Embeds codebook tokens and places them at the correct positions in the inputs_embeds tensor.
        3 - If labels are provided, expands them to match codebook dimensions and position the target codebook tokens in the inputs_embeds tensor.

        Args:
            input_ids (`torch.Tensor` of shape `(batch_size, sequence_length)`):
                The input ids to embed.
            input_values (`torch.Tensor` of shape `(batch_size, channels, audio_sequence_length)`):
                The audio input values to embed.
            input_values_cutoffs (`torch.Tensor` of shape `(batch_size, max_num_audio)`):
                The cutoffs of the audio input values relative to its batch index, padded with -1 when no audio.
        Nr   r   r   r+  rM   r   .c              3   0   K   | ]}|j         d          V  dS )r   N)rY   )r  els     r;   	<genexpr>zQCsmForConditionalGeneration._merge_input_ids_with_input_values.<locals>.<genexpr>Y  s(      &O&Orrx{&O&O&O&O&O&Or:   c                 t    g | ]4}t           j                            |d d d |j        d          z
  f          5S )r   )rC   r   padrY   )r  r  max_audio_framess     r;   r  zRCsmForConditionalGeneration._merge_input_ids_with_input_values.<locals>.<listcomp>[  sB    rrrZ\R]&&rAq!5EQR5S+TUUrrrr:   )rt   rO   iTas_tuple)r)  rP  )$rs  rC   r   r  diffr5   r7  maxrt   r}   r  r   r   zipr  rY   rz  encodeaudio_codesr   appendrJ  get_audio_codes_maskra   audio_token_idru  r  rE   r   longcodebook_eos_token_idsqueezeaudio_eos_token_idrepeatrh  nonzero)rH   r'  r  r  rP  r)  audio_lengthsinput_values_maskaudio_tokens_listbatch_input_valuesbatch_input_values_cutoffsr  	start_idxend_idxaudio_batchcodec_outputscodebook_idsbatched_audio_token_idsaudio_codes_maskr  audio_token_maskaudio_embedsaudio_eos_frame_idsaudio_eos_embedsaudio_eos_token_masklabels_expanded depth_decoder_ignore_frames_idxsr  s                              @r;   "_merge_input_ids_with_input_valuesz>CsmForConditionalGeneration._merge_input_ids_with_input_values)  s   * ..y99##%=#4#45I6#R#R 01E1JKPPRRM)-!*;<M %-A-E-E-G-GP\Pc d d d k kM""B! ! !2M4K4KA4N4N N
  \ \$&!FI,XlFmFm B BB&(B1KLfjkLk1l."#=#CA#F#JKK B B$>q$A	"<QU"C&8i>O9O&P(,(8(?(?@U@UVW@X@X(Y(Y'4'@'J'J1b'Q'Q)00aAAAAB $'&O&O=N&O&O&O#O#O */+rrrr`qrrr+ +' $(#3#H#HIZ#[#[ !\ \ \ \ \ \ \ \ \ \ \ \ \ \ \$ "[7N(N:.;;<STTL.:;K.LM*+ 
Aq$+";<YEU]b]ghhh+34    $2??@STT\\]^__#,0N#N 2B2I2IJ^JbJbJdJdfg2h2hM./ !"("2"22"6"6"="=aDKD]"^"^4KL\4] 018K 454:dN3K3KUY3K3Z3Z0pt @ CEefgEhjkjljl lm(!.&AAAs   DHHHr(   r   r)  r   c           	      x    t                      j        d	|||||d|}||j        dk    r|                    d          w|                     ||                    d          |                    d          |                    d                    }|                    |d         |d         d d           |S )
N)r'  r(   r   r)  r   r   r)  r  r  rP  )r'  r  r  rP  )r)  rP  r'  r9   )rA   rY  ndimrl   r  r   )
rH   r'  r(   r   r)  r   r   r[  merged_inputsrK   s
            r;   rY  z9CsmForConditionalGeneration.prepare_inputs_for_generation{  s     =uww< 
+)')
 
 
 
  Y^q%8%8\=M=Mo=^=^=f CC##ZZ77%+ZZ0F%G%Gzz(++	 D  M "/"@MZbLcrvww   r:   r   r   r   rQ  r   c                    |5|j         dk    r*|                     ||||          }|d         }|d         }d} | j        d||||||	|
d|}|d         }t          |t                    rt          | d          n|}|                     |dd|ddf                   }d}d}d}d}||dddddf         } | j        d||| j        j	        d|}|ddddddf         d	k    
                    d
           }||         dd| j        j        dz
  f         }t          j                            |dd          }|                    d          }||d         |d         dz
  ddf         }||         } | j        d|||	d|d|}|j        }||z   }t%          |||||j        |j        |j        ||j        nd||j        nd||j        nd||j        nd          S )a  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length, num_codebooks) or (batch_size, sequence_length)`):
            1. (batch_size, sequence_length): corresponds to the input sequence prepared with the processor from the text prompt. Such input
            requires `input_values` to be provided so that audio can be encoded in codebook tokens and then merged with the text tokens.

            2. (batch_size, sequence_length, num_codebooks): codebook tokens generated during the autoregressive decoding. Such input is not meant to be used by end users.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        input_values_cutoffs (`torch.Tensor` of shape `(batch_size, max_num_audio)`, *optional*):
            Specify the end positions of audio segments within each batch entry, relative to the concatenated audio input.
            If a batch entry has fewer segments than the maximum, it is padded with -1. For example, in a batch of 2 sequences
            where the first contains 2 audio segments of length l1, and the second contains 1 audio segment of length l2,
            the input_values_cutoffs would be: [[l1, 2 * l1], [l2, -1]].
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[config.audio_token_id, -100, -101]`.
            Requires targeted `input_values` to be provided as audio tokens will be inferred from it using the `codec_model`.
            - `config.audio_token_id` indicates an audio frames (considering sequence length elements as frames)
            - `-100` will be ignored in the loss computation
            - `-101` indicates the audio frame will be used only for the backbone model (using the first codebook token as labels)

            Such labels can be prepared using `output_labels=True` when calling [`CsmProcessor`].
        logits_to_keep (`int` or `torch.Tensor`, *optional*):
            Kept for compatibility. Does not support another value than:
            1. `0`, which is equivalent to keeping all logits, used in the training regime
            2. `1`, which is equivalent to keeping only the last logit, used in the generation regime

        Example:

        ```python
        >>> import torch
        >>> from transformers import CsmForConditionalGeneration, AutoProcessor
        >>> from datasets import load_dataset, Audio

        >>> model_id = "sesame/csm-1b"
        >>> torch_device = "cuda" if torch.cuda.is_available() else "cpu"

        >>> processor = AutoProcessor.from_pretrained(model_id)

        >>> ds = load_dataset("hf-internal-testing/dailytalk-dummy", split="train")
        >>> # ensure the audio is 24kHz
        >>> ds = ds.cast_column("audio", Audio(sampling_rate=24000))

        >>> conversation = []
        >>> # prepare a conversation with text and corresponding audio
        >>> for text, audio, speaker_id in zip(ds[:4]["text"], ds[:4]["audio"], ds[:4]["speaker_id"]):
        ...     conversation.append(
        ...         {
        ...             "role": f"{speaker_id}",
        ...             "content": [{"type": "text", "text": text}, {"type": "audio", "path": audio["array"]}],
        ...         }
        ...     )

        >>> inputs = processor.apply_chat_template(
        ...     conversation,
        ...     tokenize=True,
        ...     return_dict=True,
        ...     output_labels=True,
        ... ).to(torch_device)

        >>> model = CsmForConditionalGeneration.from_pretrained(model_id, device_map=torch_device)
        >>> output = model(**inputs)
        >>> output.loss.backward()
        ```Nr   r)  rP  )r'  r   r   r(   r)  r   r   r   )r'   rP  r  r   r  rM   rz   .r  )r   Tr  )r'  r(  r   return_dictrP  )r&   r0   r+   r'   r(   r)   r*   r,   r-   r.   r/   r9   )r  r  ru  rj   r   rT  rq  rU  ra   r  allr   rC   r   r  r  rw  r&   r%   r(   r)   r*   r'   )rH   r'  r  r   r  r   r(   r)  rP  r   r   rQ  r   r  backbone_outputsbackbone_hidden_statesrW  backbone_logitsr&   r0   r+   depth_decoder_outputsbackbone_labels
train_maskdepth_decoder_input_ids
train_idxsbackbone_last_hidden_statesdepth_decoder_labelss                               r;   rW   z#CsmForConditionalGeneration.forward  s   f  Y^q%8%8 CC<)=v M */:M"8,FI.4. 	
)%+')	
 	
 	
 	
 "2!!48B>SV8W8Wk~ot444]k,,'=aaaPQPQPQ>Q'RSS! $$QQQ1WoO.D. &4;Ka ek M "!!!QQQ(+t388R8@@@J&,Z&8>]@Y\]@]>]9]&^#&(m&7&78OQW_`&7&a&a##++T+::J*@APZ[\P]`aPacdcdcdAd*e'#)*#5 $6D$6 %1+F# +% % % %! "7!; #55D '1",<*8'2AVAb!6!=!=hl$0 +@*O*O$0 )>(K(KI^Ij%:%E%Ept
 
 
 	
r:   r]  )NNNNNNNNNNr   )r1   r2   r3   r^  rB   r}  r  r  classmethodr  r  r   r5   r   r  r   r
   r6   rY  r   r   r   ra  r   r   r   r   r8   r%   rW   r\   r]   s   @r;   ro  ro    s!        	@1
    0 0 01 1 1       [41 1 1 1 1 -1/37;)-PB PBEL)PB u|,PB 'u|4	PB
 &PB 
%,	PB PB PB PBj ,0595959 # "% !!12	
   12 !!12     >  15/3157;37KO59-1$(5934[
 [
E,-[
 u|,[
 !.	[

 'u|4[
 u/0[
 "%tE4E/F(F"GH[
   12[
 )*[
 D>[
 !!12[
 c5</0[
 +,[
 
u''	([
 [
 [
 ^ [
 [
 [
 [
 [
r:   ro  )r   rj  r  rM  ro  rB  )r   )Kdataclassesr   typingr   r   r   r5   torch.nnrC   transformers.utils.genericr   activationsr	   cache_utilsr
   r   
generationr   integrationsr   masking_utilsr   modeling_layersr   modeling_outputsr   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   utils.deprecationr   autor   configuration_csmr    r!   generation_csmr"   
get_loggerr1   r3  r%   Moduler>   r_   r   r   r   r   r   r   r|   r   r   r   r   r  r   rM  rc  rj  ro  __all__r9   r:   r;   <module>r     s  , " ! ! ! ! ! , , , , , , , , , ,        9 9 9 9 9 9 ! ! ! ! ! ! . . . . . . . . ) ) ) ) ) ) 7 7 7 7 7 7 / / / / / / 9 9 9 9 9 9 O O O O O O O O K K K K K K K K F F F F F F F F & & & & & & _ _ _ _ _ _ _ _ _ _ _ _ _ _ 0 0 0 0 0 0       ? ? ? ? ? ? ? ? . . . . . . 
	H	%	%   
'6 '6 '6 '6 '6 '6 '6  '6T Y''J J J J J J J ('J(!< !< !< !< !< !< !< !<H    RY    ( ( (   6	UU\ 	U# 	U%, 	U 	U 	U 	U& % %I%<% 
% <	%
 U\*% % % '(% % % %4D) D) D) D) D)29 D) D) D)N+ + + + +0 + + +\   
 [ [ [ [ [ [ [  [4 g
 g
 g
 g
 g
- g
 g
 g
T    ry   .   f f f f f!3_ f f fR        V
 V
 V
 V
 V
) V
 V
 V
r   
P
 P
 P
 P
 P
"46H P
 P
 
P
f
  r:   