
     `i|                     x   d dl mZ d dlmZmZ d dlZd dlmZ d dlm	Z	 ddl
mZmZ ddlmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZmZmZmZ ddlmZ ddlm Z m!Z!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z' ddl(m)Z)m*Z* ddl+m,Z,  ej-        e.          Z/e ed           G d de                                  Z0 G d de%          Z1 G d de&          Z2 G d de#          Z3 G d de           Z4 G d d e!          Z5 ed!          e G d" d#e                                  Z6e G d$ d%e$e6                      Z7 G d& d'ej8                  Z9 ed(           G d) d*e"e                      Z: G d+ d,ej8                  Z;e G d- d.e$                      Z< ed/           G d0 d1e6e,                      Z=g d2Z>dS )3    )	dataclass)OptionalUnionN)check_model_inputs   )CacheDynamicCache)GenerationMixin)create_causal_mask)BaseModelOutputWithPastCausalLMOutputWithPast)PreTrainedModel)Unpack)ModelOutputauto_docstringcan_return_tuplelogging   )	AutoModel)LlamaAttentionLlamaDecoderLayerLlamaForCausalLMLlamaMLP
LlamaModelLlamaRMSNormLlamaRotaryEmbeddingTransformersKwargs   )	CsmConfigCsmDepthDecoderConfig)CsmGenerationMixinz:
    Base class for the model autoregressive outputs.
    )custom_introc                      e Zd ZU dZdZeej                 ed<   dZ	eej                 ed<   dZ
ee         ed<   dZeeej        df                  ed<   dZeeej        df                  ed<   dZeej                 ed	<   dZeej                 ed
<   dZee         ed<   dZeeej        df                  ed<   dZeeej        df                  ed<   dZeej                 ed<   dS )CsmOutputWithPasta	  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    depth_decoder_loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction) of the depth decoder model.
    depth_decoder_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the depth decoder (scores for each vocabulary token before SoftMax).
    depth_decoder_past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).
    depth_decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
        one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

        Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
    depth_decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.
    backbone_loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction) of the backbone model.
    Nlosslogitspast_key_values.hidden_states
attentionsdepth_decoder_lossdepth_decoder_logitsdepth_decoder_past_key_valuesdepth_decoder_hidden_statesdepth_decoder_attentionsbackbone_loss)__name__
__module____qualname____doc__r%   r   torchFloatTensor__annotations__r&   r'   r   r(   tupler)   r*   r+   r,   r-   r.   r/        w/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/csm/modular_csm.pyr$   r$   1   sK         8 )-D(5$
%,,,*.FHU&'...'+OXe_+++=AM8E%"3S"89:AAA:>Ju0#567>>>6:!23:::8<(5#45<<<59!8E?999KO%0A30F*G!HOOOHLhuU->-C'DELLL15M8E-.55555r9   r$   c                       e Zd ZdS )
CsmRMSNormNr0   r1   r2   r8   r9   r:   r<   r<   b           Dr9   r<   c                       e Zd ZdS )CsmRotaryEmbeddingNr=   r8   r9   r:   r@   r@   f   r>   r9   r@   c                       e Zd ZdS )CsmMLPNr=   r8   r9   r:   rB   rB   j   r>   r9   rB   c                       e Zd ZdS )CsmAttentionNr=   r8   r9   r:   rD   rD   n   r>   r9   rD   c                       e Zd ZdS )CsmDecoderLayerNr=   r8   r9   r:   rF   rF   r   r>   r9   rF   z[
    The bare Csm Model outputting raw hidden-states without any specific head on top.
    c                   X     e Zd ZU eed<   dZdZdgZdgZdZ	dZ
dZdZeedZ fdZ xZS )CsmPreTrainedModelconfigmodelTrF   r'   )r(   r)   c                    t                                          |           t          |t                    rM|j        }t          |dz
            D ]5}|j        j        |                             d| j	        j
                   4d S d S )Nr   g        )meanstd)super_init_weights
isinstanceCsmCodebooksHeadnum_codebooksrangeweightdatanormal_rI   initializer_range)selfmodulerR   i	__class__s       r:   rO   z CsmPreTrainedModel._init_weights   s    f%%%f.// 	["0M=1,-- [ ["1%--3DK<Y-ZZZZ	[ 	[[ [r9   )r0   r1   r2   r   r6   base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_can_compile_fullgraph_supports_attention_backendrF   rD   _can_record_outputsrO   __classcell__r[   s   @r:   rH   rH   v   s          &*#*+#4"5N ""&(" 
[ [ [ [ [ [ [ [ [r9   rH   c                   D    e Zd ZU eed<    fdZee	 	 	 	 	 	 	 	 ddee	j
                 dee	j                 dee	j                 dee	j
                 dee         d	ee	j                 d
ee         dee	j
                 dee         deeef         fd                        Z xZS )CsmDepthDecoderModelrI   c                     t                                          |           t          j        |j        |j        z  |j                  | _        t          j        |j        |j	        d          | _
        d S NF)bias)rN   __init__nn	EmbeddingrR   
vocab_sizebackbone_hidden_sizeembed_tokensLinearhidden_sizeinputs_embeds_projectorrX   rI   r[   s     r:   rl   zCsmDepthDecoderModel.__init__   sd       L&*>AR*RU[Upqq')y1LfN`gl'm'm'm$$$r9   N	input_idsbackbone_last_hidden_stateattention_maskposition_idsr'   inputs_embeds	use_cachecache_positionkwargsreturnc	                    |:t           j                                        st                              d           d}|du |duz  rt          d          |r|t          | j                  }|^||                                nd}
||j	        d         n|j	        d         }||j
        n|j
        }t          j        |
|
|z   |          }|t          j        |dz
  d          }|| j        z  }|                     ||z             }|d         dk    }|
||dddf<   n:t           j                                        s|rt                              d	           |                     |          }t#          | j        |||||
          }|}|                    d          }|                     ||          }| j        d| j        j                 D ]} ||f||||||d|	}|                     |          }t/          ||r|nd          S )aJ  
        backbone_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, backbone_hidden_size)`, *optional*):
            The last hidden state of the backbone model. Such input is required when the first codebook token (the one generated by the backbone model)
            is provided in the `input_ids` argument.
        NzCustom `position_ids` were provided but will be ignored. CSM depth decoder automatically determines position_ids from `cache_position` and as it requires them to be identical across the batch, the provided position_ids will be ignored.z;You must specify exactly one of input_ids or inputs_embeds.)rI   r   r   device)minzvWhen the first codebook token is provided, `backbone_last_hidden_state` should also be provided for correct inference.)rI   input_embedsrx   r|   r'   ry   )rx   ry   r'   r{   r|   position_embeddings)last_hidden_stater'   )r4   compileris_compilingloggerwarning_once
ValueErrorr	   rI   get_seq_lengthshaper   arangeclampro   rq   warningrt   r   	unsqueeze
rotary_emblayersnum_hidden_layersnormr   )rX   rv   rw   rx   ry   r'   rz   r{   r|   r}   past_seen_tokensinputs_seq_lengthr   codebook_idxsoffsetinput_ids_are_first_codebookcausal_maskr(   r   decoder_layers                       r:   forwardzCsmDepthDecoderModel.forward   s   & #EN,G,G,I,I#M    L-t";< 	\Z[[[ 	?0*$+>>>O!CRC^==???de:G:S 3A 6 6YbYhijYk-:-F]))IL\F"\*:<LO`<`iopppN !K(:BBBM"T_4F --i&.@AAM+9!+<+A()5&@aaad##~2244 9U NN Q   44]CC(;&))+%
 
 
 & &//22"oom\JJ![)H4;+H)HI 
	 
	M)M	*) /#-$7	 	 	 	MM 		-00&+/8BOOd
 
 
 	
r9   )NNNNNNNN)r0   r1   r2   r    r6   rl   r   r   r   r4   
LongTensorr5   Tensorr   boolr   r   r   r7   r   r   re   rf   s   @r:   rh   rh      sR        !!!!n n n n n
  15BF1537+/59$(59R
 R
E,-R
 %-U->$?R
 !.	R

 u/0R
 "%R
   12R
 D>R
 !!12R
 +,R
 
u--	.R
 R
 R
 ^ R
 R
 R
 R
 R
r9   rh   c                   &     e Zd Z fdZddZ xZS )rQ   c                     t                                                       || _        t          j        t          j        | j        dz
  ||                    | _        d S )Nr   )rN   rl   rR   rm   	Parameterr4   emptyrT   )rX   rs   rR   ro   r[   s       r:   rl   zCsmCodebooksHead.__init__   sM    *l5;t/AA/E{T^#_#_``r9   Nc                    |-j         d         }| j        t          j        |                   n|dz
  }| j        |         fdt	          j         d                   D             t          j        d          S )Nr   c           	          g | ]:}t           j                            d d |d d f         |         j                  ;S N)rm   
functionallinearT).0codebook_idxcodebook_weightr(   s     r:   
<listcomp>z,CsmCodebooksHead.forward.<locals>.<listcomp>  sX     
 
 
 M  qqq,/A!BOT`DaDcdd
 
 
r9   r   dim)r   rT   r4   r   rS   stack)rX   r(   r|   
seq_lengthr   r   s    `   @r:   r   zCsmCodebooksHead.forward   s    !&,Q/J"k%,z*B*BCOO*Q.M"k-8O
 
 
 
 
 %o&;A&> ? ?
 
 
 Mq999r9   r   r0   r1   r2   rl   r   re   rf   s   @r:   rQ   rQ      sQ        a a a a a
       r9   rQ   a$  
    The CsmDepthDecoder Model transformer, with a [`CsmCodebooksHead`] on top,
    which can be seen a position-specific language modeling head, allowing to use a different linear layer for each codebook
    (e.g. position 0 is the first codebook and uses the first codebook head, etc.)
    c                   (    e Zd ZdZdZdZ fdZ	 	 	 	 ddej        de	e
         de	ej                 de	ej                 de	ej                 f
 fdZee	 	 	 	 	 	 	 	 	 	 dde	ej                 d
e	ej                 de	ej                 de	ej                 de	ee
eej                 f                  de	ej                 de	ej                 de	e         de	ej                 deeej        f         dee         deeef         fd                        Z xZS )CsmDepthDecoderForCausalLMNc                     t                                          |           | `t          |j        |j        |j                  | _        t          |          | _	        d S r   )
rN   rl   lm_headrQ   rs   rR   ro   codebooks_headrh   rJ   ru   s     r:   rl   z#CsmDepthDecoderForCausalLM.__init__  sQ       L.v/A6CWY_Yjkk)&11


r9   rv   r'   rx   rz   r|   c                      t                      j        |||||fi |}|d         d         dk    }|s|                    d           |                    d           |S )Nr|   r   rw   ry   )rN   prepare_inputs_for_generationpop)
rX   rv   r'   rx   rz   r|   r}   model_inputsis_first_generation_stepr[   s
            r:   r   z8CsmDepthDecoderForCausalLM.prepare_inputs_for_generation  s     =uww<~
 
Y_
 
 $00@#A!#D#I ' 	;9::: 	(((r9   r   rw   ry   labelsr{   logits_to_keepr}   r~   c                     | j         d||||||||	d|}|d         }t          |
t                    r)|
dk    rt          dd          }nt          |
 d          }n|
}|                     |dd|ddf         |	|	|         nd          }|                                }d}|:|dddf                                         } | j        d|d| j        j        |d|}t          |||j
        |j        |j                  S )	a  
        backbone_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, backbone_hidden_size)`, *optional*):
            The last hidden state of the backbone model. Such input is required when the first codebook token (the one generated by the backbone model)
            is provided in the `input_ids` argument.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        )rv   rw   rx   ry   r'   rz   r{   r|   r   r   N.)r&   r   ro   shift_labels)r%   r&   r'   r(   r)   r8   )rJ   rP   intslicer   
contiguousloss_functionrI   ro   r   r'   r(   r)   )rX   rv   rw   rx   ry   r'   rz   r   r{   r|   r   r}   outputsr(   slice_indicesr&   r%   r   s                     r:   r   z"CsmDepthDecoderForCausalLM.forward5  sr   2 $* 

'A)%+')

 

 

 

  
nc** 	+"" %a %~ot < <*M$$!!!]AAA-.Q_Qk}0M0Mqu
 
 ""$$!#qrr'?5577L%4% dt{7M\h lr D &#3!/)
 
 
 	
r9   NNNN)
NNNNNNNNNr   )r0   r1   r2   _tied_weights_keys_tp_plan_pp_planrl   r4   r   r   r   r5   r   r   r   r   r   listr   r   r   r   r7   r   r   re   rf   s   @r:   r   r     s        HH2 2 2 2 2 ,0595959 # "% !!12	
   12 !!12     ,  15BF1537KO59-1$(5934@
 @
E,-@
 %-U->$?@
 !.	@

 u/0@
 "%tE4E/F(F"GH@
   12@
 )*@
 D>@
 !!12@
 c5</0@
 +,@
 
u,,	-@
 @
 @
 ^ @
 @
 @
 @
 @
r9   r   c                   $     e Zd Z fdZd Z xZS )CsmBackboneModelEmbeddingsc                    t                                                       t          j        |j        |j        z  |j                  | _        |                     dt          j
        |j                  |j        z  d           d S )Naudio_tokens_offsetsF)
persistent)rN   rl   rm   rn   rR   ro   rs   embed_audio_tokensregister_bufferr4   r   ru   s     r:   rl   z#CsmBackboneModelEmbeddings.__init__{  s    "$,0DvGX0X[a[m"n"n"EL1E$F$FIZ$Zgl 	 	
 	
 	
 	
 	
r9   c                 l    |                      || j        z             }|                    d          }|S )Nr   r   )r   r   sum)rX   rv   r   s      r:   r   z"CsmBackboneModelEmbeddings.forward  s9    ..y4;T/TUU#''A'..r9   r   rf   s   @r:   r   r   z  sG        
 
 
 
 
      r9   r   c                   H     e Zd Z fdZee fd                        Z xZS )CsmBackboneModelc                 r    t                                          |           t          |          | _        d S r   )rN   rl   r   rq   ru   s     r:   rl   zCsmBackboneModel.__init__  s1       6v>>r9   c                 6     t                      j        di |S )a&  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length, num_codebooks) or (batch_size, sequence_length)`):
            1. (batch_size, sequence_length): corresponds to the input sequence prepared with the processor from the text prompt. Such input
            requires `input_values` to be provided so that audio can be encoded in codebook tokens and then merged with the text tokens.

            2. (batch_size, sequence_length, num_codebooks): codebook tokens generated during the autoregressive decoding. Such input is not meant to be used by end users.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        r8   )rN   r   )rX   super_kwargsr[   s     r:   r   zCsmBackboneModel.forward  s!     uww.....r9   )r0   r1   r2   rl   r   r   r   re   rf   s   @r:   r   r     sh        ? ? ? ? ? / / / / ^ / / / / /r9   r   z
    The Csm model consists of two llama-like auto-regressive transformer models: a backbone model that predicts the first codebook token and a depth decoder that predicts the other codebook tokens.
    c                   
    e Zd ZddgZ fdZd Zd Zd Ze fd            Z	 fdZ
	 	 	 	 dd
eej                 deej                 deej                 deej                 deej                 f
dZ	 	 	 	 dd
ej        dee         deej                 deej                 deej                 f
 fdZee	 	 	 	 	 	 	 	 	 	 	 dd
eej                 deej                 deej                 deej                 deej                 deeeeej                 f                  deej                 deej                 dee         deej                 deeej        f         dee         deeef         fd                        Z xZS )CsmForConditionalGenerationz5backbone_model.embed_tokens.embed_audio_tokens.weightz'depth_decoder.model.embed_tokens.weightc                    t                                          |           |j        | _        t          j        |j        |j        d          | _        t          j        |j        |j                  | _	        t                              |          | _        t                              |j                  | _        t!          j        |j                  | _        |                                  d S rj   )rN   rl   ro   rm   rr   rs   r   rn   text_vocab_sizeembed_text_tokensr   _from_configbackbone_modelr   depth_decoder_configdepth_decoderr   from_configcodec_configcodec_model	post_initru   s     r:   rl   z$CsmForConditionalGeneration.__init__  s        +y!3V5FUSSS!#f.DfFX!Y!Y.;;FCC7DDVE`aa$01DEEr9   c                     | j         j        S r   r   rq   rX   s    r:   get_input_embeddingsz0CsmForConditionalGeneration.get_input_embeddings  s    "//r9   c                     || j         _        d S r   r   )rX   values     r:   set_input_embeddingsz0CsmForConditionalGeneration.set_input_embeddings  s    +0(((r9   c                     | j         j        r6|                     | j        j        j        | j        j        j                   d S d S r   )rI   tie_codebooks_embeddings_tie_or_clone_weightsr   rq   r   r   rJ   r   s    r:   _tie_weightsz(CsmForConditionalGeneration._tie_weights  sS    ;/ 	&&#0C"(5    	 	r9   c                    |                     dd          r t                      j        |i |\  }}n t                      j        |i |}dt                    fdt	          |j                                                  D             }t	          |j        j                                      ddi|           |D ]}t          |j        |z              d|v r||fS |S )Noutput_loading_infoFdepth_decoder_c                 V    i | ]%\  }}|                               |d          |&S r   )
startswith)r   attrr   prefix
prefix_lens      r:   
<dictcomp>z?CsmForConditionalGeneration.from_pretrained.<locals>.<dictcomp>  sJ     
 
 
ev&&
u
 
 
r9   _from_model_config)
getrN   from_pretrainedlenvarsgeneration_configitemsr   updatedelattr)
clsargsr}   rJ   loading_infodepth_decoder_attrsr   r   r   r[   s
          @@r:   r   z+CsmForConditionalGeneration.from_pretrained  s'   ::+U33 	="9%''"94"J6"J"JE<<+EGG+T<V<<E "[[

 
 
 
 
#E$;<<BBDD
 
 
 	U 233::<PRW;o[n;oppp ( 	< 	<DE+Vd];;;; F**,&&Lr9   c                    d}| j         j                                        }|                    dd            |                                D ]\  }}t          | j        ||z   |            t                      j        |i | d S )Nr   transformers_version)r   r   to_diff_dictr   r  setattrrN   save_pretrained)rX   r  r}   r   r  r   r   r[   s          r:   r  z+CsmForConditionalGeneration.save_pretrained  s    !"0BOOQQ 6===.4466 	B 	BKD%D*FTM5AAAA000000r9   Nrv   input_valuesinput_values_cutoffsr   r~   c                    |                      |          }|Lt          j                            |d          }||dk                                             }||dk             }t          j        |                                |j                  	                    t          |          d          }||                    d          k     }t          j                    5  g }t          ||          D ]\  }	}
|
|
dk             }
t          |
j        d         dz
            D ]}|
|         }|
|dz            }|	d||f         }| j                            |                    d                    }|j                            dd          }|                    |d                    t          d |D                       t          j        fd	|D                       }| j                            |          }ddd           n# 1 swxY w Y   | j        j        }||k    }| j                            |          }||         ||<   t          j        dd| j        j        f|j        t
          j        
          | j        j        z  }| j                            |                              d          }|| j        j         k    }|!                    |"                                d          ||<   |v|                    d          !                    dd| j        j                  }||         ||<   |||<   |dk    #                    d          }d||d         |d         ddf<   |}||dS )a  
        Merges the input_ids and input_values to produce a single inputs_embeds tensor:
        1 - Infers the codec model on the input_values to retrieve codebook token.
        2 - Embeds codebook tokens and places them at the correct positions in the inputs_embeds tensor.
        3 - If labels are provided, expands them to match codebook dimensions and position the target codebook tokens in the inputs_embeds tensor.

        Args:
            input_ids (`torch.Tensor` of shape `(batch_size, sequence_length)`):
                The input ids to embed.
            input_values (`torch.Tensor` of shape `(batch_size, channels, audio_sequence_length)`):
                The audio input values to embed.
            input_values_cutoffs (`torch.Tensor` of shape `(batch_size, max_num_audio)`):
                The cutoffs of the audio input values relative to its batch index, padded with -1 when no audio.
        Nr   r   r   r   r   .c              3   0   K   | ]}|j         d          V  dS )r   N)r   )r   els     r:   	<genexpr>zQCsmForConditionalGeneration._merge_input_ids_with_input_values.<locals>.<genexpr>  s(      &O&Orrx{&O&O&O&O&O&Or9   c                 t    g | ]4}t           j                            |d d d |j        d          z
  f          5S )r   )rm   r   padr   )r   r  max_audio_framess     r:   r   zRCsmForConditionalGeneration._merge_input_ids_with_input_values.<locals>.<listcomp>  sB    rrrZ\R]&&rAq!5EQR5S+TUUrrrr9   )r   dtypeiTas_tuple)rz   r   )$r   rm   r   r  diffr4   r   maxr   expandr   r   no_gradziprS   r   r   encodeaudio_codes	transposeappendr   get_audio_codes_maskrI   audio_token_idr   rq   onesrR   longcodebook_eos_token_idsqueezeaudio_eos_token_idrepeatr   nonzero)rX   rv   r  r  r   rz   audio_lengthsinput_values_maskaudio_tokens_listbatch_input_valuesbatch_input_values_cutoffsrZ   	start_idxend_idxaudio_batchcodec_outputscodebook_idsbatched_audio_token_idsaudio_codes_maskr&  audio_token_maskaudio_embedsaudio_eos_frame_idsaudio_eos_embedsaudio_eos_token_masklabels_expanded depth_decoder_ignore_frames_idxsr  s                              @r:   "_merge_input_ids_with_input_valuesz>CsmForConditionalGeneration._merge_input_ids_with_input_values  s   * ..y99##%=#4#45I6#R#R 01E1JKPPRRM)-!*;<M %-A-E-E-G-GP\Pc d d d k kM""B! ! !2M4K4KA4N4N N
  \ \$&!FI,XlFmFm B BB&(B1KLfjkLk1l."#=#CA#F#JKK B B$>q$A	"<QU"C&8i>O9O&P(,(8(?(?@U@UVW@X@X(Y(Y'4'@'J'J1b'Q'Q)00aAAAAB $'&O&O=N&O&O&O#O#O */+rrrr`qrrr+ +' $(#3#H#HIZ#[#[ !\ \ \ \ \ \ \ \ \ \ \ \ \ \ \$ "[7N(N:.;;<STTL.:;K.LM*+ 
Aq$+";<YEU]b]ghhh+34    $2??@STT\\]^__#,0N#N 2B2I2IJ^JbJbJdJdfg2h2hM./ !"("2"22"6"6"="=aDKD]"^"^4KL\4] 018K 454:dN3K3KUY3K3Z3Z0pt @ CEefgEhjkjljl lm(!.&AAAs   DHHHr'   rx   rz   r|   c           	      x    t                      j        d	|||||d|}||j        dk    r|                    d          w|                     ||                    d          |                    d          |                    d                    }|                    |d         |d         d d           |S )
N)rv   r'   rx   rz   r|   r   rz   r  r  r   )rv   r  r  r   )rz   r   rv   r8   )rN   r   ndimr   rA  r  )
rX   rv   r'   rx   rz   r|   r}   r   merged_inputsr[   s
            r:   r   z9CsmForConditionalGeneration.prepare_inputs_for_generation9  s     =uww< 
+)')
 
 
 
  Y^q%8%8\=M=Mo=^=^=f CC##ZZ77%+ZZ0F%G%Gzz(++	 D  M "/"@MZbLcrvww   r9   r   ry   r{   r   r}   c                    |5|j         dk    r*|                     ||||          }|d         }|d         }d} | j        d||||||	|
d|}|d         }t          |t                    rt          | d          n|}|                     |dd|ddf                   }d}d}d}d}||dddddf         } | j        d||| j        j	        d|}|ddddddf         d	k    
                    d
           }||         dd| j        j        dz
  f         }t          j                            |dd          }|                    d          }||d         |d         dz
  ddf         }||         } | j        d|||	d|d|}|j        }||z   }t%          |||||j        |j        |j        ||j        nd||j        nd||j        nd||j        nd          S )a  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length, num_codebooks) or (batch_size, sequence_length)`):
            1. (batch_size, sequence_length): corresponds to the input sequence prepared with the processor from the text prompt. Such input
            requires `input_values` to be provided so that audio can be encoded in codebook tokens and then merged with the text tokens.

            2. (batch_size, sequence_length, num_codebooks): codebook tokens generated during the autoregressive decoding. Such input is not meant to be used by end users.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        input_values_cutoffs (`torch.Tensor` of shape `(batch_size, max_num_audio)`, *optional*):
            Specify the end positions of audio segments within each batch entry, relative to the concatenated audio input.
            If a batch entry has fewer segments than the maximum, it is padded with -1. For example, in a batch of 2 sequences
            where the first contains 2 audio segments of length l1, and the second contains 1 audio segment of length l2,
            the input_values_cutoffs would be: [[l1, 2 * l1], [l2, -1]].
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[config.audio_token_id, -100, -101]`.
            Requires targeted `input_values` to be provided as audio tokens will be inferred from it using the `codec_model`.
            - `config.audio_token_id` indicates an audio frames (considering sequence length elements as frames)
            - `-100` will be ignored in the loss computation
            - `-101` indicates the audio frame will be used only for the backbone model (using the first codebook token as labels)

            Such labels can be prepared using `output_labels=True` when calling [`CsmProcessor`].
        logits_to_keep (`int` or `torch.Tensor`, *optional*):
            Kept for compatibility. Does not support another value than:
            1. `0`, which is equivalent to keeping all logits, used in the training regime
            2. `1`, which is equivalent to keeping only the last logit, used in the generation regime

        Example:

        ```python
        >>> import torch
        >>> from transformers import CsmForConditionalGeneration, AutoProcessor
        >>> from datasets import load_dataset, Audio

        >>> model_id = "sesame/csm-1b"
        >>> torch_device = "cuda" if torch.cuda.is_available() else "cpu"

        >>> processor = AutoProcessor.from_pretrained(model_id)

        >>> ds = load_dataset("hf-internal-testing/dailytalk-dummy", split="train")
        >>> # ensure the audio is 24kHz
        >>> ds = ds.cast_column("audio", Audio(sampling_rate=24000))

        >>> conversation = []
        >>> # prepare a conversation with text and corresponding audio
        >>> for text, audio, speaker_id in zip(ds[:4]["text"], ds[:4]["audio"], ds[:4]["speaker_id"]):
        ...     conversation.append(
        ...         {
        ...             "role": f"{speaker_id}",
        ...             "content": [{"type": "text", "text": text}, {"type": "audio", "path": audio["array"]}],
        ...         }
        ...     )

        >>> inputs = processor.apply_chat_template(
        ...     conversation,
        ...     tokenize=True,
        ...     return_dict=True,
        ...     output_labels=True,
        ... ).to(torch_device)

        >>> model = CsmForConditionalGeneration.from_pretrained(model_id, device_map=torch_device)
        >>> output = model(**inputs)
        >>> output.loss.backward()
        ```Nr   rz   r   )rv   rx   ry   r'   rz   r{   r|   r   )r&   r   ro   r   r  r  r   .r  )r   Tr  )rv   rw   r{   return_dictr   )r%   r/   r*   r&   r'   r(   r)   r+   r,   r-   r.   r8   )rC  rA  r   rP   r   r   r   r   rI   ro   allrR   rm   r   r  r-  r   r%   r$   r'   r(   r)   r&   )rX   rv   r  rx   r  ry   r'   rz   r   r{   r|   r   r}   rD  backbone_outputsbackbone_hidden_statesr   backbone_logitsr%   r/   r*   depth_decoder_outputsbackbone_labels
train_maskdepth_decoder_input_ids
train_idxsbackbone_last_hidden_statesdepth_decoder_labelss                               r:   r   z#CsmForConditionalGeneration.forwardX  s   f  Y^q%8%8 CC<)=v M */:M"8,FI.4. 	
)%+')	
 	
 	
 	
 "2!!48B>SV8W8Wk~ot444]k,,'=aaaPQPQPQ>Q'RSS! $$QQQ1WoO.D. &4;Ka ek M "!!!QQQ(+t388R8@@@J&,Z&8>]@Y\]@]>]9]&^#&(m&7&78OQW_`&7&a&a##++T+::J*@APZ[\P]`aPacdcdcdAd*e'#)*#5 $6D$6 %1+F# +% % % %! "7!; #55D '1",<*8'2AVAb!6!=!=hl$0 +@*O*O$0 )>(K(KI^Ij%:%E%Ept
 
 
 	
r9   r   )NNNNNNNNNNr   )r0   r1   r2   r   rl   r   r   r   classmethodr   r  r   r4   r   rA  r   r   r5   r   r   r   r   r   r   r   r   r   r7   r$   r   re   rf   s   @r:   r   r     s!        	@1
    0 0 01 1 1       [41 1 1 1 1 -1/37;)-PB PBEL)PB u|,PB 'u|4	PB
 &PB 
%,	PB PB PB PBj ,0595959 # "% !!12	
   12 !!12     >  15/3157;37KO59-1$(5934[
 [
E,-[
 u|,[
 !.	[

 'u|4[
 u/0[
 "%tE4E/F(F"GH[
   12[
 )*[
 D>[
 !!12[
 c5</0[
 +,[
 
u''	([
 [
 [
 ^ [
 [
 [
 [
 [
r9   r   )rH   r   rh   r   r   )?dataclassesr   typingr   r   r4   torch.nnrm   transformers.utils.genericr   cache_utilsr   r	   
generationr
   masking_utilsr   modeling_outputsr   r   modeling_utilsr   processing_utilsr   utilsr   r   r   r   autor   llama.modeling_llamar   r   r   r   r   r   r   r   configuration_csmr   r    generation_csmr!   
get_loggerr0   r   r$   r<   r@   rB   rD   rF   rH   rh   ModulerQ   r   r   r   r   __all__r8   r9   r:   <module>re     s    " ! ! ! ! ! " " " " " " " "        9 9 9 9 9 9 . . . . . . . . ) ) ) ) ) ) / / / / / / O O O O O O O O - - - - - - & & & & & & K K K K K K K K K K K K      	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 @ ? ? ? ? ? ? ? . . . . . . 
	H	%	%   
'6 '6 '6 '6 '6 '6 '6  '6V	 	 	 	 	 	 	 		 	 	 	 	- 	 	 		 	 	 	 	X 	 	 		 	 	 	 	> 	 	 		 	 	 	 	' 	 	 	   
 [ [ [ [ [ [ [  [4 \
 \
 \
 \
 \
:'9 \
 \
 \
~    ry   .   c
 c
 c
 c
 c
!1? c
 c
 c
L        / / / / /z / / /.   
P
 P
 P
 P
 P
"46H P
 P
 
P
f
  r9   