
     `i@                     $   d dl Z d dlmZ d dlmZmZmZ d dlZddl	m
Z
mZ  e            rd dlZ e
            rd dlZddlmZmZ ddlmZ ddlmZmZmZmZ dd	lmZmZ  G d
 ded          Z G d ded          Z G d de          ZdgZdS )    N)Path)AnyOptionalUnion   )is_soundfile_availableis_torch_available)
AudioInputmake_list_of_audio)BatchFeature)AudioKwargsProcessingKwargsProcessorMixinUnpack)PreTokenizedInput	TextInputc                   6    e Zd ZU eeeef                  ed<   dS )CsmAudioKwargsencoded_length_kwargsN)__name__
__module____qualname__r   dictstrr   __annotations__     z/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/csm/processing_csm.pyr   r   %   s+         #DcN333333r   r   F)totalc                   N    e Zd ZU eed<   ddddg dg dg ddd	d
dddidZdS )CsmProcessorKwargsaudio_kwargsTleftF)paddingpadding_sideadd_special_tokens)   r         r   r(   
   r   r(      r   r(      r      )r(   r(   r(   r-   r(   r(      r(   r(      r(   r(   r)   r(      )r(   r(   r(   r(   r(   r(   r(   r(   r(   r(   r(   r(   r(   r(   r(   )kernel_sizesstrides	dilationsuse_causal_convi]  )r   sampling_ratereturn_tensorspt)text_kwargsr"   common_kwargsN)r   r   r   r   r   	_defaultsr   r   r   r!   r!   )   s              ""'
 
 !Q P PHHHJJJ#'	& & #
 
 +D1 IIIr   r!   c                   >    e Zd ZdZddgZdZdZ	 d fd	Zedd            Z	d	e
d
eeeeeeef                  f         dee         fdZ	 	 	 ddeeeeee         ee         f                  d	ee
         dee         dee         dee         f
dZed             Z xZS )CsmProcessora  
    Constructs a Csm processor which wraps [`EncodecFeatureExtractor`] and
    [`PretrainedTokenizerFast`] into a single processor that inherits both the audio feature extraction and
    tokenizer functionalities. See the [`~CsmProcessor.__call__`] for more
    information.
    The preferred way of passing kwargs is as a dictionary per modality, see usage example below.
        ```python
        from transformers import CsmProcessor
        from datasets import load_dataset

        ds = load_dataset("hf-internal-testing/dailytalk-dummy", split="train")
        audio = ds[0]["audio"]["array"]

        processor = CsmProcessor.from_pretrained("sesame/csm-1b")

        processor(
            text=["<|begin_of_text|>[0]What are you working on?<|end_of_text|><|AUDIO|><|audio_eos|><|begin_of_text|>[1]I'm figuring out my budget.<|end_of_text|>"],
            audio=audio,
            text_kwargs = {"padding": False},
            audio_kwargs = {"sampling_rate": 16000},
            common_kwargs = {"return_tensors": "pt"},
        )
        # this should error out because EncodecFeatureExtractor expects a 24kHz audio :)
        ```

    Args:
        feature_extractor ([`EncodecFeatureExtractor`]):
            The feature extractor is a required input.
        tokenizer ([`PreTrainedTokenizer`, `PreTrainedTokenizerFast`]):
            The tokenizer is a required input.
        chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
            in a chat into a tokenizable string.

    feature_extractor	tokenizerEncodecFeatureExtractorPreTrainedTokenizerFastNc                    t          |d          s'd| _        |                    | j                  | _        n|j        | _        |j        | _        t          |d          s'd| _        |                    | j                  | _        n|j        | _        |j        | _        t                                          |||           d S )Naudio_tokenz	<|AUDIO|>audio_eos_tokenz<|audio_eos|>)chat_template)hasattrrB   convert_tokens_to_idsaudio_token_idrC   audio_eos_token_idsuper__init__)selfr=   r>   rD   	__class__s       r   rJ   zCsmProcessor.__init__f   s     y-00 	;*D"+"A"A$BR"S"SD(4D"+":Dy"344 	C#2D &/&E&EdFZ&[&[D###,#<D &/&BD#*I]SSSSSr   c                 <   | }|||||S t          |||          D ]}\  }}}|dz
  |z  dz   }	||z
  }
|
dz  }|
|z
  }||	z
  |
z   |z  dz   }t          j        |          dz
  }||z  |z   |
z
  }||z
  }|r|
}|}n||z   }||z   |z   }|||dz
  z  z
  dz
  |z  dz   }~|S )a|  
        Compute the length of the encoded audio sequence.

        Args:
            audio_length (int): The length of the audio sequence.
            kernel_sizes (list[int]): The kernel sizes for the convolutional layers.
            strides (list[int]): The strides for the convolutional layers.
            use_causal_conv (bool): Whether to use causal convolutions.
        Nr(   r0   )zipmathceil)audio_lengthr1   r2   r3   r4   
cur_lengthkernel_sizestridedilationeffective_kernel_sizepadding_totalpadding_rightpadding_leftn_framesideal_lengthextra_paddings                   r   _get_encoded_lengthz CsmProcessor._get_encoded_length|   s    "
7?i6G?Kb-0w	-R-R 	W 	W)K%01_$@1$D!'&0M)Q.M(=8L"%::]JfTWXXHy**Q.H#f,{:]JL(:5M >, - - =#l2]BJ$x;?'CCaGFRUVVJJr   audiosaving_pathkwargsc                    t                      st          d          t          |          }t          |t          t
          f          r|g}nDt          |t          t          f          rt          d |D                       st          d          t          |          t          |          k    rt          d           | j        t          fi |}|d         }|d         }t          ||          D ]m\  }}t          |t          j                  r8|                                                                                                }t'          j        |||           nd S )Nz/Please install `soundfile` to save audio files.c              3   N   K   | ] }t          |t          t          f          V  !d S N)
isinstancer   r   ).0ps     r   	<genexpr>z*CsmProcessor.save_audio.<locals>.<genexpr>   s3      @q@q`aAPSUY{A[A[@q@q@q@q@q@qr   zAInvalid input path. Please provide a string, or a list of stringsz5The number of audio and saving paths must be the samer"   r5   )r   ImportErrorr   rd   r   r   listtupleall
ValueErrorlen_merge_kwargsr!   rN   torchTensorcpufloatnumpysfwrite)	rK   r^   r_   r`   output_kwargsr"   r5   audio_valuerf   s	            r   
save_audiozCsmProcessor.save_audio   su    &'' 	QOPPP #5)) kC;// 	b&-KK[4-88 	bS@q@qep@q@q@q=q=q 	b`aaau::[))))TUUU**
 

 
 %^4$_5!%55 	4 	4NK+u|44 @)oo//5577==??HQ]3333	4 	4r   F      ?textoutput_labelsdepth_decoder_labels_ratioc           
      	       j         t          fd j        j        i|}|d         }|d         }|d         }	|	                    dd          }
|
dk    rt           j        j         d          t          |t                    r|g}nDt          |t          t          f          rt          d	 |D                       st          d
           fd|D             }d}|t          |          }t          |          }t          |          dk    r:|t          |          k    r'|t          d          t          d| d| d          ||                    di            fd|D             }|                                }g }|D ]}g } j        |v rY|                    d          } j        |z  }|                    |           |                     j        dd          } j        |v Yd|v r.|                    d|                    d          d          }d|v .|                    |           |}  j        |fi |}i }|                    |           ||                    dd           g g }}d}|D ]}|dk    rP|                    t+          j        d                     |                    t/          j        dg                     X|                    t+          j        d ||||z            D             d                     |                    t/          j        d ||||z            D                                           d                     ||z  }  j        |fi |}|                    dd           |                    |           t9          d |D                         fd|D             }t/          j        |d          |d<   |r|d          j        k                                    }|j         d         }|d k    r8t/          j!        |          dtE          |d|z
  z                     }||         }n|}t/          j#        |d          j        k    |d          j$        k    z  |d         d!          }d"||dddf         |dddf         f<   ||d#<   tK          ||
$          S )%a  
        Main method to prepare text(s) and audio to be fed as input to the model. This method forwards the `text`
        arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizerFast.__call__`] to encode
        the text. To prepare the audio, this method forwards the `audio` arguments to
        EncodecFeatureExtractor's [`~EncodecFeatureExtractor.__call__`]. Please refer
        to the docstring of the above two methods for more information.

        Args:
            audio (`np.ndarray`, `torch.Tensor`, `list[np.ndarray]`, `list[torch.Tensor]`):
                The audio or batch of audio to be prepared. Each audio can be a NumPy array or PyTorch
                tensor.
            text (`str`, `list[str]`, `list[list[str]]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            output_labels (bool, *optional*, default=False):
                Whether to return labels for training. Indices will be in `[config.audio_token_id, -100, -101]`.
                - `config.audio_token_id` indicates an audio frame (considering sequence length elements as frames)
                - `-100` will be ignored in the loss computation
                - `-101` indicates the audio frame will be used only for the backbone model (using the first codebook token as labels)
            depth_decoder_labels_ratio (float, *optional*, default=1.0):
                The ratio of audio frames to keep for the depth decoder labels.
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:
                    - `'tf'`: Return TensorFlow `tf.constant` objects.
                    - `'pt'`: Return PyTorch `torch.Tensor` objects.
                    - `'np'`: Return NumPy `np.ndarray` objects.
                    - `'jax'`: Return JAX `jnp.ndarray` objects.
        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
            - **input_values** -- List of audio values to be fed to a model. Returned when `audio` is not `None`.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
            - **labels** -- List of labels for the audio frames. Returned when `output_labels=True`.
        tokenizer_init_kwargsr8   r"   r9   r6   Nr7   z% only supports `return_tensors='pt'`.c              3   @   K   | ]}t          |t                    V  d S rc   )rd   r   )re   ts     r   rg   z(CsmProcessor.__call__.<locals>.<genexpr>  s-      9[9[QR*Q:L:L9[9[9[9[9[9[r   zAInvalid input text. Please provide a string, or a list of stringsc                 D    g | ]}|                     j                  S r   )countrB   )re   r   rK   s     r   
<listcomp>z)CsmProcessor.__call__.<locals>.<listcomp>  s(    CCC1774#344CCCr   r   z@No audio were provided, but there are audio tokens in the promptz)The number of audio tokens in each text (z7) should be the same as the number of provided audios (z).r   c                 B    g | ]} j         |j        d          fi S )r]   shape)re   audio_arrayr   rK   s     r   r   z)CsmProcessor.__call__.<locals>.<listcomp>  sE     % % %]h(():2)>XXBWXX% % %r   z<placeholder>r(   return_attention_maskr   c                     g | ]D}t          |t          j                  r&|                                                                n|ES r   )rd   ro   rp   rq   rs   re   els     r   r   z)CsmProcessor.__call__.<locals>.<listcomp>=  sQ       $& 5?r5<4P4P X 0 0 0VX  r   )axisc                 (    g | ]}|j         d          S r   r   r   s     r   r   z)CsmProcessor.__call__.<locals>.<listcomp>E  s    %^%^%^rbhrl%^%^%^r   )dimpadding_maskc              3   0   K   | ]}|j         d          V  dS )r   Nr   )re   cut_idxss     r   rg   z(CsmProcessor.__call__.<locals>.<genexpr>N  s)      RR(.,RRRRRRr   c           	      ~    g | ]9}t           j        j                            |d |j        d         z
  fd          :S )r   r   )value)ro   nn
functionalpadr   )re   r   max_lens     r   r   z)CsmProcessor.__call__.<locals>.<listcomp>O  sU     $ $ $ #''1gr@R6R2S[]'^^$ $ $r   input_values_cutoffs	input_idsry   iilabels)datatensor_type)&rn   r!   r>   init_kwargspoprl   rL   r   rd   r   ri   rj   rk   r   rm   sumcopyrB   appendreplaceupdatenpzerosro   tensorconcatenatecumsumr=   maxstackrG   nonzeror   randpermintwhererH   r   )!rK   rz   r^   r{   r|   r`   rv   r8   r"   r9   r6   n_audio_in_textn_audionum_audio_tokens_listnum_audio_tokens_list_copyexpanded_textsamplereplace_strnum_audio_tokensexpanded_audio_tokenencodingr   concatenated_audior   offsetaudio_inputsaudio_frame_idxsn_audio_frames	rand_idxsskip_frames_idxsr   r   r   s!   `                              @@r   __call__zCsmProcessor.__call__   s   ^ +*
 
"&."<
 
 
 $M2$^4%o6&**+;TBBT!! 7^^^___dC   	b6DDTD%=11 	bc9[9[VZ9[9[9[6[6[ 	b`aaaCCCCdCCC&u--E%jjG!##33G3G(G(G} !cddd > > >29> > >  
 $0$4$45Lb$Q$Q!% % % % %lq% % %! *?)C)C)E)E& M - - &&00'A'E'Ea'H'H$+/+;>N+N(&&';<<<#^^D,<oqQQF &&00 &//#^^O[__Q=O=OQRSSF &//$$V,,,, D!4>$66+66H4d;;;792 4F* & &a<<&--bhqkk:::(//bT0B0BCCCC&-- */'9I0I*J   "$     )//%^%^U6FU\L\C\=]%^%^%^__ffkmfnn   g%FF1412DUUUUL^T222KK%%% RR=QRRRRRG$ $ $ $ 4$ $ $  ,1;7KQR+S+S+SD'( 	$ $[ 1T5H HQQSS-3A6N)S00!N>::;sSSTWqSqAr=s=s;st	#3I#>  #3 [k"d&99d;>OSWSj>jk[! F
 FJF#AAAqD)+;AAAqD+AAB#DN>BBBBr   c                 v    | j         j        }| j        j        }d |D             }t          ||z   dgz             S )Nc                     g | ]
}|d k    |S )r   r   )re   names     r   r   z2CsmProcessor.model_input_names.<locals>.<listcomp>q  s$    (r(r(r$[_cq[q[q[q[q[qr   r   )r>   model_input_namesr=   ri   )rK   tokenizer_input_namesfeature_extractor_input_namess      r   r   zCsmProcessor.model_input_namesj  sO     $ @(,(>(P% )s(r:W(r(r(r%),IIMcLddeeer   rc   )NNNN)NFry   )r   r   r   __doc__
attributesfeature_extractor_classtokenizer_classrJ   staticmethodr]   r
   r   r   r   ri   r   r!   rx   r   r   r   boolrr   r   propertyr   __classcell__)rL   s   @r   r<   r<   >   s       ! !F &{3J7/O 	T T T T T T, # # # \#J 4 4 3d5d+;&<<= 4 +,	 4  4  4  4J '+(-69dC dCuY(94	?DQbLccdedC 
#dC  ~	dC
 %-UOdC +,dC dC dC dCL f f Xf f f f fr   r<   ) rO   pathlibr   typingr   r   r   rs   r   utilsr   r	   ro   	soundfilert   audio_utilsr
   r   feature_extraction_utilsr   processing_utilsr   r   r   r   tokenization_utils_baser   r   r   r!   r<   __all__r   r   r   <module>r      s           ' ' ' ' ' ' ' ' ' '     ? ? ? ? ? ? ? ?  LLL  9 9 9 9 9 9 9 9 4 4 4 4 4 4 U U U U U U U U U U U U C C C C C C C C4 4 4 4 4[ 4 4 4 4    )    *tf tf tf tf tf> tf tf tfn	 
r   