
     `iN                        d Z ddlZddlmZ ddlmZmZ ddlmZm	Z	 ddl
mZ ddlmZmZmZmZ dd	lmZmZ  e            rddlZ e            rddlZ G d
 ded          Z G d ded          Z G d de          ZdgZdS )zProcessor class for Dia    N)Path)OptionalUnion   )
AudioInputmake_list_of_audio)BatchFeature)AudioKwargsProcessingKwargsProcessorMixinUnpack)is_soundfile_availableis_torch_availablec                   N    e Zd ZU eed<   eed<   eed<   ee         ed<   eed<   dS )DiaAudioKwargsbos_token_ideos_token_idpad_token_iddelay_pattern
generationN)__name__
__module____qualname__int__annotations__listbool     z/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/dia/processing_dia.pyr   r   "   sR         9r   r   F)totalc                   D    e Zd ZU eed<   dddddddg d	dd
dddidZdS )DiaProcessorKwargsaudio_kwargsTrightF)paddingpadding_sideadd_special_tokensi   i  i  )	r      	   
                  iD  )r   r   r   r   r   sampling_ratereturn_tensorspt)text_kwargsr$   common_kwargsN)r   r   r   r   r   	_defaultsr   r   r    r#   r#   *   sk              #"'
 
 !  >>>"
 
 +D1 IIIr   r#   c                       e Zd ZdZdZdZdZ fdZ	 	 d"dee	e
e	         f         d	ee         d
ee         dee         fdZ	 d#dddee         dee         de
d         fdZ	 d#dddee         dee         ddfdZdddee         defdZd	edee	ee
ee	ef                  f         dee         fdZe	 d$dededede
e         deded         fd            Zed	ddeded ed         ddf
d!            Z xZS )%DiaProcessora  
    Constructs a Dia processor which wraps a [`DiaFeatureExtractor`], [`DiaTokenizer`], and a [`DacModel`] into
    a single processor. It inherits, the audio feature extraction, tokenizer, and audio encode/decode functio-
    nalities. See [`~DiaProcessor.__call__`], [`~DiaProcessor.encode`], and [`~DiaProcessor.decode`] for more
    information.

    Args:
        feature_extractor (`DiaFeatureExtractor`):
            An instance of [`DiaFeatureExtractor`]. The feature extractor is a required input.
        tokenizer (`DiaTokenizer`):
            An instance of [`DiaTokenizer`]. The tokenizer is a required input.
        audio_tokenizer (`DacModel`):
            An instance of [`DacModel`] used to encode/decode audio into/from codebooks. It is is a required input.
    DiaFeatureExtractorDiaTokenizerDacModelc                 P    t                                          |||           d S )N)audio_tokenizer)super__init__)selffeature_extractor	tokenizerr=   	__class__s       r    r?   zDiaProcessor.__init__R   s)    *IWWWWWr   NFtextaudiooutput_labelskwargsc           
      *   t                      st          d          |t          d           | j        t          fi |}|d         }|d         }|d         }|                    dd          }	|	dk    rt          | j        j         d	          i }
t          |t                    r|g}nDt          |t          t          f          rt          d
 |D                       st          d           | j        |fi |}|
                    |           |                    dd          }|                    dd          }|                    dd          }|                    dd          }|                    dd          }||||t          d          |r|rt          d| d| d          |
d         j        d         }t          |          }t!          |          }|Ht#          |          } | j        |fi |}t'          j        | j        j        j                  }|d         d         j        d         |z  }g }g }t1          |d         |d                   D ]\  }}| j        j        }t'          j        |                    d          |z            |z  }||z  }||z
  }t9          j                    5  |ddd|f                             | j        j                  }| j                             |          j!        "                    dd          }ddd           n# 1 swxY w Y   |s(t8          j#        j$        %                    |dd |!          }t8          j#        j$        %                    |dd|dz   dddfd |!          }|dz   |z   }||rdndz  }t9          j&        dg|z  dg|z  z   t8          j'        "          dddf         } |(                    |           |(                    |            t9          j)        |d          }t9          j)        |d          }n[|rJt9          j*        |d|f|t8          j'        "          }t9          j+        |d|z   ft8          j'        #          }nt          d$          ||j        d         k    r!t          d%| d&|j        d          d'          |j        d         }!|!|z
  }"| ,                    ||!||d()          }#t9          j*        ||!|f|t8          j-        *          }$||$ddd|"f<   | .                    |$|||#+          }%|
                    |%|d,           |r|
d-         /                                ddddf         }&d.|&|&|k    <   d.|&|&|k    <   |&"                    dd          0                    ||z  d          1                                '                                |
d/<   |
d-         ddddf         |
d-<   |
d0         ddddf         |
d0<   te          |
|	1          S )2a  
        Main method to prepare text(s) and audio to be fed as input to the model. The `audio` argument is
        forwarded to the DiaFeatureExtractor's [`~DiaFeatureExtractor.__call__`] and subsequently to the
        DacModel's [`~DacModel.encode`]. The `text` argument to [`~DiaTokenizer.__call__`]. Please refer
        to the docstring of the above methods for more information.
        zThe `DiaProcessor` relies on the `audio_tokenizer` which requires `torch` but we couldn't find it in your environment. You can install torch via `pip install torch`.Nz0You need to specify the `text` input to process.r4   r$   r5   r2   r3   z% only supports `return_tensors='pt'`.c              3   @   K   | ]}t          |t                    V  d S N)
isinstancestr).0ts     r    	<genexpr>z(DiaProcessor.__call__.<locals>.<genexpr>}   s-      9[9[QR*Q:L:L9[9[9[9[9[9[r   zAInvalid input text. Please provide a string, or a list of stringsr   r   r   r   r   TzTo enable processing for Dia, we need the `bos_token_id`, `eos_token_id`, `pad_token_id`, and `delay_pattern`. You may have accidentally overwritten one of those.z9Labels with `generation` is incompatible, got generation=z, output_labels=.	input_idsr   padding_maskinput_valuesdim.      )r   r   r   rW   r   r   constant)padmodevaluedtype)sizer^   z;If you try to train, you should provide audio data as well.zNNeed the same amount of samples for both text and audio, but got text samples=z and audio samples = z	 instead.Fbszseq_lennum_channelsr   revert)
fill_valuer^   rE   r   r   precomputed_idx)decoder_input_idsdecoder_attention_maskrh   ilabelsri   )datatensor_type)3r   
ValueError_merge_kwargsr#   poprC   r   rK   rL   r   tupleallrB   updateshapelenmaxr   rA   mathprodr=   configdownsampling_ratioszip
hop_lengthceilsumtorchno_gradtodeviceencodeaudio_codes	transposenn
functionalrZ   tensorlongappendcatfullonesbuild_indicesr   apply_audio_delayclonereshape
contiguousr	   )'r@   rD   rE   rF   rG   output_kwargsr4   r$   r5   r2   rk   	encodingsr   audio_bos_token_idaudio_eos_token_idaudio_pad_token_idr   
batch_sizerc   	max_delayinput_audioscompression_ratemax_encoded_sequence_lenrh   ri   rR   base_pad_lencurrent_audio_lenencoded_sequence_lenpadding_lenrQ   num_valid_inputsattention_maskmax_seq_lenmax_audio_lenrg   prefilldelayed_decoder_input_idsrj   s'                                          r    __call__zDiaProcessor.__call__U   s    "## 	^  
 <OPPP**
 

 

 $M2$^4%o6&**+;TBBT!! 7^^^___ dC   	b6DDTD%=11 	bc9[9[VZ9[9[9[6[6[ 	b`aaa"DN477;77	I %(($??)--ndCC)--ndCC)--ndCC!%%lD99
&!)!)$k  
  	- 	xJxxhuxxx   +&,Q/
=))&&	 &u--E141%HH<HHL#y)=)D)XYY'3N'CA'F'LR'PTd'd$ "%'" (+<+GVdIe'f'f > >#e#5@$(Il.>.>2.>.F.F.U$V$VYe$e!'8<L'L$69MM ]__ _ _!$-?.?-?"?@CCDDXD_``E $ 4 ; ;E B B N X XYZ\] ^ ^I_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ "  % 3 7 7!'9
Rd !8 ! !I "H/33Aq+/1a#C*\n 4  	 $8!#;i#G  $:AA: !&qcK.?1#HXBX.X`e`j!k!k!klprsrsrsls!t!((333&--n==== %	*; C C C%*Y/E1%M%M%M"" 	\ %
J<+HJ\didn o o o &+Zj!i-5PX]Xb%c%c%c""Z[[[*0333Iak I I#4#:1#=I I I   -226#i/,,%' - 
 
 *l3))
 
 

 &7>M>!"$($:$:+++	 %; %
 %
! 	*C_uvvwww 	T-.4466qqq!""u=F37F6//037F6//0#--a33;;J<UWYZZeeggllnnDN(,-@(A!!!SbS&(ID$%-12J-KAAAsPRsF-SD)*>BBBBs    AL**L.	1L.	rh   torch.Tensoraudio_prompt_lenreturnc                     | j         t          fi |}|d         }|                    dd          }|                    dd          }|                    dd          }|||t          d          |Mt	          j        ||j        t          j                  }|d                             |j	        d                   }	n'|dddddf         |k    
                    d	
          }	|j	        d         |dddddf         |k    
                    d	
          z
  dz
  }
|j	        \  }}}|                     ||||d          }|                     |d	d	|                              dd          }g }t	          j                    5  t          |	j	        d                   D ]}||dd|	|         |
|         f         d         }|                    | j        j                  }| j                            |          j                                                                        }|                    |           	 ddd           n# 1 swxY w Y   |S )a  
        Decodes a batch of audio codebook sequences into their respective audio waveforms via the
        `audio_tokenizer`. See [`~DacModel.decode`] for more information.

        Args:
            decoder_input_ids (`torch.Tensor`): The complete output sequence of the decoder.
            audio_prompt_len (`int`): The audio prefix length (e.g. when using voice cloning).
        r$   r   Nr   r   zTo enable decoding for Dia, we need the `bos_token_id`, `pad_token_id`, and `delay_pattern`. You may have accidentally overwritten one of those.)r   r^   r   rS   rU   rW   Tr`   rf   rX   )N.)r   )rn   r#   ro   rm   r~   r   r   r   expandrs   r}   r   r   r   r   ranger   r=   decodeaudio_valuescpusqueezer   )r@   rh   r   rG   r   r$   r   r   r   start_of_generation_idxend_of_generation_idxra   rb   rc   rg   output_sequencesaudiosioutput_iaudio_is                       r    batch_decodezDiaProcessor.batch_decode  s    +*
 

 
 %^4$(($??)--ndCC)--ndCC%);)C}G\[   '$|,<EVE]ejeoppp&6t&<&C&CDUD[\]D^&_&_##'8AAAq'AEW'W&\&\ac&\&d&d# #A&*;AAAqqq!G*DHZ*Z)_)_df)_)g)ggjkk 	
 &7%<"Wl,,%' - 
 
  11# + 2 
 
 )Aq// 	 ]__ 	' 	'28;<< ' '+Aqqq2I!2LOdefOg2g,ghirs#;;t';'BCC.55(5KKX\\^^ffhhg&&&&	'	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	' s   B7IIIc                     |j         d         dk    rt          d|j         d          d           | j        ||fi |d         S )z
        Decodes a single sequence of audio codebooks into the respective audio waveform via the
        `audio_tokenizer`. See [`~DacModel.decode`] and [`~DiaProcessor.batch_decode`] for more information.
        r   rW   z5Expecting a single output to be decoded but received z samples instead.)rs   rm   r   )r@   rh   r   rG   s       r    r   zDiaProcessor.decodeI  sh     "1%**uHYH_`aHbuuu   !t !24DOOOOPQRRr   ri   c                      | j         t          fi |}|d         }|                    dd          }|t          d          |j        d         t          |          z
  S )z0Utility function to get the audio prompt length.r$   r   NzTo enable the utility of retrieving the prompt length for Dia, we need the `delay_pattern`. You may have accidentally overwritten this.rW   )rn   r#   ro   rm   rs   ru   )r@   ri   rG   r   r$   r   s         r    get_audio_prompt_lenz!DiaProcessor.get_audio_prompt_lenZ  s     +*
 

 
 %^4$(($?? O   &+A.]1C1CCCr   saving_pathc                    t                      st          d          t          |          }t          |t          t
          f          r|g}nDt          |t          t          f          rt          d |D                       st          d          t          |          t          |          k    rt          d           | j        t          fi |}|d         }|d         }t          ||          D ]m\  }}t          |t          j                  r8|                                                                                                }t'          j        |||           nd S )Nz/Please install `soundfile` to save audio files.c              3   N   K   | ] }t          |t          t          f          V  !d S rJ   )rK   rL   r   )rM   ps     r    rO   z*DiaProcessor.save_audio.<locals>.<genexpr>  s3      @q@q`aAPSUY{A[A[@q@q@q@q@q@qr   zAInvalid input path. Please provide a string, or a list of stringsz5The number of audio and saving paths must be the samer$   r1   )r   ImportErrorr   rK   rL   r   r   rp   rq   rm   rt   rn   r#   rz   r~   Tensorr   floatnumpysfwrite)	r@   rE   r   rG   r   r$   r1   audio_valuer   s	            r    
save_audiozDiaProcessor.save_audioo  su    &'' 	QOPPP #5)) kC;// 	b&-KK[4-88 	bS@q@qep@q@q@q=q=q 	b`aaau::[))))TUUU**
 

 
 %^4$_5!%55 	4 	4NK+u|44 @)oo//5577==??HQ]3333	4 	4r   ra   rb   rc   r   rd   )r   r   c                    t          j        |t           j                  }t          j        |t           j                  dddf                             | |          d         }|s||ddddf         z
  }n||ddddf         z   }t          j        |d|dz
            }t          j        | t           j                  ddddf                             | ||          }t          j        |t           j                  ddddf                             | ||          }	t          j        |                    d          |                    d          |	                    d          gd                                          }
||
fS )a  
        Precompute (sequence_idx, all_idx) so that out[seq, channel] = in[seq - delay[channel], channel]
        or in[seq, channel] = out[seq + delay[channel], channel] if `revert`.
        Negative sequence_idx => BOS; sequence_idx >= seq_len => PAD.
        r]   N).Nr   rW   rS   rU   )	r~   r   int32aranger   clampstackr   r   )ra   rb   rc   r   rd   delay_arraysequence_idxvalid_sequence_idx	batch_idxchannel_idxall_idxs              r    r   zDiaProcessor.build_indices  s    l=DDD |G5;???aaaHOOPSU\]]^gh 	E'+dD!!!m*DDLL'+dD!!!m*DDL"[q'A+FFLEK888D$GNNsT[]ijj	l<u{CCCD$PQPQPQMRYYZ]_fhtuu+r""$6$>$>r$B$BKDWDWXZD[D[\
 
 
 $&& 	
 W$$r   r   r   rg   c           	         | j         }|\  }}|                    |          }|                    |          }t          j        |d          \  }}}	| |||	f                             |                                           }
|dk     }|| j        d         k    }t          j        ||t          j        |||
                    }|S )a  
        Applies or reverts the delay pattern to batched audio tokens using precomputed indices,
        inserting BOS where sequence_idx < 0 and PAD where sequence_idx >= seq_len.

        Args:
            audio: audio tokens of shape [bsz, seq_len, num_channels]
            pad_token_id: the PAD token
            bos_token_id: the BOS token
            precomputed_idx: from `build_indices`

        Returns:
            final_audio: delayed or reverted audio tokens of shape [bsz, seq_len, num_channels]
        rS   rU   r   rW   )r   r   r~   unbindviewr_   rs   where)rE   r   r   rg   r   r   r   r   r   r   gathered_audiomask_bosmask_padfinal_audios                 r    r   zDiaProcessor.apply_audio_delay  s    *  /g#v..**V$$ 6;\'r5R5R5R2	%{y*<kIJOOPUPZPZP\P\]]  !#5;q>1k(L%+hP\^l:m:mnnr   )NFrJ   )F)r   r   r   __doc__feature_extractor_classtokenizer_classaudio_tokenizer_classr?   r   rL   r   r   r   r   r   r#   r   r   r   r   r   r   r   staticmethodrp   r   r   __classcell__)rC   s   @r    r8   r8   >   s         4$O&X X X X X '+(-	kC kCCcN#kC 
#kC  ~	kC
 +,kC kC kC kC` +/E E)E #3-E +,	E
 
n	E E E ET +/S S)S #3-S +,	S
 
S S S S"D .D +,D 
	D D D D* 4 4 3d5d+;&<<= 4 +,	 4  4  4  4D   %  % % %  % Cy	 %
  % 
-	. %  %  % \ %D """ " =>	"
 
" " " \" " " " "r   r8   )r   rv   pathlibr   typingr   r   audio_utilsr   r   feature_extraction_utilsr	   processing_utilsr
   r   r   r   utilsr   r   r~   	soundfiler   r   r#   r8   __all__r   r   r    <module>r      s            " " " " " " " " 9 9 9 9 9 9 9 9 4 4 4 4 4 4 U U U U U U U U U U U U ? ? ? ? ? ? ? ?  LLL     [        )    (Y Y Y Y Y> Y Y Yx 
r   