
    Pi,9                    @   d dl mZ d dlmZ d dlZd dlZd dlZd dlm	Z	m
Z
 d dlmZ d dlmZmZ d dlmZ d dlZd dlZd dlZd dlZd dlZd dlmZ d d	lmZ d
eej                 dedej        fdZd Z G d d          Z  G d d          Z! G d de           Z"ddZ#dS )    )Path)	GeneratorN)NeuCodecDistillNeuCodec)phonemize_with_dict)split_text_into_chunksjoin_audio_chunks)defaultdict)hf_hub_download)ThreadPoolExecutorframesstridereturnc                 |   t          |           sJ | d         j        }| d         j        d d         }d}t          |           D ](\  }}||z  |j        d         z   }t	          ||          })t          j        ||          }t          j        g ||R d|i}	d}
| D ]}|j        d         }t          j        dd|dz   |          dd         }t          j        d|dz
  z
            }|	d|
|
|z   fxx         ||z  z  cc<   ||
|
|z   xx         |z  cc<   |
|z  }
|	                                dk    sJ |	|z  S )	Nr   dtyper         g      ?.)
lenr   shape	enumeratemaxnpzeroslinspaceabsmin)r   r   r   r   
total_sizeiframe	frame_end
sum_weightoutoffsetframe_lengthtweights                 _/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vieneu/core.py_linear_overlap_addr*      s   v;;;1IOE1IOCRC EJf%% 0 05QJR0	Y//

*E222J
(
3E
3:
3
3
3U
3
3CF  {2K1lQ.e<<<QrTBq3w((C&<///000FUNB0006F\11222f<222&>>a    c                 n   	 ddl }t          | d          rt          | j        d          rmt          | j        j                  dk    rPt          j        | j        j        d         j        dd          | j        j        d         _        t          d	           dS # t          $ r Y d
S t          $ r Y d
S w xY w)zHCompile codec with Triton for faster decoding (Windows/Linux compatible)r   Ndec	resblocksr   zreduce-overheadT)modedynamicu+      ✅ Triton compilation enabled for codecF)tritonhasattrr-   r   r.   torchcompileforwardprintImportError	Exception)codecr1   s     r)   _compile_codec_with_tritonr:   0   s    5%   	EWUY%D%D 	E59&''!++16I'*2* 2 2 2	#A&.
 CDDDt   uu   uus   BB 
B4'	B43B4c                   P   e Zd ZdZ	 	 	 	 	 d8dZd Zd Zd	 Zd
 Zde	fdZ
d9dZd Zd9de	de	fdZd Zd:dZd;defdZd9de	fdZd Zd9de	fdZde	ez  fdZd<d#e	d$e	ez  d%ej        ej        z  d&e	d'ed(ed)ed*ed+ed,ed-ej        fd.Zd=d#e	d%ej        ej        z  d&e	d'ed*ed+ed,ed-eej        ddf         fd/Zd0e	fd1Z d%e!e         d&e	d2e	d-e!e         fd3Z"d>d4e!e         d+ed,ed-e	fd5Z#d>d%e!e         d&e	d2e	d+ed,ed-e	fd6Z$d>d%ej        d&e	d2e	d+ed,ed-eej        ddf         fd7Z%dS )?	VieNeuTTSa   
    Standard VieNeu-TTS implementation.
    
    Supports:
    - PyTorch + Transformers backend (CPU/GPU)
    - GGUF quantized models via llama-cpp-python (CPU optimized)
    
    Use this for:
    - CPU-only environments
    - Standard PyTorch workflows
    - GGUF quantized models
    "pnnbao-ump/VieNeu-TTS-0.3B-q4-ggufcpuneuphonic/distill-neucodecNc                 :   d| _         d| _        d| _        d| _        d| _        d| _        d| _        | j        | j        z  | _        d| _        d| _	        d	| _
        |r|                     |||           |                     ||           t          t                    j        d
z  | _        i | _        d	| _        |                     ||           	 dd	l}|                                | _        t/          d           d	S # t0          t2          f$ r d	| _        Y d	S w xY w)a  
        Initialize VieNeu-TTS.
        
        Args:
            backbone_repo: Model repository or path to GGUF file
            backbone_device: Device for backbone ('cpu', 'cuda', 'gpu')
            codec_repo: Codec repository
            codec_device: Device for codec
        ]       r      
   d   FNassetsr   .      🔒 Audio watermarking initialized (Perth))sample_ratemax_context
hop_lengthstreaming_overlap_framesstreaming_frames_per_chunkstreaming_lookforwardstreaming_lookbackstreaming_stride_samples_is_quantized_model_is_onnx_codec	tokenizer_load_backbone_load_codecr   __file__parent
assets_dir_preset_voices_default_voice_load_voicesperthPerthImplicitWatermarkerwatermarkerr6   r7   AttributeError)selfbackbone_repobackbone_device
codec_repocodec_devicehf_tokenr\   s          r)   __init__zVieNeuTTS.__init__Y   sF   & "()%*,'%'""%(,(G$/(Y% $) #   	JIII\222 x../(: "-222	$LLL$==??DBCCCCC^, 	$ 	$ 	$#D	$s   ,C> >DDc                     | S N r`   s    r)   	__enter__zVieNeuTTS.__enter__   s    r+   c                 .    |                                   d S rh   close)r`   exc_typeexc_valexc_tbs       r)   __exit__zVieNeuTTS.__exit__   s    

r+   c                 @    	 |                                   dS #  Y dS xY w)z+Finalizer to ensure resources are released.Nrm   rj   s    r)   __del__zVieNeuTTS.__del__   s(    	JJLLLLL	DDs    c                    t                                          dd          }t                                          dd          }	 t          | d          rW| j        Pt	          | dd          r8	 t	          | j        dd          }t          |          r
 |             n#  Y nxY wd| _        t          | d          r| j        d| _        ||                                 |t          |d	          r|j        ~t          t	          |j        d
d                    r]|j        	                                rFt          t	          |j        dd                    r%|j        
                                 dS dS dS dS dS dS dS #  Y dS xY w)z#Explicitly release model resources.gcNr3   backbonerQ   Frn   r9   cudais_availableempty_cache)globalsgetr2   rw   getattrcallabler9   collectrx   ry   rz   )r`   _gc_torchclose_fns       r)   rn   zVieNeuTTS.close   s    iimmD$''w--	tZ(( 
%T]-F4!6>> #*4='4#H#H#H-- '$HJJJ $tW%% "$**@!
 !66** 6v{/F^T J JKK 6PVP[PhPhPjPj 6#GFK$M$MNN 6"K3355555	 "!6 6/F/F6 6 6 66 6	DDs%   (F  //B F  B#!CF   Foutput_pathc                 F    ddl }|                    ||| j                   dS )z
        Save audio to file.
        
        Args:
            audio: Audio waveform
            output_path: Path to save the audio file
        r   N)	soundfilewriterI   )r`   audior   sfs       r)   savezVieNeuTTS.save   s/     	
eT%566666r+   c           
         |dk    r4t           j        j                                        st	          d           d}t	          d| d| d           |                                                    d          sd|                                v rj	 dd	lm} n"# t          $ r}t          d
          |d }~ww xY w|
                    |dd|dk    rdnd| j        d|dk    rdnd|          | _        d| _        d S ddlm}m} |
                    ||          | _        |
                    ||                              t          j        |                    | _        d S )Nmpsz/Warning: MPS not available, falling back to CPUr>   zLoading backbone from:  on  ...ggufr   )Llamau   Failed to import `llama_cpp`. Xem hướng dẫn cài đặt llama_cpp_python phiên bản tối thiểu 0.3.16 tại: https://llama-cpp-python.readthedocs.io/en/latest/z*.ggufFgpur   T)repo_idfilenameverbosen_gpu_layersn_ctxmlock
flash_attntoken)AutoTokenizerAutoModelForCausalLMr   )r3   backendsr   ry   r6   lowerendswith	llama_cppr   r7   from_pretrainedrJ   rw   rQ   transformersr   r   rS   todevice)r`   ra   rb   re   r   er   r   s           r)   rT   zVieNeuTTS._load_backbone   s   e##>%2244 (GHHH"'PPP?PPPQQQ  ))&11 	V}?R?R?T?T5T5T+++++++   !b  
 "11%!#2e#;#;RR&#2e#;#;44 2 	 	DM (,D$$$ IHHHHHHH*::=PX:YYDN0@@V^@__bb_-- DMMMs   B 
B5 B00B5c                    |dk    r4t           j        j                                        st	          d           d}t	          d| d| d           |xdk    rH t          j        |          | _        | j                                        	                    |           d S xdk    rH t          j        |          | _        | j                                        	                    |           d S d	k    rb|dk    rt          d
          	 ddlm} n"# t          $ r}t          d          |d }~ww xY w|                    |          | _        d| _        d S 	 t          d|           )Nr   z9Warning: MPS not available for codec, falling back to CPUr>   Loading codec from: r   r   neuphonic/neucodecr?   $neuphonic/neucodec-onnx-decoder-int8z(Onnx decoder only currently runs on CPU.r   NeuCodecOnnxDecoderzeFailed to import the onnx decoder.Ensure you have onnxruntime installed as well as neucodec >= 0.0.4.TUnsupported codec repository: )r3   r   r   ry   r6   r   r   r9   evalr   r   
ValueErrorneucodecr   r7   rR   )r`   rc   rd   r   r   s        r)   rU   zVieNeuTTS._load_codec   s   5  >%2244 %QRRR$GZGG\GGGHHH%%%%%%5jAA

!!$$\22222-----,<ZHH

!!$$\222227775(($%OPPP<<<<<<<"   %^  
 1@@LL
&*### !N*!N!NOOOs   	D 
D/D**D/lora_repo_idre   c                 B   | j         rt          d          	 ddlm} n!# t          $ r}t	          d          d}~ww xY wt          d|            t          | d          r| j        sd| _        d| _        | j        r| 	                                 	 |
                    | j        ||	          | _        d
| _        || _        |                     ||d
           t          d|            d
S # t          $ r%}t          dt          |                     |d}~ww xY w)z$
        Load LoRA adapter.
        zCLoRA not supported for GGUF quantized models. Use PyTorch backbone.r   )	PeftModelz>PEFT library required for LoRA. Install with: pip install peftNu    🎯 Loading LoRA adapter from: _lora_loadedFr   Tclear_existingu      ✅ LoRA adapter loaded: zFailed to load LoRA adapter: )rQ   NotImplementedErrorpeftr   r7   r6   r2   r   _current_lora_repounload_lora_adapterr   rw   r[   r8   RuntimeErrorstr)r`   r   re   r   r   s        r)   load_lora_adapterzVieNeuTTS.load_lora_adapter  s    # 	m%&klll	`&&&&&&& 	` 	` 	`^___	` 	???@@@ t^,, 	&D4E 	&&*D# %D  	'$$&&&	P%55 6  DM
 !%D&2D# lHTJJJ???@@@4 	P 	P 	PGs1vvGGHHaO	Ps)    
=8=AC/ /
D9 DDc                    t          | d          r| j        sdS t          d| j                    	 | j                                        | _        d| _        d| _        t          j                     t          j	        
                                rt          j	                                         t          d           dS # t          $ r}t          d|            Y d}~dS d}~ww xY w)zb
        Unload LoRA adapter and restore original backbone weights using PEFT's unload().
        r   Fu       🔄 Unloading LoRA adapter: Nu7      ✅ LoRA adapter unloaded, original weights restoredTu      ⚠️ Error during unload: )r2   r   r6   r   rw   unloadrv   r   r3   rx   ry   rz   r8   )r`   r   s     r)   r   zVieNeuTTS.unload_lora_adapter6  s     t^,, 	D4E 	5J1HJJKKK	 M0022DM %D&*D# JLLLz&&(( )
&&(((KLLL4 	 	 	7A7788855555	s   B
B> >
C%C  C%Fc                 P   |sdS t          |          }|                                r|                                r|dz  }n
|j        dz  }|                                r|                     ||           dS |r| j                                         t          d| d           t          d           dS |r| j                                         	 |                     ||           dS # t          $ r/}t          d| d|            t          d           Y d}~dS d}~ww xY w)	1Unified voice loading for Local and Remote paths.Nvoices.jsonr   u*      ⚠️ Validation Warning: Local path 'z' missing 'voices.json'.u4      ⚠️ Falling back to Custom Voice Cloning mode.u4      ⚠️ Warning: Could not load voices from repo 'z': )
r   existsis_dirrW   _load_voices_from_filerY   clearr6   _load_voices_from_repor8   )r`   ra   re   r   path_obj	json_pathr   s          r)   r[   zVieNeuTTS._load_voicesP  s~    	F&&?? 	O   <$}4		$Om;	!! O++In+UUUUU! 1(..000j=jjjkkkMNNNNN  ,#))+++O++M8DDDDD O O Ob]bb_`bbcccMNNNNNNNNNOs   C, ,
D%6$D  D%	file_pathc                    	 t          |dd          5 }t          j        |          }ddd           n# 1 swxY w Y   d|v rw|r(| j                                         t          d           | j                            |d                    t          dt          |d                    d|j                    d	|v r|d	         r|d	         | _	        dS dS dS # t          $ r }t          d
| d|            Y d}~dS d}~ww xY w)#Load voices from a local JSON file.rutf-8encodingNpresetsu/      🧹 Cleared existing voices for replacement      📢 Loaded  voices from default_voice%      ⚠️ Failed to load voices from : )openjsonloadrY   r   r6   updater   namerZ   r8   )r`   r   r   fdatar   s         r)   r   z VieNeuTTS._load_voices_from_fileo  s   	Liw777 $1y||$ $ $ $ $ $ $ $ $ $ $ $ $ $ $ D  ! N'--///LMMM #**4	?;;;[DO(<(<[[9>[[\\\ $&&4+@&&*?&;### '&&&  	L 	L 	LJ)JJqJJKKKKKKKKK	Ls3   C 5C 9C 9BC 
D$C??Dr   c                 \   d}	 t          |d|d          }n[# t          $ rN}t          d           	 t          |d|dd          }t          d           n# t          $ r Y nw xY wY d}~nd}~ww xY w|r$|                     t	          |                     dS t          d	| d
           dS z_Download and load voices.json from a HuggingFace repo. STRICT MODE with Offline Cache Fallback.Nr   model)r   r   r   	repo_typeuE      ⚠️ Network check failed for voices.json. Trying local cache...T)r   r   r   r   local_files_onlyu      ✅ Using cached voices.jsonu      ⚠️ Warning: Repository 'z>' is missing 'voices.json'. Falling back to Custom Voice mode.r   r8   r6   r   r   r`   r   re   voices_filer   s        r)   r   z VieNeuTTS._load_voices_from_repo     	)&!	  KK  	 	 	Z[[[-#*"%%)   89999   	   	}''[(9(9:::::{G{{{|||||7    
A0A+#AA+
A# A+"A##A++A0c                 H    d | j                                         D             S )2List available preset voices as (description, id).c                     g | ]A\  }}t          |t                    r|                    d |          nt          |          |fBS description
isinstancedictr|   r   .0kvs      r)   
<listcomp>z0VieNeuTTS.list_preset_voices.<locals>.<listcomp>  Y     
 
 
1 )31d(;(;GQUU=!$$$QK
 
 
r+   rY   itemsrj   s    r)   list_preset_voiceszVieNeuTTS.list_preset_voices  2    
 
+1133
 
 
 	
r+   
voice_namec                    |A| j         }|8| j        r"t          t          | j                            }nt	          d          || j        vr't	          d| d|                                            | j        |         }|d         }t          |t                    r t          j	        |t          j
                  }||d         dS )	z
        Get reference codes and text for a preset voice.
        
        Args:
            voice_name: Name of voice. If None, uses default_voice.
            
        Returns:
            dict: { 'codes': torch.Tensor, 'text': str }
        N2No voice specified and no preset voices available.Voice '' not found. Available: codesr   textr   r   rZ   rY   nextiterr   r   r   listr3   tensorlongr`   r   
voice_datar   s       r)   get_preset_voicezVieNeuTTS.get_preset_voice  s     ,J!& [!%d4+>&?&?!@!@JJ$%YZZZT000fzff4KbKbKdKdffggg(4
 7#eT"" 	:Lej999E
6(:;;;r+   ref_audio_pathc                    t          j        |dd          \  }}t          j        |                                                              d                              d          }t          j                    5  | j                            |          	                    d          	                    d          }ddd           n# 1 swxY w Y   |S zEncode reference audio to codesi>  T)srmonor   )audio_or_pathN
librosar   r3   
from_numpyfloat	unsqueezeno_gradr9   encode_codesqueezer`   r	  wav_
wav_tensor	ref_codess         r)   encode_referencezVieNeuTTS.encode_reference     nTBBBQ%c**0022<<Q??II!LL
]__ 	_ 	_
..Z.HHPPQRSS[[\]^^I	_ 	_ 	_ 	_ 	_ 	_ 	_ 	_ 	_ 	_ 	_ 	_ 	_ 	_ 	_   :ACCC   333333?              ?2   r   	ref_audior  ref_text	max_chars	silence_pcrossfade_pvoicetemperaturetop_kr   c                    |,|                     d|          }|                     d|          }|||                     |          }nr| j        rk||gt          d| j                    	 |                     d          }|d         }|d         }n)# t
          $ r}t          d|            Y d}~nd}~ww xY w||t          d          t          ||          }|s t          j	        g t          j
                  S g }|D ]{}| j        r|                     ||||	|
          }n.|                     |||          }|                     ||	|
          }|                     |          }|                    |           |t#          || j        ||          }| j        r!| j                            || j        	          }|S )
a  
        Perform inference to generate speech from text using the TTS model and reference audio.
        Automatically splits long text into chunks.

        Args:
            text (str): Input text to be converted to speech.
            ref_audio (str | Path): Path to reference audio file for cloning.
            ref_codes (np.ndarray | torch.tensor): Encoded reference.
            ref_text (str): Reference text for reference audio.
            max_chars (int): Maximum characters per chunk for splitting.
            silence_p (float): Seconds of silence to pad between chunks.
            crossfade_p (float): Seconds of crossfade between chunks (ignored if silence_p > 0).
            voice (dict): Optional dictionary containing 'codes' and 'text' (overrides ref_codes/ref_text).
            temperature (float): Sampling temperature (default 1.0).
            top_k (int): Top-k sampling (default 50).
        Returns:
            np.ndarray: Generated speech waveform.
        Nr   r   6      ⚠️ No reference provided. Using default voice: ,Warning: Failed to auto-load default voice: DMust provide either 'voice' dict or both 'ref_codes' and 'ref_text'.r&  r   rI   )r|   r  rZ   r6   r  r8   r   r   r   arrayfloat32rQ   _infer_ggml_apply_chat_template_infer_torch_decodeappendr	   rI   r^   apply_watermark)r`   r   r$  r  r%  r&  r'  r(  r)  r*  r+  r  r   chunksall_wavschunk
output_str
prompt_idsr  	final_wavs                       r)   inferzVieNeuTTS.infer  s(   & 		'955Iyy22H  Y%6--i88II  	Ji&78;K`4K^``aaaJ!22488
&w/	%f- J J JHQHHIIIIIIIIJ  0deee (	BBB 	28Bbj1111 
	! 
	!E' O!--i5+W\]]

!66y(ERR
!..z;NN
 ,,z**COOC     &h0@)[YY	  	b(88PTP`8aaI   ,%B 
B8B33B8c              #     K   |-|                     d|          }|                     d|          }nr| j        rk||gt          d| j                    	 |                     d          }|d         }|d         }n)# t          $ r}	t          d|	            Y d}	~	nd}	~	ww xY w||t          d          t          ||          }
|
D ]}| j        r |                     |||||          E d{V  )| 	                    |||          }| 
                    |||          }|                     |          }| j        r!| j                            || j                  }|V  dS )	a  
        Perform streaming inference to generate speech from text using the TTS model and reference audio.
        Automatically splits long text into chunks and streams them.

        Args:
            text (str): Input text to be converted to speech.
            ref_codes (np.ndarray | torch.tensor): Encoded reference.
            ref_text (str): Reference text for reference audio.
            max_chars (int): Maximum characters per chunk for splitting.
            voice (dict): Optional dictionary containing 'codes' and 'text'.
            temperature (float): Sampling temperature.
            top_k (int): Top-k sampling.
        Yields:
            np.ndarray: Generated speech waveform.
        Nr   r   r-  r.  r/  r0  r1  )r|   rZ   r6   r  r8   r   r   rQ   _infer_stream_ggmlr5  r6  r7  r^   r9  rI   )r`   r   r  r%  r&  r)  r*  r+  r  r   r:  r<  r>  r=  r  s                  r)   infer_streamzVieNeuTTS.infer_stream  s       		'955Iyy22HH  	Ji&78;K`4K^``aaaJ!22488
&w/	%f- J J JHQHHIIIIIIIIJ  0deee'	BBB 
	 
	E' 	229h{\abbbbbbbbbb "66y(ERR
!..z;NN
ll:..# ^*::3DL\:]]C				
	 
	   %A; ;
B!BB!r   c                    d t          j        d|          D             }t          |          dk    rt          d          | j        rZt          j        |t
          j                  t
          j        t
          j        ddf         }| j	        
                    |          }nt          j                    5  t          j        |t          j                  ddddf                             | j	        j                  }| j	        
                    |                                                                          }ddd           n# 1 swxY w Y   |ddddf         S )z'Decode speech tokens to audio waveform.c                 ,    g | ]}t          |          S ri   intr   nums     r)   r   z%VieNeuTTS._decode.<locals>.<listcomp>I      SSS3c#hhSSSr+   <\|speech_(\d+)\|>r   u   No valid speech tokens found in the output. Nếu gặp lỗi này, hãy tạo issue trên github repo hoặc thông báo với chúng tôi tại: https://discord.com/invite/yJt8kzjzWZr   Nrefindallr   r   rR   r   r2  int32newaxisr9   decode_coder3   r  r  r  r   r   r>   numpyr`   r   
speech_idsrecons       r)   r7  zVieNeuTTS._decodeF  s    TS"*5JE*R*RSSS
z??a K  
  		DHZrx888RZQRQRQR9RSEJ**511EE  D DZuzBBB4qqq=QTTJ%  
..u5599;;AACC	D D D D D D D D D D D D D D D Q111W~   6BE		EE
input_textc                 
   t          |          dz   t          |          z   }| j                            d          }| j                            d          }| j                            d          }| j                            d          }| j                            d          }| j                            |d          }	d	}
| j                            |
          }|                    |          }|d |         |gz   |	z   |gz   ||d
z   d          z   }|                    |          }d                    d |D                       }| j                            |d          }|d |         |gz   t          |          z   }|S )N z<|SPEECH_REPLACE|>z<|SPEECH_GENERATION_START|>z<|TEXT_REPLACE|>z<|TEXT_PROMPT_START|>z<|TEXT_PROMPT_END|>Fadd_special_tokenszNuser: Convert the text to speech:<|TEXT_REPLACE|>
assistant:<|SPEECH_REPLACE|>r    c                     g | ]}d | d	S z	<|speech_z|>ri   )r   r    s     r)   r   z2VieNeuTTS._apply_chat_template.<locals>.<listcomp>u  s$    BBB1....BBBr+   )r   rS   convert_tokens_to_idsencodeindexjoinr  )r`   r  r%  rY  speech_replacespeech_gen_starttext_replacetext_prompt_starttext_prompt_end	input_idschatidstext_replace_idxspeech_replace_idx	codes_strr   s                   r)   r5  zVieNeuTTS._apply_chat_template^  s   (22S8;Nz;Z;ZZ
==>RSS>??@]^^~;;<NOO N@@AXYY.>>?TUUN))*)OO	dn##D))99\22!!!" !"   "Q&(()	* 	 !YY~66GGBB	BBBCC	%%iE%JJ%%%&*:);;d5kkI
r+   r>  c                 8   t          j        |                              d                              | j        j                  }| j                            d          }t          j                    5  | j        	                    || j
        |d||dd          }d d d            n# 1 swxY w Y   |j        d         }| j                            |d|d f                                                                                                         d          }|S )	Nr   <|SPEECH_GENERATION_END|>Tr#  )
max_lengtheos_token_id	do_sampler*  r+  	use_cachemin_new_tokensr   Fr\  )r3   r  r  r   rw   r   rS   ra  r  generaterJ   r   decoder>   rT  tolist)	r`   r>  r*  r+  prompt_tensorspeech_end_idoutput_tokensinput_lengthr=  s	            r)   r6  zVieNeuTTS._infer_torch{  sK   Z00::1==@@AUVV<<=XYY]__ 
	 
	 M22+*'! 3 	 	M
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 %*2.^**!\]]*+//117799@@BBW\ + 
 

 s   2(B&&B*-B*c                     t          |          }t          |          }d                    d |D                       }d| d| d| }|                     || j        ||dg          }|d         d	         d
         }	|	S )Nr^  c                     g | ]}d | d	S r`  ri   r   idxs     r)   r   z)VieNeuTTS._infer_ggml.<locals>.<listcomp>  $    FFFS0000FFFr+   6user: Convert the text to speech:<|TEXT_PROMPT_START|>r[  9<|TEXT_PROMPT_END|>
assistant:<|SPEECH_GENERATION_START|>rq  )
max_tokensr*  r+  stopchoicesr   r   )r   rd  rw   rJ   )
r`   r  r%  rY  r*  r+  ro  promptoutputr=  s
             r)   r4  zVieNeuTTS._infer_ggml  s    &x00(44
GGFFIFFFGG	UX U UPZ U UIRU U 	 '#-.  
 
 I&q)&1
r+   c              #   B  K   t          |          }t          |          }d                    d |D                       }d| d| d| }g }d |D             }	d}
t          |          }|                     || j        ||dgd	
          D ]y}|d         d         d         }|	                    |           t          |	|d                    | j        | j        z   k    r$t          || j	        z
  | j
        z
  d          }|| j        z   | j        z   | j
        z   }||z
  | j        z  }|| j        d| j
        z  z   | j        z  z   }|	||         }|                     d                    |                    }| j        r!| j                            || j                  }|||         }|                    |           t!          || j                  }t          |          | j        z  }||
|         }|}
|| j        z  }|V  {t          |	          |z
  }t          |	          |k    rt          t          |	          | j	        | j
        z   |z   z
  d          }t          |	          |z
  |z
  | j
        z
  | j        z  }|	|d          }|                     d                    |                    }| j        r!| j                            || j                  }||d          }|                    |           t!          || j                  }||
d          }|V  d S d S )Nr^  c                     g | ]}d | d	S r`  ri   r  s     r)   r   z0VieNeuTTS._infer_stream_ggml.<locals>.<listcomp>  r  r+   r  r[  r  c                     g | ]}d | d	S r`  ri   r  s     r)   r   z0VieNeuTTS._infer_stream_ggml.<locals>.<listcomp>  s$    !K!K!K#"5c"5"5"5!K!K!Kr+   r   rq  T)r  r*  r+  r  streamr  r   r   r1  r   )r   rd  r   rw   rJ   r8  rM   rN   r   rO   rL   rK   r7  r^   r9  rI   r*   rP   )r`   r  r%  rY  r*  r+  ro  r  audio_cachetoken_cachen_decoded_samplesn_decoded_tokensitemr=  tokens_start
tokens_endsample_start
sample_end
curr_codesrW  processed_reconnew_samples_endremaining_tokenss                          r)   rC  zVieNeuTTS._infer_stream_ggml  s     &x00(44
GGFFIFFFGG	UX U UPZ U UIRU U 	
 )+!K!K!K!K!K!" #IMM'#-. " 
 
 4	& 4	&D i+F3Jz***;/00122d6UX\Xr6rrr  #$-.34 	    %5601 34  %|3O $ !6T=Z9ZZ^b^mmn  )j)@A
RWWZ%8%899# b ,<<UPTP`<aaEl:56""5))) #6(E# # # #&k"2"2T5R"R"1%o5# %4! D$CC %%%% {++.>>{...K  *T-JJM]]_ L K  "# /0  L %\]]3JLL!4!455E ^(88DL\8]],--(Eu%%%1+dFcdddO-.?.@.@AO!!!!!!/ /.r+   )r=   r>   r?   r>   Nrh   )NF)F	NNNr  r   r!  Nr"  r#  NNr  Nr"  r#  )r"  r#  )&__name__
__module____qualname____doc__rf   rk   rr   rt   rn   r   r   rT   rU   r   r   r[   r   r   r   r   r  r  r   ndarrayr3   TensorrI  r  r   r@  r   rD  r7  r  r5  r6  r4  rC  ri   r+   r)   r<   r<   K   se         ;/4$ 4$ 4$ 4$l      ! ! !F	7s 	7 	7 	7 	7" " " "HP P P<(P (Pc (PS (P (P (P (PT  4O O O O>L L L L L L,} }c } } } }@
 
 
< <3 < < < <@sTz    B B# B#* B
UZUaHa Btw B  LO B  bg B  }B B  QU B  kp B  B B  LN  LV B B B BH, , ,el1J ,]` ,tw ,  GK ,  af ,  ux ,  BK  LN  LV  X\  ^b  Lb  Bc , , , ,\S    0d3i 3 TW \`ad\e    : tCy u SV `c    ( T#Y # 3 ]b qt   B    &^" ^"EL ^"C ^"UX ^"gl ^"{~ ^"  IR  SU  S]  _c  ei  Si  Ij ^" ^" ^" ^" ^" ^"r+   r<   c                      e Zd ZdZ	 	 	 	 	 	 	 	 	 	 	 d;dZd<dZdefdZd<defdZ	d Z
d<defdZd<dZd Zd Zdeez  fdZdeez  defdZd<dededefdZdededefd Zd!efd"Zd#ee         ded$ed%efd&Zd=ded,eez  d#ej        ej        z  ded-ed.ed/ed0ed1ed2ed%ej        fd3Zd>d4ee         d#ej        ej        z  ded5ed0ed1ed2ed%eej                 fd6Zd?ded#ej        ej        z  ded-ed0ed1ed2ed%eej        d
d
f         fd7Z ded#ej        ej        z  ded%eej        d
d
f         fd8Z!d9 Z"d%efd:Z#d
S )@FastVieNeuTTSzB
    GPU-optimized VieNeu-TTS using LMDeploy TurbomindEngine.
    pnnbao-ump/VieNeu-TTSrx   r?   333333?r   Tr      Nc                 H   |dk    r$|                     d          st          d          d| _        d| _        d| _        d| _        d| _        d	| _        d| _        | j        | j        z  | _	        |
| _
        i | _        t          t                    | _        d
| _        d
| _        |                     ||||||           |                     |||	           t'          t(                    j        dz  | _        i | _        d| _        |                     ||           	 ddl}|                                | _        t;          d           n# t<          t>          f$ r
 d| _        Y nw xY w|                                   t;          d           t;          d| j
         d           dS )a  
        Initialize FastVieNeuTTS with LMDeploy backend and optimizations.
        
        Args:
            backbone_repo: Model repository
            backbone_device: Device for backbone (must be CUDA)
            codec_repo: Codec repository
            codec_device: Device for codec
            memory_util: GPU memory utilization (0.0-1.0)
            tp: Tensor parallel size for multi-GPU
            enable_prefix_caching: Enable prefix caching for faster batch processing
            quant_policy: KV cache quantization (0=off, 8=int8, 4=int4)
            enable_triton: Enable Triton compilation for codec
            max_batch_size: Maximum batch size for inference (prevent GPU overload)
        rx   zcuda:z%LMDeploy backend requires CUDA devicerA   rB   rC   r   r#     FrG   Nr   rH   u9   ✅ FastVieNeuTTS with optimizations loaded successfully!z   Max batch size: z% (adjustable to prevent GPU overload))!
startswithr   rI   rJ   rK   rL   rM   rN   rO   rP   max_batch_size
_ref_cacher
   r   stored_dictrR   _triton_enabled_load_backbone_lmdeployrU   r   rV   rW   rX   rY   rZ   r[   r\   r]   r^   r6   r7   r_   _warmup_model)r`   ra   rb   rc   rd   memory_utiltpenable_prefix_cachingquant_policyenable_tritonr  re   r\   s                r)   rf   zFastVieNeuTTS.__init__  s   < f$$_-G-G-P-P$DEEE "()%*,'%&""$(,(G$/(Y%,&t,, $$ 	$$]KEZ\hjrsss\=AAA x../(: " 	-222	$LLL$==??DBCCCC^, 	$ 	$ 	$#D	$ 	IJJJ^D$7^^^_____s   ,E	 	E$#E$c                 H   |sdS t          |          }|                                rd|                                r|dz  }n
|j        dz  }|                                r|                     |           dS t          d| d           dS |                     ||           dS )r   Nr   u      ⚠️ Warning: Local path 'z;' missing 'voices.json'. Falling back to Custom Voice mode.)r   r   r   rW   r   r6   r   )r`   ra   re   r   r   s        r)   r[   zFastVieNeuTTS._load_voices[  s     	F&&?? 	A   <$}4		$Om;	!! D++I66666  C  C  C  C  D  D  D  D  D ''x@@@@@r+   r   c                    	 t          |dd          5 }t          j        |          }ddd           n# 1 swxY w Y   d|v rM| j                            |d                    t          dt          |d                    d|j                    d|v r|d         r|d         | _        dS dS dS # t          $ r }t          d	| d
|            Y d}~dS d}~ww xY w)r   r   r   r   Nr   r   r   r   r   r   )
r   r   r   rY   r   r6   r   r   rZ   r8   )r`   r   r   r   r   s        r)   r   z$FastVieNeuTTS._load_voices_from_filep  s]   	Liw777 $1y||$ $ $ $ $ $ $ $ $ $ $ $ $ $ $ D  #**4	?;;;[DO(<(<[[9>[[\\\$&&4+@&&*?&;### '&&&  	L 	L 	LJ)JJqJJKKKKKKKKK	Ls3   B0 5B0 9B0 9A-B0 0
C:CCr   c                 \   d}	 t          |d|d          }n[# t          $ rN}t          d           	 t          |d|dd          }t          d           n# t          $ r Y nw xY wY d}~nd}~ww xY w|r$|                     t	          |                     dS t          d	| d
           dS r   r   r   s        r)   r   z$FastVieNeuTTS._load_voices_from_repo  r   r   c                 H    d | j                                         D             S )r   c                     g | ]A\  }}t          |t                    r|                    d |          nt          |          |fBS r   r   r   s      r)   r   z4FastVieNeuTTS.list_preset_voices.<locals>.<listcomp>  r   r+   r   rj   s    r)   r   z FastVieNeuTTS.list_preset_voices  r   r+   r   c                    |A| j         }|8| j        r"t          t          | j                            }nt	          d          || j        vr't	          d| d|                                            | j        |         }|d         }t          |t                    r t          j	        |t          j
                  }||d         dS )	z0Get reference codes and text for a preset voice.Nr   r   r   r   r   r   r   r   r  s       r)   r  zFastVieNeuTTS.get_preset_voice  s    ,J!& [!%d4+>&?&?!@!@JJ$%YZZZT000fzff4KbKbKdKdffggg(4
7#eT"" 	:Lej999E
6(:;;;r+   c                    t          d|            |rddl}||j        d<   t          d           	 ddlm}m}	m}
 n"# t          $ r}t          d          |d}~ww xY w |	|||d|	          } |||
          | _         |
dddddd          | _	        t          d           t          d|            t          d|            t          d|            t          d| d|dk    rdnd d           dS )z.Load backbone using LMDeploy's TurbomindEnginez%Loading backbone with LMDeploy from: r   NHF_TOKENu-      🔑 Set HF_TOKEN for private model access)pipelineTurbomindEngineConfigGenerationConfigu   Failed to import `lmdeploy`. Bạn cần cài đặt phiên bản hỗ trợ GPU bằng lệnh: pip install vieneu[gpu]. 
Xem thêm hướng dẫn tại: https://github.com/pnnbao97/VieNeu-TTSbfloat16)cache_max_entry_countr  r  r   r  )backend_configgffffff?r#  r"  rB   T(   )top_pr+  r*  max_new_tokensrt  rv  z'   LMDeploy TurbomindEngine initializedz   - Memory util: z   - Tensor Parallel: z   - Prefix caching: z   - KV quant: z (EnabledDisabled))
r6   osenvironlmdeployr  r  r  r7   rw   
gen_config)r`   repor  r  r  r  re   r  r  r  r  r   r  s                r)   r  z%FastVieNeuTTS._load_backbone_lmdeploy  s   <d<<===  	CIII%-BJz"ABBB	RRRRRRRRRRR 	 	 	Y  		 /."-"7%
 
 
 !nEEE**
 
 
 	89990;00111+r++,,,=&;==>>>```\A=M=M		S]```aaaaas   
> 
AAAc                    t          d| d|            |xdk    rG t          j        |          | _        | j                                                            |           nxdk    rG t          j        |          | _        | j                                                            |           nydk    ra|dk    rt          d          	 dd	lm	} n"# t          $ r}t          d
          |d}~ww xY w|                    |          | _        d| _        n	 t          d|           |r(| j        s#|dk    rt          | j                  | _        dS dS dS dS )z+Load codec with optional Triton compilationr   r   r   r?   r   r>   zONNX decoder only runs on CPUr   r   zVFailed to import ONNX decoder. Ensure onnxruntime and neucodec >= 0.0.4 are installed.NTr   )r6   r   r   r9   r   r   r   r   r   r   r7   rR   r:   r  )r`   rc   rd   r  r   r   s         r)   rU   zFastVieNeuTTS._load_codec  s   CZCC\CCDDD%%%%%%5jAA

!!$$\2222-----,<ZHH

!!$$\22227775(($%DEEE<<<<<<<"   %R  
 1@@LL
&*## !N*!N!NOOO 	J!4 	J9N9N#=dj#I#ID   	J 	J 	J 	J9N9Ns   C 
C2C--C2c                 :   t          d           	 t          t          d                    }|                     |dd          }|                     |g| j        d          }t          d           d	S # t          $ r}t          d|            Y d	}~d	S d	}~ww xY w)
z7Warmup inference pipeline to reduce first-token latencyu   🔥 Warming up model...rE   warmuptestFr  do_preprocessu      ✅ Warmup completeu(      ⚠️ Warmup failed (non-critical): N)r6   r  range_format_promptrw   r  r8   )r`   dummy_codesdummy_promptr  r   s        r)   r  zFastVieNeuTTS._warmup_model  s    ()))	BuRyy//K..{HfMML|nX]^^A*+++++ 	B 	B 	B@Q@@AAAAAAAAA	Bs   A A3 3
B=BBr	  c                    t          j        |dd          \  }}t          j        |                                                              d                              d          }t          j                    5  | j                            |          	                    d          	                    d          }ddd           n# 1 swxY w Y   |S r  r  r  s         r)   r  zFastVieNeuTTS.encode_reference  r  r  
audio_pathr   c                 6    |                      |          }||dS )a&  
        Create a new custom voice from reference audio.
        
        Args:
            audio_path: Path to the reference audio file
            text: The exact transcript of the reference audio
            
        Returns:
            dict: { 'codes': torch.Tensor, 'text': str }
        r   )r  )r`   r  r   r  s       r)   clone_voicezFastVieNeuTTS.clone_voice  s%     ))*55	"D111r+   r%  c                     | d| }|| j         vr"|                     |          }||d| j         |<   | j         |         d         S )aF  
        Get or create cached reference codes.
        
        Args:
            voice_name: Unique identifier for this voice
            audio_path: Path to reference audio
            ref_text: Optional reference text (stored with codes)
            
        Returns:
            ref_codes: Encoded reference codes
        r  )r   r%  r   )r  r  )r`   r   r  r%  	cache_keyr  s         r)   get_cached_referencez"FastVieNeuTTS.get_cached_reference%  sd     "00J00	DO++--j99I"$* *DOI&
 y)'22r+   user_id
audio_filec                 t   |                      |          }t          |t          j                  r&|                                                                }t          |t          j                  r&|                                	                                }|| j
        |          d<   || j
        |          d<   |S )a/  
        Add a speaker to the stored dictionary for easy access.
        
        Args:
            user_id: Unique user ID
            audio_file: Reference audio file path
            ref_text: Reference text
            
        Returns:
            user_id: The user ID for use in streaming
        r   r%  )r  r   r3   r  r>   rT  r   r  flattenry  r  )r`   r  r  r%  r   s        r)   add_speakerzFastVieNeuTTS.add_speaker<  s     %%j11eU\** 	(IIKK%%''EeRZ(( 	-MMOO**,,E27G&w/5=G&z2r+   r   c                    d t          j        d|          D             }t          |          dk    rt          d          | j        rZt          j        |t
          j                  t
          j        t
          j        ddf         }| j	        
                    |          }nt          j                    5  t          j        |t          j                  ddddf                             | j	        j                  }| j	        
                    |                                                                          }ddd           n# 1 swxY w Y   |ddddf         S )z&Decode speech tokens to audio waveformc                 ,    g | ]}t          |          S ri   rH  rJ  s     r)   r   z)FastVieNeuTTS._decode.<locals>.<listcomp>V  rL  r+   rM  r   u  No valid speech tokens found in the output. Lỗi này có thể do GPU của bạn không hỗ trợ định dạng bfloat16 (ví dụ: dòng T4, RTX 20-series) dẫn đến sai số khi tính toán. Bạn hãy thử chuyển sang dùng phiên bản VieNeu-TTS-0.3B nếu vẫn muốn dùng LmDeploy hoặc bỏ chọn 'LMDeploy' trong Tùy chọn nâng cao. Nếu vẫn gặp lỗi này, hãy thông báo với chúng tôi tại: https://discord.com/invite/yJt8kzjzWZr   NrN  rU  s       r)   r7  zFastVieNeuTTS._decodeT  s   SS"*5JE*R*RSSS
z??at    	DHZrx888RZQRQRQR9RSEJ**511EE D DZuzBBB4qqq=QTTJ%  
..u5599;;AACC	D D D D D D D D D D D D D D D Q111W~rX  r  rY  r   c                     t          |          }t          |          }d                    d |D                       }d| d| d| }|S )zFormat prompt for LMDeployr^  c                     g | ]}d | d	S r`  ri   r  s     r)   r   z0FastVieNeuTTS._format_prompt.<locals>.<listcomp>q  r  r+   r  r[  r  r   rd  r`   r  r%  rY  ref_text_phonesinput_text_phonesro  r  s           r)   r  zFastVieNeuTTS._format_promptl  s{    -h77/
;;GGFFIFFFGG	U_ U UWh U UIRU U 	
 r+   r  r   r!  r"  r#  r$  r&  r'  r(  r)  r*  r+  c                    |,|                     d|          }|                     d|          }|||                     |          }nr| j        rk||gt          d| j                    	 |                     d          }|d         }|d         }n)# t
          $ r}t          d|            Y d}~nd}~ww xY w||t          d          |	| j        _        |
| j        _	        t          ||          }|s t          j        g t          j                  S t          |          d	k    rt          |t           j                  r&|                                                                }t          |t          j                  r&|                                                                }|                     |||d
                   }|                     |g| j        d          }|                     |d
         j                  }n2|                     |||||	|
          }t9          || j        ||          }| j        r!| j                            || j                  }|S )am  
        Single inference (automatically splits long text and uses batching for speed).
        
        Args:
            text: Input text to synthesize
            ref_audio: Path to reference audio for cloning
            ref_codes: Encoded reference audio codes
            ref_text: Reference text for reference audio
            max_chars: Maximum characters per chunk for splitting.
            voice: Optional dict with 'codes' and 'text'.
            temperature: Sampling temperature.
            top_k: Top-k sampling.
            
        Returns:
            Generated speech waveform as numpy array
        Nr   r   r-  r.  r/  r0  r   r   r   Fr  )r)  r*  r+  r1  ) r|   r  rZ   r6   r  r8   r   r  r*  r+  r   r   r2  r3  r   r   r3   r  r>   rT  r  r  ry  r  rw   r7  r   infer_batchr	   rI   r^   r9  )r`   r   r$  r  r%  r&  r'  r(  r)  r*  r+  defaultr   r:  r  	responsesr  r;  s                     r)   r@  zFastVieNeuTTS.inferz  s   " 		'955Iyy22H  Y%6--i88II  	Ki&78;KaDL_aabbbK0066$W-#FO K K KIaIIJJJJJJJJK  0deee '2# % (	BBB 	28Bbj1111v;;!)U\22 4%MMOO1133	)RZ00 9%--//6688	((HfQiHHFvh4?Z_``I,,y|011CC ''	85^iqv'wwH#Hd.>	;WWC  	V"223DDT2UUC
rA  textsr  c                     | j         }|-|                    d          |                    d          nr j        rkgt          d j                    	                      d          }|d         |d         n)# t
          $ r}	t          d|	            Y d}	~	nd}	~	ww xY wt          d          t          |t                    s|g}| j	        _
        | j	        _        d j	        _        t          t          j                  r&                                                                t          t"          j                  r&                                                                g }
t+          dt-          |          |          D ]}||||z            } fd	|D             }                     | j	        d
          }d |D             } fd|D             } j        r fd|D             }|
                    |           ||z   t-          |          k     r<t          j                                        rt          j                                         |
S )z5
        Batch inference for multiple texts.
        Nr   r   r-  r.  r/  r"  r   c                 >    g | ]}                     |          S ri   )r  )r   r   r  r%  r`   s     r)   r   z-FastVieNeuTTS.infer_batch.<locals>.<listcomp>  s+    ^^^$t**9hEE^^^r+   Fr  c                     g | ]	}|j         
S ri   )r   )r   responses     r)   r   z-FastVieNeuTTS.infer_batch.<locals>.<listcomp>  s    CCCX8=CCCr+   c                 :    g | ]}                     |          S ri   )r7  )r   r   r`   s     r)   r   z-FastVieNeuTTS.infer_batch.<locals>.<listcomp>  s%    GGG%$,,u--GGGr+   c                 R    g | ]#}j                             |j                   $S )r1  )r^   r9  rI   )r   wr`   s     r)   r   z-FastVieNeuTTS.infer_batch.<locals>.<listcomp>  s3    tttded.>>qdN^>__tttr+   )r  r|   rZ   r6   r  r8   r   r   r  r  r*  r+  repetition_penaltyr3   r  r>   rT  r   r  r  ry  r  r   rw   r^   extendrx   ry   rz   )r`   r  r  r%  r  r)  r*  r+  r  r   r;  r    batch_textspromptsr  batch_codes
batch_wavss   ` ``             r)   r  zFastVieNeuTTS.infer_batch  s    !!0N		'955Iyy22HH  	Ki&78;KaDL_aabbbK0066$W-#FO K K KIaIIJJJJJJJJK  0deee%&& 	GE '2# %-0*i.. 	0!--//Ii,, 	5!))++2244Iq#e**n55 	- 	-A!N"2 23K^^^^^^R]^^^Gg$/Y^__ICCCCCK HGGG;GGGJ  uttttisttt
OOJ'''>!CJJ..:**,, -J**,,,s   %B 
B+B&&B+c              #      K   |-|                     d|          }|                     d|          }nr| j        rk||gt          d| j                    	 |                     d          }|d         }|d         }n)# t          $ r}	t          d|	            Y d}	~	nd}	~	ww xY w||t          d          || j        _        || j        _        d| j        _	        t          ||          }
|
D ]}|                     |||          E d{V   dS )	a*  
        Streaming inference with low latency (supports long text by splitting into chunks).
        
        Args:
            text: Input text to synthesize
            ref_codes: Encoded reference audio codes
            ref_text: Reference text for reference audio
            max_chars: Maximum characters per chunk for splitting.
            voice: Optional dict with 'codes' and 'text'.
            temperature: Sampling temperature.
            top_k: Top-k sampling.
            
        Yields:
            Audio chunks as numpy arrays
        Nr   r   r-  r.  r/  r"  r0  )r|   rZ   r6   r  r8   r   r  r*  r+  r  r   _infer_stream_single)r`   r   r  r%  r&  r)  r*  r+  r  r   r:  r<  s               r)   rD  zFastVieNeuTTS.infer_stream  su       		'955Iyy22HH  	Ki&78;KaDL_aabbbK0066$W-#FO K K KIaIIJJJJJJJJK  0deee '2# %-0*'	BBB 	M 	ME00	8LLLLLLLLLL	M 	MrE  c           
   #     K   t          |t          j                  r&|                                                                }t          |t
          j                  r&|                                                                }| 	                    |||          }g }d |D             }d}t          |          }| j                            |g| j        d          D ]}	|	j        }
t          |          t          |          k    r?|
t          d                    |t          |          d                             d         n|
}|r|                    |           t          ||d                   | j        | j        z   k    rt'          || j        z
  | j        z
  d          }|| j        z   | j        z   | j        z   }||z
  | j        z  }|| j        d| j        z  z   | j        z  z   }|||         }|                     d                    |                    }|||         }|                    |           t1          || j                  }t          |          | j        z  }|||         }|}|| j        z  }|V  t          |          |z
  }|dk    rt'          t          |          | j        | j        z   |z   z
  d          }t          |          |z
  |z
  | j        z
  | j        z  }||d         }|                     d                    |                    }||d         }|                    |           t1          || j                  }||d         }|V  dS dS )	z7Internal method for streaming a single short text chunkc                     g | ]}d | d	S r`  ri   r  s     r)   r   z6FastVieNeuTTS._infer_stream_single.<locals>.<listcomp>+  s$    @@@s*3***@@@r+   r   Fr  r^  Nr   r  )r   r3   r  r>   rT  r   r  r  ry  r  r   rw   stream_inferr  r   rd  r8  rM   rN   r   rO   rL   rK   r7  r*   rP   )r`   r   r  r%  r  r  r  r  r  r  r=  
new_tokensr  r  r  r  r  rW  r  r  r  s                        r)   r  z"FastVieNeuTTS._infer_stream_single!  s     i.. 	0!--//Ii,, 	5!))++2244I$$Y$??@@i@@@y>>22F8gl2mm +	& +	&H!J UXXcTdTdgjktguguTuTuCC	NNOO0L(M(M$N$N$O$OPP  |FJ /"":... ;/00122d6UX\Xr6rrr  #$t'>>A^^   
 %5601 34  !1< ?4?R 6T=Z9ZZ^b^mmn 
 )j)@A
RWWZ%8%899l:56""5))) #6(E# # # #&k"2"2T5R"R"12CO2S"T$3! D$CC %%%% {++.>>aK  D$;d>[$[^n$no L
 K  </2BBTEbb L %\]]3JLL!4!455E,--(Eu%%%1+dFcdddO-.?.@.@AO!!!!!!!  r+   c                     t           j                                        rt           j                                         t	          j                     t          d           dS )zClean up GPU memoryu   🧹 Memory cleaned upN)r3   rx   ry   rz   rv   r   r6   rj   s    r)   cleanup_memoryzFastVieNeuTTS.cleanup_memoryp  sL    :""$$ 	%J""$$$

&'''''r+   c                     | j         | j        t          | j                  t          | j                  | j        j                            dd          ddS )z~
        Get current optimization statistics.
        
        Returns:
            Dictionary with optimization info
        r  r   T)triton_enabledr  cached_referencesactive_sessionskv_quantprefix_caching)r  r  r   r  r  r  __dict__r|   rj   s    r)   get_optimization_statsz$FastVieNeuTTS.get_optimization_statsw  sU     #2"1!$T_!5!5"4#344044^QGG"
 
 	
r+   )r  rx   r?   rx   r  r   Tr   Tr  Nrh   r  )NNNNr"  r#  r  )$r  r  r  r  rf   r[   r   r   r   r   r   r  r  rU   r  r  r  r  rI  r  r7  r  r  r   r  r3   r  r  r   r@  r  r   rD  r  r  r  ri   r+   r)   r  r    s         ./"L` L` L` L`\A A A A*L L L L L } }c } } } }@
 
 
< <3 < < < <(*b *b *b *bXJ J J:	B 	B 	BsTz    2cDj 2 2 2 2 23 3s 3 3s 3 3 3 3.3 C 3    0S    0S	 S c VY    B B# B#* B
UZUaHa Btw B  LO B  bg B  }B B  QU B  kp B  B B  LN  LV B B B BH8 8c 8rzEL7P 8cf 8  @C 8  SW 8  mr 8  AD 8  NR  SU  S]  N^ 8 8 8 8t'M 'M 'Mel1J 'M]` 'Mtw 'M  GK 'M  af 'M  ux 'M  BK  LN  LV  X\  ^b  Lb  Bc 'M 'M 'M 'MRM" M"el9R M"^a M"foprpz  }A  CG  qG  gH M" M" M" M"^( ( (
 
 
 
 
 
 
r+   r  c                   
    e Zd ZdZ	 	 	 	 	 d# fd	Zd Zd	ee         d
ededefdZ	d$dedee
z  d	ej        ej        z  d
edededededededej        fdZd%dedee
z  d	ej        ej        z  d
edededededeej        ddf         fdZd Zd&dedee
z  d	ej        ej        z  d
edededededededej        fdZd Zd'd ee         dee
z  d	ej        ej        z  d
ededededededed!edeej                 fd"Z xZS )(RemoteVieNeuTTSa7  
    Client for VieNeu-TTS running on a remote LMDeploy server.
    Extremely fast to initialize as it only loads the local codec.
    
    Use this for:
    - Production/SaaS environments
    - Instant SDK loading in multi-process applications
    - Connecting to a centralized high-performance GPU server
    http://localhost:23333/v1r  r?   r>   Nc                 @   |                     d          | _        || _        t                                          d|||           d| _        d| _        d| _        | j        | j        z  | _	        | 
                    ||           t          d| j                    dS )a  
        Initialize Remote Client.
        
        Args:
            api_base: Base URL of LMDeploy api_server
            model_name: Name of the model as registered on the server (usually HF Repo ID)
            codec_repo: Local codec for decoding
            codec_device: Device for local codec (usually 'cpu' is enough)
            hf_token: Optional HuggingFace token for private models/voices
        /N)ra   rc   rd   re   rE   r  r#  u+   📡 RemoteVieNeuTTS ready! Using backend: )rstripapi_base
model_namesuperrf   rM   rN   rO   rK   rP   r   r6   )r`   r  r  rc   rd   re   	__class__s         r)   rf   zRemoteVieNeuTTS.__init__  s    $ !,,$ 	!%	 	 	
 	
 	
 +-'%&""$(,(G$/(Y%##J999KDMKKLLLLLr+   c                     d S rh   ri   )r`   ra   rb   s      r)   rT   zRemoteVieNeuTTS._load_backbone  s    r+   r  r%  rY  r   c                     t          |          }t          |          }d                    d |D                       }d| d| d| }|S )z(Format prompt for remote LMDeploy serverr^  c                     g | ]}d | d	S r`  ri   r  s     r)   r   z2RemoteVieNeuTTS._format_prompt.<locals>.<listcomp>  r  r+   r  r[  r  r  r  s           r)   r  zRemoteVieNeuTTS._format_prompt  s{    -h77/
;;GGFFIFFFGG	U_ U UWh U UIRU U 	 r+   r  r   r!  r"  r#  r   r$  r&  r'  r(  r)  r*  r+  c           	      D   |,|                     d|          }|                     d|          }|||                     |          }nB| j        r;||7	 |                     d          }|d         }|d         }n# t          $ r Y nw xY w||t          d          t          ||          }|s t          j        g t          j	                  S g }|D ]}t          |t          j                  rK|                                                                                                                                }nCt          |t          j                  r'|                                                                }n|}|                     |||          }| j        d|dgd	|	|
d
gdd}	 t)          j        | j         d|d          }|                                 |                                d         d         d         d         }|                     |          }|                    |           b# t          $ r}t7          d|            Y d}~d}~ww xY wt9          || j        ||          }| j        r!| j                            || j                  }|S )a  
        Remote inference (automatically splits long text).
        
        Args:
            text: Input text to synthesize
            ref_audio: Path to reference audio for local encoding (before remote dispatch)
            ref_codes: Encoded reference audio codes
            ref_text: Reference text for reference audio
            max_chars: Maximum characters per chunk for splitting.
            silence_p (float): Seconds of silence to pad between chunks.
            crossfade_p (float): Seconds of crossfade between chunks (ignored if silence_p > 0).
            voice: Optional dict with 'codes' and 'text'.
            temperature: Sampling temperature.
            top_k: Top-k sampling.
            
        Returns:
            Generated speech waveform as numpy array
        Nr   r   r/  r0  r   userrolecontentrB   rq  Fr   messagesr  r*  r+  r  r  /chat/completions<   r   timeoutr  r   messager!  zError during remote inference: r1  ) r|   r  rZ   r  r8   r   r   r   r2  r3  r   r3   r  r>   rT  r  ry  r  r  r  requestspostr  raise_for_statusr   r7  r8  r6   r	   rI   r^   r9  )r`   r   r$  r  r%  r&  r'  r(  r)  r*  r+  r  r:  r;  r<  ref_codes_listr  payloadr  r=  r  r   r?  s                          r)   r@  zRemoteVieNeuTTS.infer  s   & 		'955Iyy22H  Y%6--i88II  	i&78;K!22488
&w/	%f-     0deee'	BBB 	28Bbj1111 !	 !	E)U\22 +!*!6!6!8!8!@!@!B!B!I!I!K!KIrz22 +!*!2!2!4!4!;!;!=!=!*((5IIF &,@@A"*45 G#=DM)L)L)LSZdfggg))+++ &]]__Y7:9EiP
 ll:..$$$$   ;;;<<<
 &h0@)[YY	 	b(88PTP`8aaIs+   %A; ;
BB,B	H77
IIIc	              #     K   |,|                     d|          }|                     d|          }|||                     |          }nB| j        r;||7	 |                     d          }	|	d         }|	d         }n# t          $ r Y nw xY w||t          d          t          ||          }
|
D ]!}|                     |||||          E d{V  "dS )z2
        Stream output audio (generator).
        Nr   r   r/  r0  )r|   r  rZ   r  r8   r   r   _infer_stream_chunk)r`   r   r$  r  r%  r&  r)  r*  r+  r  r:  r<  s               r)   rD  zRemoteVieNeuTTS.infer_stream%  s6      		'955Iyy22H  Y%6--i88II  	i&78;K!22488
&w/	%f-     0deee'	BBB 	` 	`E//y(KY^__________	` 	`s   %A= =
B
	B
c              #     K   t          |t          j                  rK|                                                                                                                                }nCt          |t          j                  r'|                                                                }n|}| 	                    |||          }| j
        d|dgd||dgdd}g }	d |D             }
d}t          |          }	 t          j        | j         d	|dd
          5 }|                                 |                                D ]}|s|                    d          }|                    d          s1|dd         }|dk    r n	 t'          j        |          d         d         d                             dd          }|rb|
                    |           t          |
|d                   | j        | j        z   k    r$t3          || j        z
  | j        z
  d          }|| j        z   | j        z   | j        z   }||z
  | j        z  }|| j        d| j        z  z   | j        z  z   }|
||         }|                     d                    |                    }| j        r!| j                             || j!                  }|||         }|	                    |           tE          |	| j#                  }t          |	          | j#        z  }|||         }|}|| j        z  }|V  # t&          j$        $ r Y w xY wddd           n# 1 swxY w Y   n*# tJ          $ r}tM          d|            Y d}~dS d}~ww xY wt          |
          |z
  }|dk    rt3          t          |
          | j        | j        z   |z   z
  d          }t          |
          |z
  |z
  | j        z
  | j        z  }|
|d         }|                     d                    |                    }||d         }|	                    |           tE          |	| j#                  }||d         }|V  dS dS )z-Internal helper to stream a single text chunkr  r  rB   rq  Tr"  c                     g | ]}d | d	S r`  ri   r  s     r)   r   z7RemoteVieNeuTTS._infer_stream_chunk.<locals>.<listcomp>V  s$    !P!P!P#"5c"5"5"5!P!P!Pr+   r   r$  r%  )r   r  r'  r   zdata:    Nz[DONE]r  deltar!  r^  r   r1  r  zError streaming chunk: )'r   r3   r  r>   rT  r  ry  r   r  r  r  r   r)  r*  r  r+  
iter_linesrx  r  r   loadsr|   r8  rM   rN   r   rO   rL   rK   r7  rd  r^   r9  rI   r*   rP   JSONDecodeErrorr8   r6   )r`   r<  r  r%  r*  r+  r,  r  r-  r  r  r  r  r   lineline_strdata_strr!  r  r  r  r  r  rW  r  r  r   r  s                               r)   r/  z#RemoteVieNeuTTS._infer_stream_chunk?  s     i.. 	'&]]__2244<<>>EEGGNN	2:.. 	'&..007799NN&N$$^XuEE _"(V<<=&01
 
 )+!P!P!P!P!P!" #N 3 33	-4= C C C'Z^hjkkk .!op""$$$LLNN +! +!D ! #{{733H#..x88 ! '|H8++!"&*X"6"6y"A!"DW"M"Q"QR[]_"`"`" 6(//888 !$K0@0A0A$B C CtGfim  jD  HD  !D  !D/23CdF]3]`d`}3}  @A  0B  0B-=@_-_bfb|-|  @D  @]  .]
0@<0OSWSb/b-9T=\_`cg  dA  `A  >A  EI  ET  =T  .T
-8j9P-Q
(,RWWZ5H5H(I(I $(#3 !r,0,<,L,LU`d`p,L,q,qE(-l:.E(F + 2 25 9 9 92EkZ^Zw2x2x2x25k2B2BTEb2b2ABSTcBc2d4C 1 0D4S S 0&5 5 5 5/ ! ! ! ![.! .! .! .! .! .! .! .! .! .! .! .! .! .! .!`  	 	 	/A//000FFFFF	
 {++.>>as;//43JTMj3jm}3}~  AB  C  CL,,|;>NNQUQnnrv  sB  BL$\]]3JLL!4!455E,--(Eu%%%1+dFcdddO-.?.@.@AO!!!!!!  sb   6 M A-MFL%#M%L84M7L88M;M MM MM 
M:M55M:c                   K   	 ddl }n# t          $ r t          d          w xY w|,|                    d|          }|                    d|          }|||                     |          }nB| j        r;||7	 |                     d          }|d         }|d         }n# t          $ r Y nw xY w||t          d          t          ||          }|s t          j
        g t          j                  S d	}||                                }d
}	 g }|D ]/}|                    |                     |||||	|
                     0t          j        |  d{V }t#          || j        ||          }| j        r!| j                            || j                  }||r|                                 d{V  S S # |r|                                 d{V  w w xY w)z<
        Asynchronous inference (Non-blocking I/O).
        r   N;Async requires 'aiohttp'. Install with: pip install aiohttpr   r   r/  r0  r   FTr1  )aiohttpr7   r|   r  rZ   r  r8   r   r   r   r2  r3  ClientSessionr8  _infer_chunk_asyncasynciogatherr	   rI   r^   r9  rn   )r`   r   r$  r  r%  r&  r'  r(  r)  r*  r+  sessionr<  r  r:  should_close_sessiontasksr<  wavsr?  s                       r)   infer_asynczRemoteVieNeuTTS.infer_async  s     	]NNNN 	] 	] 	][\\\	] 		'955Iyy22H  Y%6--i88II  	i&78;K!22488
&w/	%f-     0deee'	BBB 	28Bbj1111  %?++--G#' 	&E o oT44WeYPXZeglmmnnnn !///////D *$0@)[YYI f ,<<YTXTd<ee	 $ &mmoo%%%%%%%%&# &mmoo%%%%%%%%&s'   	 #:%B   
B-,B-BF7 7Gc                   K   t          |t          j                  rK|                                                                                                                                }nCt          |t          j                  r'|                                                                }n|}| 	                    |||          }| j
        d|dgd||dgdd}		 |                    | j         d|	d	          4 d
{V }
|
                                 |
                                 d
{V }|d         d         d         d         }|                     |          cd
d
d
          d
{V  S # 1 d
{V swxY w Y   d
S # t           $ r<}t#          d|            t          j        g t          j                  cY d
}~S d
}~ww xY w)z(Internal async helper for a single chunkr  r  rB   rq  Fr"  r$  r%  r&  Nr  r   r(  r!  zError in async chunk: r   )r   r3   r  r>   rT  r  ry  r   r  r  r  r*  r  r+  r   r7  r8   r6   r2  r3  )r`   rA  r<  r  r%  r*  r+  r,  r  r-  respr   r=  r   s                 r)   r>  z"RemoteVieNeuTTS._infer_chunk_async  sj     i.. 	'&]]__2244<<>>EEGGNN	2:.. 	'&..007799NN&N$$^XuEE _"(V<<=&01
 
	2||t}$G$G$Gg_a|bb 0 0 0 0 0 0 0fj%%'''!YY[[((((((!)_Q/	:9E
||J//	0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  	2 	2 	2.1..///8Bbj111111111	2sC   &F  =AE-F  -
E77F  :E7;F   
G
1G;GGr  concurrency_limitc                    	
K   	 ddl }n# t          $ r t          d          w xY w|,|                    d          |                    d          |                     |          nB j        r;7	                      d          }|d         |d         n# t          $ r Y nw xY wt          d          t          j	        |          |
                                4 d{V  	
f
dfd|D             }t          j        |  d{V }ddd          d{V  n# 1 d{V swxY w Y   |S )	z@
        High-performance Asynchronous Batch Inference.
        r   Nr;  r   r   r/  c                    
K   4 d {V                       | 	
	  	         d {V cd d d           d {V  S # 1 d {V swxY w Y   d S )N)r  r%  r&  r'  r(  r*  r+  rA  )rE  )r   r(  r&  r  r%  r`   semrA  r'  r*  r+  s    r)   bounded_inferz8RemoteVieNeuTTS.infer_batch_async.<locals>.bounded_infer  s             !%!1!1	H"+yk$/u '	 "2 " "                                   s   $A
AAc                 &    g | ]} |          S ri   ri   )r   r   rL  s     r)   r   z5RemoteVieNeuTTS.infer_batch_async.<locals>.<listcomp>  s#    ;;;T]]4((;;;r+   )r<  r7   r|   r  rZ   r  r8   r   r?  	Semaphorer=  r@  )r`   r  r$  r  r%  r&  r'  r(  r)  r*  r+  rH  r<  r  rC  resultsrL  rK  rA  s   `  ````` ``     @@@r)   infer_batch_asyncz!RemoteVieNeuTTS.infer_batch_async  st     	]NNNN 	] 	] 	][\\\	] 		'955Iyy22H Y%6--i88II  	i&78;K!22488
&w/	%f-     0deee  122((** 	3 	3 	3 	3 	3 	3 	3g              <;;;U;;;E#NE2222222G	3 	3 	3 	3 	3 	3 	3 	3 	3 	3 	3 	3 	3 	3 	3 	3 	3 	3 	3 	3 	3 	3 	3 	3 	3 	3 	3 s,    .%B+ +
B87B8<1D??
E	E	)r  r  r?   r>   Nr  )NNNr  Nr"  r#  )
NNNr  r   r!  Nr"  r#  N)
NNNr  r   r!  Nr"  r#  r#  )r  r  r  r  rf   rT   r  rI  r   r  r   r   r  r3   r  r  r   r@  r   rD  r/  rE  r>  rP  __classcell__)r  s   @r)   r  r    s         -*/$M $M $M $M $M $ML  S	 S c VY    V V# V#* V
UZUaHa Vtw V  LO V  bg V  }B V  QU V  kp V  B V  LN  LV V V V Vp` ` `t `rz\a\hOh `{~ `  SV `  ei `  D `  SV `  `i  jl  jt  vz  |@  j@  `A ` ` ` `4]" ]" ]"~7& 7&c 7&cDj 7&TVT^afamTm 7&  AD 7&  X[ 7&  ns 7&  IN 7&  ]a 7&  w| 7&  KN 7&  fh  fp 7& 7& 7& 7&r2 2 2>+ +T#Y +3: +acaknsnzaz +  NQ +  eh +  {@ +  V[ +  jn +  DI +  X[ +  ux +  BF  GI  GQ  BR + + + + + + + +r+   r  standardc                 X    | xdk    rn	xdk    rn n t          di |S 	 t          di |S )z
    Factory function for VieNeu-TTS.
    
    Args:
        mode: 'standard' (CPU/GPU-GGUF), 'remote' (API)
        **kwargs: Arguments for chosen class
        
    Returns:
        VieNeuTTS | RemoteVieNeuTTS instance
    remoteapiNri   )r  r<   )r/   kwargss     r)   VieneurW  $  sM     XXXX",,V,,,&&v&&&r+   )rR  )$pathlibr   typingr   r  rT  r   r3   r   r   r   vieneu_utils.phonemize_textr   vieneu_utils.core_utilsr   r	   collectionsr
   rO  rv   r   r)  r?  huggingface_hubr   concurrent.futuresr   r  r  rI  r*   r:   r<   r  r  rW  ri   r+   r)   <module>r_     s                     . . . . . . . . ; ; ; ; ; ; M M M M M M M M # # # # # # 				 				    + + + + + + 1 1 1 1 1 1RZ 0 # "*    4  6u
" u
" u
" u
" u
" u
" u
" u
"z}	
 }	
 }	
 }	
 }	
 }	
 }	
 }	
HU U U U Ui U U Up' ' ' ' ' 'r+   