
    PiK                        d dl Z  G d d          Zedk    rp e            ZddgZ ed            ed            ed           eD ]@Z ed	e            e                    e          Z ed
e             ed           ?dS dS )    Nc                   x    e Zd ZdZd Zd Zd Zd Zd Zd Z	d Z
d	 Zd
 Zd Zd Zd Zd Zd Zd Zd Zd ZdS )VietnameseTTSNormalizerz
    A text normalizer for Vietnamese Text-to-Speech systems.
    Converts numbers, dates, units, and special characters into readable Vietnamese text.
    c                    i ddddddddd	d
ddddddddddddddddddddddddi dd d!d d"d#d$d%d&d%d'd(d)d(d*d+d,d+d-d.d/d.d0d1d2d3d4d5d6d7d8d9d:d;i d<d=d>d?d@dAdBdCdDdEdFdGdHdIdJdKdLdMdNdOdPdQdRdSdTdSdUdVdWdXdYdZd[d\d]d^d_d`dadbdcdddedfdgdhdidj| _         g dk| _        d S )lNkmu   ki lô métdmu   đê xi métcmu   xen ti métmmu
   mi li métnmu   na nô métu   µmu   mic rô métu   μmmu   métkgu
   ki lô gamggammgz	mi li gamu   km²u   ki lô mét vuôngkm2u   m²u   mét vuôngm2u   cm²u   xen ti mét vuôngcm2u   mm²u   mi li mét vuôngmm2hau   héc tau   km³u   ki lô mét khốikm3u   m³u   mét khốim3u   cm³u   xen ti mét khốicm3u   mm³u   mi li mét khốimm3lu   lítdlu   đê xi lítmlu
   mi li líthlu   héc tô lítvu   vônkvu   ki lô vônmvu
   mi li vônazam pemazmi li am pekau   ki lô am pewu   oátkwu   ki lô oátmwu   mê ga oátgwu
   gi ga oátkwhu   ki lô oát giờmwhu   mê ga oát giờwhu
   oát giờu   ωu   ômohmu   kωu
   ki lô ômu   mωu
   mê ga ômhzu   héckhzu   ki lô hécu   mê ga hécu
   gi ga hécu   pát calu   ki lô pát calu   mê ga pát calbazmi li bau   át mốt phiau
   pi ét xaigiunu   ki lô giunzca lou   ki lô ca lo)mhzghzpakpampabarmbaratmpsijkjcalkcal)
   khôngu   mộthair-   u   bốnu   nămu   sáuu   bảyu   támu   chín)unitsdigits)selfs    o/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vieneu_utils/normalize_text.py__init__z VietnameseTTSNormalizer.__init__	   s    
- 
!%~ 
7;] 
, 
 $m 
5:N 
 > 
 $' 

 , 

 !$U 

 -1+ 
 ( 
 +01E 
 = 
 #' 
 ( 
 +01E 
  
 ' 
 */0C 
 ) 
 ( 
 +01E 
 = 
 #' 
 ( 
 +01E 
  '! 
  */0C! 
" # 
" ~# 
" 04\# 
" DH# 
& ' 
& }' 
  
  
& /3L' 
( ) 
( ) 
( 04^) 
* + 
* }+ 
* /3M+ 
* DH+ 
, &- 
, )./B- 
, EI,- 
. %/ 
. / 
. (-l/ 
. =B</ 
2 &3 
2  3 
  
2 8E\%6?P4D\}N? 
  
  

D@ @ @    c                 R   g dfd}t          j        d||t           j                  }|                                }|                     |          }|                     |          }|                     |          }|                     |          }|                     |          }| 	                    |          }| 
                    |          }|                     |          }|                     |          }|                     |          }|                     |          }|                     |          }t!                    D ]C\  }}|                                        |                                          |dz             }D|                     |          }|S )z3Main normalization pipeline with EN tag protection.z___EN_PLACEHOLDER_{}___ c                                          |                     d                                         t                    dz
            S )Nr      )appendgroupformatlen)matchen_contentsplaceholder_patterns    rA   
extract_enz5VietnameseTTSNormalizer.normalize.<locals>.extract_en5   sB    u{{1~~...&--c+.>.>.BCCCrC   z<en>.*?</en>flags )resub
IGNORECASElower_normalize_temperature_normalize_currency_normalize_percentage_normalize_units_normalize_time_normalize_date_normalize_phone_normalize_versions_normalize_numbers_number_to_words_normalize_special_chars_normalize_whitespace	enumeratereplacerI   )r@   textrN   idx
en_contentrL   rM   s        @@rA   	normalizez!VietnameseTTSNormalizer.normalize/   s    8	D 	D 	D 	D 	D 	D voz4r}MMM zz||**400''--))$//$$T**##D))##D))$$T**''--&&t,,$$T**,,T22))$//  )55 	[ 	[OC<< 3 : :3 ? ? E E G GVYIYZZDD ))$//rC   c                 B   t          j        dd|t           j                  }t          j        dd|t           j                  }t          j        dd|t           j                  }t          j        dd	|t           j                  }t          j        d
d|          }|S )z&Convert temperature notation to words.u   -(\d+(?:[.,]\d+)?)\s*°\s*c\bu   âm \1 độ xêrO   u   -(\d+(?:[.,]\d+)?)\s*°\s*f\bu   âm \1 độ épu   (\d+(?:[.,]\d+)?)\s*°\s*c\bu   \1 độ xêu   (\d+(?:[.,]\d+)?)\s*°\s*f\bu   \1 độ ép   °u    độ rR   rS   rT   r@   rd   s     rA   rV   z.VietnameseTTSNormalizer._normalize_temperatureS   s    v68KTY[Yfgggv68KTY[Yfgggv5TVTabbbv5TVTabbbveY--rC   c                      fd}t          j        d||t           j                  }t          j        dd|t           j                  }t          j        dd|t           j                  }t          j        dd	|t           j                  }t          j        d
d|          }t          j        dd|t           j                  }t          j        dd|          }t          j        dd|          }|S )z#Convert currency notation to words.c                 4   |                      d          }|                      d          }|                      d          }d                    fd|D                       }dddd	}|                    |                                |          }| d
| d| S )NrF         rQ   c                 D    g | ]}j         t          |                   S  r?   int.0dr@   s     rA   
<listcomp>zYVietnameseTTSNormalizer._normalize_currency.<locals>.decimal_currency.<locals>.<listcomp>b   &    %K%K%Kadk#a&&&9%K%K%KrC   u   nghìnu   triệuu   tỷ)kr   bu    phẩy )rH   joingetrU   )rK   wholedecimalunitdecimal_wordsunit_map	unit_wordr@   s          rA   decimal_currencyzEVietnameseTTSNormalizer._normalize_currency.<locals>.decimal_currency^   s    KKNNEkk!nnG;;q>>DHH%K%K%K%K7%K%K%KLLM%IFCCH TZZ\\488I@@]@@Y@@@rC   z(\d+)[.,](\d+)\s*([kmb])\brO   z(\d+)\s*k\bu	   \1 nghìnz(\d+)\s*m\bu
   \1 triệuz(\d+)\s*b\bu   \1 tỷu   (\d+(?:[.,]\d+)?)\s*đ\bu
   \1 đồngz(\d+(?:[.,]\d+)?)\s*vnd\bz\$\s*(\d+(?:[.,]\d+)?)u
   \1 đô laz(\d+(?:[.,]\d+)?)\s*\$rj   )r@   rd   r   s   `  rA   rW   z+VietnameseTTSNormalizer._normalize_currency\   s    	A 	A 	A 	A 	A v35EtSUS`aaavnlDNNNvnmTOOOvnj$bmLLLv1=$GGv2M4r}]]]v/EEv/EErC   c                 2    t          j        dd|          }|S )zConvert percentage to words.z(\d+(?:[.,]\d+)?)\s*%u   \1 phần trămrR   rS   rk   s     rA   rX   z-VietnameseTTSNormalizer._normalize_percentageq   s    v.0BDIIrC   c                      fd} fd}t          j        d||          }t          j        d||          }t           j                                        d d          }|D ]D\  }dt          j                  z   d	z   }t          j        |d
| |t           j                  }E|D ]\\  }t          fddD                       r<d	t          j                  z   d	z   }t          j        |||t           j                  }]|S )z#Convert measurement units to words.c                 J   |                      d          }|                      d                                          }|                      d                                          }j                            ||          }j                            ||          }| d| d| S )NrF   rn   ro   rQ       trên rH   rU   r>   r|   )rK   numberunit1unit2
full_unit1
full_unit2r@   s         rA   expand_compound_with_numberzMVietnameseTTSNormalizer._normalize_units.<locals>.expand_compound_with_numberx   s    [[^^FKKNN((**EKKNN((**Eu55Ju55J>>z>>*>>>rC   c                    |                      d                                          }|                      d                                          }j                            ||          }j                            ||          }| d| S )NrF   rn   r   r   )rK   r   r   r   r   r@   s        rA   expand_compound_without_numberzPVietnameseTTSNormalizer._normalize_units.<locals>.expand_compound_without_number   sw    KKNN((**EKKNN((**Eu55Ju55J 55555rC   uD   (\d+(?:[.,]\d+)?)\s*([a-zA-Zμµ²³°]+)/([a-zA-Zμµ²³°0-9]+)\bu2   \b([a-zA-Zμµ²³°]+)/([a-zA-Zμµ²³°0-9]+)\bc                 ,    t          | d                   S Nr   )rJ   )xs    rA   <lambda>z:VietnameseTTSNormalizer._normalize_units.<locals>.<lambda>   s    AaD		 rC   T)keyreversez(\d+(?:[.,]\d+)?)\s*z\bz\1 rO   c              3       K   | ]}|v V  	d S )Nrq   )ru   cr   s     rA   	<genexpr>z;VietnameseTTSNormalizer._normalize_units.<locals>.<genexpr>   s'      //19//////rC   u   ²³°)rR   rS   sortedr>   itemsescaperT   any)r@   rd   r   r   sorted_units	full_namepatternr   s   `      @rA   rY   z(VietnameseTTSNormalizer._normalize_unitsv   sV   	? 	? 	? 	? 	?	6 	6 	6 	6 	6 v]0$8 8vK3T; ; dj..006I6ISWXXX+ 	R 	ROD)-	$?%GG6'#5)#5#5t2=QQQDD+ 	M 	MOD)////h///// M")D//1E9vgy$bmLLLrC   c                     d }t          j        d||          }t          j        d||          }t          j        d||          }t          j        d||          }|S )z/Convert time notation to words with validation.c                 0   |                                  }t          |          dk    r|\  }}}t          |          t          |          t          |          }}}d|cxk    rdk    sn |                     d          S d|cxk    rdk    sn |                     d          S d|cxk    rdk    sn |                     d          S | d| d| dS t          |          dk    rs|\  }}t          |          t          |          }}d|cxk    rdk    sn |                     d          S d|cxk    rdk    sn |                     d          S | d| d	S |d         }t          |          }d|cxk    rdk    sn |                     d          S | d
S )z+Validate time components before converting.ro   r      ;   u    giờ u    phút u    giâyrn   u    phútu    giờ)groupsrJ   rs   rH   )rK   r   hourminutesecondhour_int
minute_int
second_ints           rA   validate_and_convert_timezJVietnameseTTSNormalizer._normalize_time.<locals>.validate_and_convert_time   s   \\^^F 6{{a'-$ff36t99c&kk3v;;j*X++++++++ ;;q>>)Z----2---- ;;q>>)Z----2---- ;;q>>)DDvDDfDDDD V!!%f'*4yy#f++*X++++++++ ;;q>>)Z----2---- ;;q>>)55v5555 ayt99X++++++++ ;;q>>)&rC   z(\d{1,2}):(\d{2}):(\d{2})z(\d{1,2}):(\d{2})z(\d{1,2})h(\d{2})z(\d{1,2})h\br   )r@   rd   r   s      rA   rZ   z'VietnameseTTSNormalizer._normalize_time   sl    &	' &	' &	'P v24MtTTv*,EtLLv*,EtLLvo'@$GGrC   c                    d fdfd}fdt          j        dfd|          }t          j        dfd|          }t          j        d	||          }t          j        d
|          }t          j        d|          }|S )z/Convert date notation to words with validation.c                     t          |           t          |          t          |          }}} d| cxk    rdk    sn dS d|cxk    rdk    sn dS dS )z#Check if date components are valid.rF      F   T)rs   )daymonthyears      rA   is_valid_datez>VietnameseTTSNormalizer._normalize_date.<locals>.is_valid_date   sf    "3xxUSYYCNNNNNNNNu$$$$"$$$$u4rC   c                     |                                  \  }}} |||          rd| d| d| S |                     d          S N   ngày     tháng     năm r   r   rH   )rK   r   r   r   r   s       rA   date_to_textz=VietnameseTTSNormalizer._normalize_date.<locals>.date_to_text   s\    $||~~C}S%.. A@@@U@@$@@@;;q>>!rC   c                     |                                  \  }}} |||          rd| d| d| S |                     d          S r   r   )rK   r   r   r   r   s       rA   date_iso_to_textzAVietnameseTTSNormalizer._normalize_date.<locals>.date_iso_to_text   s\    $||~~D%}S%.. A@@@U@@$@@@;;q>>!rC   c                     |                                  \  }}}t          |          dk     rd| nd| } |||          rd| d| d| S |                     d          S )N2   2019r   r   r   r   )r   rs   rH   )rK   r   r   r   	full_yearr   s        rA   date_short_yearz@VietnameseTTSNormalizer._normalize_date.<locals>.date_short_year   s    $||~~C'*4yy2~~T;;;I}S%33 FEEEUEE)EEE;;q>>!rC   u/   \bngày\s+(\d{1,2})[/\-](\d{1,2})[/\-](\d{4})\bc                 B     |                                dd          S Nu   ngày ngàyu   ngàyrc   )r   r   s    rA   r   z9VietnameseTTSNormalizer._normalize_date.<locals>.<lambda>   s    ll1oo55mWMM rC   u/   \bngày\s+(\d{1,2})[/\-](\d{1,2})[/\-](\d{2})\bc                 B     |                                dd          S r   r   )r   r   s    rA   r   z9VietnameseTTSNormalizer._normalize_date.<locals>.<lambda>   s!    ooa0088PP rC   z\b(\d{4})-(\d{1,2})-(\d{1,2})\bz'\b(\d{1,2})[/\-](\d{1,2})[/\-](\d{4})\bz'\b(\d{1,2})[/\-](\d{1,2})[/\-](\d{2})\br   )r@   rd   r   r   r   r   s      @@@rA   r[   z'VietnameseTTSNormalizer._normalize_date   s    		 		 			" 	" 	" 	" 	"	" 	" 	" 	" 	"	" 	" 	" 	" 	" vHMMMMtU UvHPPPPRVX Xv8:JDQQv@,PTUUv@/SWXXrC   c                 j      fd}t          j        d||          }t          j        d||          }|S )z0Convert phone numbers to digit-by-digit reading.c                 t   |                      d          }t          j        dd|          }|                    d          r t	          |          dk    rd|dd          z   }dt	          |          cxk    rdk    r)n n&fd	|D             }d
                    |          d
z   S |                      d          S )Nr   z[^\d] 84
   0rn      c                 D    g | ]}j         t          |                   S rq   rr   rt   s     rA   rw   zSVietnameseTTSNormalizer._normalize_phone.<locals>.phone_to_text.<locals>.<listcomp>   s&    <<<SVV,<<<rC   rQ   )rH   rR   rS   
startswithrJ   r{   )rK   phonewordsr@   s      rA   phone_to_textz?VietnameseTTSNormalizer._normalize_phone.<locals>.phone_to_text   s    KKNNEF8R//E%% (#e***:*:eABBiSZZ%%%%2%%%%%<<<<e<<<xx,,;;q>>!rC   z"(\+84|84)[\s\-\.]?\d[\d\s\-\.]{7,}z\b0\d[\d\s\-\.]{8,}r   )r@   rd   r   s   `  rA   r\   z(VietnameseTTSNormalizer._normalize_phone   sK    	" 	" 	" 	" 	" v;]DQQv,mTBBrC   c                 8    d }t          j        d||          }|S )z,Convert version numbers like 1.0.4 to words.c                 |    |                      d                              d          }d                    |          S )Nr   .u    chấm )rH   splitr{   )rK   partss     rA   version_to_textzDVietnameseTTSNormalizer._normalize_versions.<locals>.version_to_text
  s3    KKNN((--E ??5)))rC   z\b\d+(?:\.\d+){1,}\br   )r@   rd   r   s      rA   r]   z+VietnameseTTSNormalizer._normalize_versions  s,    	* 	* 	* v-EErC   c                      t          j        dd |          }t          j        dd |          } fd}t          j        d||          }t          j        d||          }|S )Nz(\d+(?:[,.]\d+)?)%c                 2    |                      d           dS )NrF   u    phần trăm)rH   r   s    rA   r   z<VietnameseTTSNormalizer._normalize_numbers.<locals>.<lambda>  s    !''!**7S7S7S rC   z(\d{1,3})(?:\.(\d{3}))+c                 T    |                      d                              dd          S )Nr   r   r   )rH   rc   r   s    rA   r   z<VietnameseTTSNormalizer._normalize_numbers.<locals>.<lambda>  s!    AGGAJJ<N<NsTV<W<W rC   c                     |                      d          }|                      d          }d                    fd|D                       }d|                      d          v rdnd}| d| d| S )	NrF   rn   rQ   c                 D    g | ]}j         t          |                   S rq   rr   rt   s     rA   rw   zXVietnameseTTSNormalizer._normalize_numbers.<locals>.decimal_to_words.<locals>.<listcomp>  rx   rC   ,r   u   phẩyu   chấm)rH   r{   )rK   r}   r~   r   	separatorr@   s        rA   decimal_to_wordszDVietnameseTTSNormalizer._normalize_numbers.<locals>.decimal_to_words  s    KKNNEkk!nnGHH%K%K%K%K7%K%K%KLLM$'5;;q>>$9$9xI99i99-999rC   z(\d+),(\d+)z(\d+)\.(\d{1,2})\br   )r@   rd   r   s   `  rA   r^   z*VietnameseTTSNormalizer._normalize_numbers  s~    v+-S-SUYZZv02W2WY]^^	: 	: 	: 	: 	: vn&6==v+-=tDDrC   c                 L   |dk     r| j         |         S |dk    rdS |dk     r|dk    rdS d| j         |dz            S |dz  }|dz  }|dk    r| j         |          dS |d	k    r| j         |          d
S |dk    r| j         |          dS | j         |          d| j         |          S )z%Read two-digit numbers in Vietnamese.r   u   mười      u   mười lămu   mười r   u    mươirF   u    mươi mốt   u    mươi lămu    mươi )r?   )r@   ntensoness       rA   _read_two_digitsz(VietnameseTTSNormalizer._read_two_digits&  s    r66;q>!"WW9VVBww%~3dk!b&13337Dr6Dqyy+d+4444+d+::::+d+9999+d+HHT[5FHHHrC   c                     |dk     r|                      |          S |dz  }|dz  }| j        |          d}|dk    r|S |dk     r|d| j        |          z  }n|d|                      |           z  }|S )z'Read three-digit numbers in Vietnamese.d   u    trămr   r   u    lẻ rQ   )r   r?   )r@   r   hundreds	remainderresults        rA   _read_three_digitsz*VietnameseTTSNormalizer._read_three_digits<  s    s77((+++8G	K)111>>M^^7t{95777FF<$//	::<<<FrC   c                    |dk    rdS |dk     rd|                      |            S |dk    rE|dz  }|dz  }|                     |           d}|dk    r|d|                      |           z  }|S |dk    rE|dz  }|dz  }|                     |           d}|dk    r|d|                      |           z  }|S |d	k    r|d	z  }|d	z  }|                     |           d
}|dk    rW|dk     r|d| j        |          z  }n=|dk     r|d|                     |           z  }n|d|                     |           z  }|S |                     |          S )z%Convert a number to Vietnamese words.r   r<   u   âm i ʚ;u    tỷrQ   i@B u    triệui  u    nghìnr   u    không trăm lẻ r   u    không trăm )_convert_number_to_wordsr   r?   r   )r@   numbillionr   r   millionthousands          rA   r   z0VietnameseTTSNormalizer._convert_number_to_wordsN  s   !88877?$77==???*Z'Gj(I//88???F1}}Hd;;IFFHHHMG^^WnGgI//88BBBF1}}Hd;;IFFHHHMD[[d{Hd
I//99BBBF1}}r>>LDK	4JLLLFF__Qt/D/DY/O/OQQQFFF$"9"9)"D"DFFFFM **3///rC   c                 >      fd}t          j        d||          }|S )z'Convert all remaining numbers to words.c                 r    t          |                     d                    }                    |          S r   )rs   rH   r   )rK   r   r@   s     rA   convert_numberz@VietnameseTTSNormalizer._number_to_words.<locals>.convert_numberx  s.    ekk!nn%%C00555rC   z\b\d+\br   )r@   rd   r   s   `  rA   r_   z(VietnameseTTSNormalizer._number_to_wordsv  s6    	6 	6 	6 	6 	6 vj.$77rC   c                 J   |                     dd          }|                     dd          }|                     dd          }|                     dd          }|                     dd          }|                     dd          }|                     dd          }|                     d	d
          }|                     dd          }t          j        dd|          }t          j        dd|          }t          j        dd|          }t          j        dd|          }t          j        dd|          }t          j        dd|          }t          j        dd|          }t          j        dd|          }t          j        dd|          }t          j        dd|          }|S )zHandle special characters."r   'z", '')
        text = text.replace(&u    và +u    cộng =u    bằng #u    thăng z[\(\[\{]\s*(.*?)\s*[\)\]\}]z, \1, z[\[\]\(\)\{\}]rQ   u/   (?:\s+|^)[-–—]\s*(.*?)\s*[-–—](?:\s+|$)z, \1 , u   \s+[-–—]+\s+z, u   ^[-–—]+\s+z\s*,\s*z,\s*,+r   z\.{2,}z\s+\.\s+u   [^\w\sàáảãạăắằẳẵặâấầẩẫậèéẻẽẹêếềểễệìíỉĩịòóỏõọôốồổỗộơớờởỡợùúủũụưứừửữựỳýỷỹỵđ.,!?;:@%_])rc   rR   rS   rk   s     rA   r`   z0VietnameseTTSNormalizer._normalize_special_chars  s    ||C$$||C$$||  !#% %||C$$||C$$||C))||C,,||C,,||C,,v4iFF v'd33 vH*VZ[[ v)466 v't44 vj$--vid++vid++vk3--v  ]  _b  dh  i  irC   c                 Z    t          j        dd|          }|                                }|S )zNormalize whitespace.z\s+rQ   )rR   rS   striprk   s     rA   ra   z-VietnameseTTSNormalizer._normalize_whitespace  s'    vfc4((zz||rC   N)__name__
__module____qualname____doc__rB   rg   rV   rW   rX   rY   rZ   r[   r\   r]   r^   r   r   r   r_   r`   ra   rq   rC   rA   r   r      s*        
$@ $@ $@L" " "H    *  
     D0 0 0d) ) )V  &     I I I,  $&0 &0 &0P  $ $ $L    rC   r   __main__u<  Chỉ cần thay đổi một dấu thanh, ý nghĩa của từ đã hoàn toàn khác biệt. Ví dụ như "ma", "má", "mà", "mả", "mã", "mạ" – đây chính là "bài toán khó" mà các kỹ sư công nghệ phải giải quyết để tạo ra một giọng đọc tự nhiên như người bản xứ.uR   Phiên bản hiện tại là 1.0.4 và địa chỉ IP của tôi là 192.168.1.1zP================================================================================z/VIETNAMESE TTS NORMALIZATION TEST (WITH EN TAG)u   
📝 Input: u   🎵 Output: zP--------------------------------------------------------------------------------)	rR   r   r  
normalizer
test_textsprintrd   rg   
normalizedrq   rC   rA   <module>r     s   				f f f f f f f fR z((**J 	U\J
 
E(OOO	E
;<<<	E(OOO  %t%%&&&))$//
*j**+++h!  rC   