
     `i"                     T    d Z ddlZej        dk    rddlZnddlZ G d d          ZdS )z"English Normalizer class for CLVP.    N)      c                       e Zd Zd ZdedefdZdedefdZdedefdZdedefd	Z	dedefd
Z
dedefdZdedefdZdedefdZdedefdZdedefdZd ZdS )EnglishNormalizerc                 ^    d dD             | _         g d| _        g d| _        g d| _        d S )Nc                 n    g | ]2}t          j        d |d         z  t           j                  |d         f3S )z\b%s\.r      )recompile
IGNORECASE).0xs     ~/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/clvp/number_normalizer.py
<listcomp>z.EnglishNormalizer.__init__.<locals>.<listcomp>   sH     
 
 
 Z
QqT)2=991Q4@
 
 
    ))mrsmisess)mrmister)drdoctor)stsaint)cocompany)jrjunior)majmajor)gengeneral)drsdoctors)revreverend)lt
lieutenant)hon	honorable)sgtsergeant)captcaptain)esqesquire)ltdlimited)colcolonel)ftfort)
 onetwothreefourfivesixseveneightnine)
teneleventwelvethirteenfourteenfifteensixteen	seventeeneighteennineteen)
r6   r6   twentythirtyfortyfiftysixtyseventyeightyninety)_abbreviationsonesteenstens)selfs    r   __init__zEnglishNormalizer.__init__   s\    
 

 
 
0 a``	
 
 

 kjj			r   numreturnc                    |dk    rdS |dk     r%d|                      t          |                    z   S |dk     r| j        |         S |dk     r| j        |dz
           S |dk     r7| j        |dz           |dz  dk    rd|                      |dz            z   ndz   S |d	k     r:| j        |dz           d
z   |dz  dk    rd|                      |dz            z   ndz   S |dk     rB|                      |d	z            dz   |d	z  dk    rd|                      |d	z            z   ndz   S |dk     rB|                      |dz            dz   |dz  dk    rd|                      |dz            z   ndz   S |dk     rB|                      |dz            dz   |dz  dk    rd|                      |dz            z   ndz   S |dk     rB|                      |dz            dz   |dz  dk    rd|                      |dz            z   ndz   S |dk     rB|                      |dz            dz   |dz  dk    rd|                      |dz            z   ndz   S dS )ax  
        Converts numbers(`int`) to words(`str`).

        Please note that it only supports upto - "'nine hundred ninety-nine quadrillion, nine hundred ninety-nine
        trillion, nine hundred ninety-nine billion, nine hundred ninety-nine million, nine hundred ninety-nine
        thousand, nine hundred ninety-nine'" or `number_to_words(999_999_999_999_999_999)`.
        r   zerozminus 
      d   -r6      hundred i@B z	 thousand, i ʚ;z millionl    J)z billionl     I5 z	 trillionl     NZoz quadrillionznumber out of range)number_to_wordsabsrS   rT   rU   )rV   rX   s     r   rd   z!EnglishNormalizer.number_to_wordsF   s    !8861WWd223s88<<<<2XX9S>!2XX:cBh''3YY9SBY'SVY[S[_`S`S`31E1EcBh1O1O+O+Ofhii4ZZ	#*%
2_beh_hlm_m_mcD<P<PQTWZQZ<[<[6[6[suv 9__$$SD[11>ADjAoo4$..sTz::::SUW
 =  $$SI%566CF?VWCWCW4$..sY????]_a
 $$$$$SM%9::GJ]GZ^_G_G_4$..s]/BCCCCegi
 ((($$S,=%=>>KNQbKbfgKgKg4$..s5F/FGGGGmoq
 ,,,$$S,A%ABB ! 22a77 4//6K0KLLLL )(r   textc                 T    |                     dd                              d          S )z+
        Converts unicode to ascii
        asciiignorezutf-8)encodedecoderV   rf   s     r   convert_to_asciiz"EnglishNormalizer.convert_to_ascii   s&     {{7H--44W===r   mc                    |                     d          }|                    d          }t          |          dk    r|dz   S |d         rt          |d                   nd}t          |          dk    r|d         rt          |d                   nd}|r#|r!|dk    rdnd}|dk    rdnd	}|d
|d|d
|S |r|dk    rdnd}|d
|S |r|dk    rdnd	}|d
|S dS )zZ
        This method is used to expand numerical dollar values into spoken words.
        r	   .   z dollarsr   dollardollarscentcentsrb   rc   zzero dollars)groupsplitlenint)rV   rn   matchpartsrs   ru   dollar_unit	cent_units           r   _expand_dollarsz!EnglishNormalizer._expand_dollars   s*    

C  u::>>:%%#(82#eAh---!$UaE!HE!H! 	"u 	"&-ll((	K"'1**'I%,WWkkk555))LL 	"&-ll((	K%gg{{33 	""'1**'I#eeYY//!>r   c                 T    |                     d                              dd          S )zF
        This method is used to remove commas from sentences.
        r	   ,r6   rv   replacerV   rn   s     r   _remove_commasz EnglishNormalizer._remove_commas   s$     wwqzz!!#r***r   c                 T    |                     d                              dd          S )zO
        This method is used to expand '.' into spoken word ' point '.
        r	   rp   z point r   r   s     r   _expand_decimal_pointz'EnglishNormalizer._expand_decimal_point   s$     wwqzz!!#y111r   c                     dddd}t          |                    d          dd                   }d|d	z  k    r|d	z  d
k    rd}n|                    |dz  d          }|                     |          |z   S )z`
        This method is used to expand ordinals such as '1st', '2nd' into spoken words.
        r   ndrd)r	   rq   r   r   Nr\   r^   r]   th)ry   rv   getrd   )rV   rX   ordinal_suffixessuffixs       r   _expand_ordinalz!EnglishNormalizer._expand_ordinal   s      $66#))A,,ss#$$s??sSyBFF%))#(D99F##C((611r   c                 X   t          |                    d                    }|dk    rn|dk     rh|dk    rdS |dk    r!|dk     rd|                     |dz            z   S |dz  dk    r|                     |dz            d	z   S |                     |          S |                     |          S )
a  
        This method acts as a preprocessing step for numbers between 1000 and 3000 (same as the original repository,
        link :
        https://github.com/neonbjb/tortoise-tts/blob/4003544b6ff4b68c09856e04d3eff9da26d023c2/tortoise/utils/tokenizer.py#L86)
        r   r`   i  i  ztwo thousandi  ztwo thousand r^   ra   )ry   rv   rd   )rV   rn   rX   s      r   _expand_numberz EnglishNormalizer._expand_number   s     !''!**oo::#**d{{%~td

&)=)=cCi)H)HHHsa++C3J77*DD++C000'',,,r   c                 @   t          j        d| j        |          }t          j        dd|          }t          j        d| j        |          }t          j        d| j        |          }t          j        d| j        |          }t          j        d| j        |          }|S )z
        This method is used to normalize numbers within a text such as converting the numbers to words, removing
        commas, etc.
        z([0-9][0-9,]+[0-9])u   £([0-9,]*[0-9])z	\1 poundsz\$([0-9.,]*[0-9])z([0-9]++\.[0-9]+)z[0-9]++(st|nd|rd|th)z[0-9]+)r
   subr   r~   r   r   r   rl   s     r   normalize_numbersz#EnglishNormalizer.normalize_numbers   s    
 v,d.A4HHv)<>>v*D,@$GGv*D,FMMv-t/CTJJvi!4d;;r   c                 L    | j         D ]\  }}t          j        |||          }|S )z/
        Expands the abbreviate words.
        )rR   r
   r   )rV   rf   regexreplacements       r   expand_abbreviationsz&EnglishNormalizer.expand_abbreviations   s5     #'"5 	4 	4E;6%d33DDr   c                 R    t          j        t          j        d          d|          S )z.
        Removes multiple whitespaces
        z\s+rb   )r
   r   r   rl   s     r   collapse_whitespacez%EnglishNormalizer.collapse_whitespace   s"     vbj((#t444r   c                    |                      |          }|                                }|                     |          }|                     |          }|                     |          }|                    dd          }|S )z
        Converts text to ascii, numbers / number-like quantities to their spelt-out counterparts and expands
        abbreviations
        "r6   )rm   lowerr   r   r   r   rl   s     r   __call__zEnglishNormalizer.__call__   su     $$T**zz||%%d++((..''--||C$$r   N)__name__
__module____qualname__rW   ry   strrd   rm   r~   r   r   r   r   r   r   r   r    r   r   r   r      s       'k 'k 'kR9)3 9)3 9) 9) 9) 9)v>S >S > > > >" " " " " "0+ + + + + +2s 2s 2 2 2 223 23 2 2 2 2- - - - - -(c c         5 5 5 5 5 5    r   r   )__doc__sysversion_infor
   r   r   r   r   r   <module>r      sy     ) ( 



 wIIIIX X X X X X X X X Xr   