
     `iIY                     8   d dl Z d dlZd dlmZ d dlmZ d dl mZ d dlmZm	Z	 d dl
Z
i dddd	d
ddddddddddddddddddddddddd d!d"Zd/d$efd%Zd$efd&Z G d' d(          Z G d) d*          Z G d+ d,          Z G d- d.          ZdS )0    N)Iterator)Fraction)Match)OptionalUnionu   œoeu   ŒOE   øo   ØO   æae   ÆAE   ßssu   ẞSSu   đdu   ĐD   ð   Ð   þth   Þu   łlu   ŁL sc                 x    fdd                     fdt          j        d|           D                       S )z
    Replace any other markers, symbols, and punctuations with a space, and drop any diacritics (category 'Mn' and some
    manual mappings)
    c                     | v r| S | t           v rt           |          S t          j        |           dk    rdS t          j        |           d         dv rdS | S )NMnr   r   MSP )ADDITIONAL_DIACRITICSunicodedatacategory)charkeeps    /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/whisper/english_normalizer.pyreplace_characterz8remove_symbols_and_diacritics.<locals>.replace_character5   sh    4<<K***(..!$''4//2!$''*e333    r   c              3   .   K   | ]} |          V  d S N ).0cr+   s     r*   	<genexpr>z0remove_symbols_and_diacritics.<locals>.<genexpr>C   s/      RRA$$Q''RRRRRRr,   NFKDjoinr&   	normalize)r   r)   r+   s    `@r*   remove_symbols_and_diacriticsr7   /   sS         77RRRR1Fvq1Q1QRRRRRRr,   c                 f    d                     d t          j        d|           D                       S )z[
    Replace any other markers, symbols, punctuations with a space, keeping diacritics
    r   c              3   V   K   | ]$}t          j        |          d          dv rdn|V  %dS )r   r#   r$   N)r&   r'   )r0   r1   s     r*   r2   z!remove_symbols.<locals>.<genexpr>J   s@      oo+.q11!4==331oooooor,   NFKCr4   r   s    r*   remove_symbolsr<   F   s4     77ookNcdjlmNnNnoooooor,   c                   ,    e Zd ZddedefdZdefdZdS )	BasicTextNormalizerFremove_diacriticssplit_lettersc                 >    |rt           nt          | _        || _        d S r.   )r7   r<   cleanr@   )selfr?   r@   s      r*   __init__zBasicTextNormalizer.__init__N   s"    6G[22^
*r,   r   c                 t   |                                 }t          j        dd|          }t          j        dd|          }|                     |                                           }| j        r3d                    t          j        d|t          j                            }t          j        dd|          }|S )N[<\[][^>\]]*[>\]]r   \(([^)]+?)\)r$   z\X\s+)	lowerresubrB   r@   r5   regexfindallUrC   r   s     r*   __call__zBasicTextNormalizer.__call__R   s    GGIIF'Q//F?B**JJqMM!! 	;ua99::AF63""r,   N)FF)__name__
__module____qualname__boolrD   strrP   r/   r,   r*   r>   r>   M   sV        + +$ +t + + + +#      r,   r>   c                   n     e Zd ZdZ fdZdee         dee         fdZdefdZ	defdZ
defd	Z xZS )
EnglishNumberNormalizerav  
    Convert any spelled-out numbers into arabic numbers, while handling:

    - remove any commas
    - keep the suffixes such as: `1960s`, `274th`, `32nd`, etc.
    - spell out currency symbols after the number. e.g. `$20 million` -> `20000000 dollars`
    - spell out `one` and `ones`
    - interpret successive single-digit numbers as nominal: `one oh one` -> `101`
    c                 f   t                                                       h d| _        d t          g dd          D             | _        d | j                                        D             | _        ddd	d
dddd | j                                        D             | _        i | j        | j        | _        ddddddddd| _	        d | j	                                        D             | _
        d | j	                                        D             | _        i | j
        | j        | _        ddddddd d!d"d#d$d%d&| _        d' | j                                        D             | _        d( | j                                        D             | _        i | j        | j        | _        h | j        | j	        | j        | _        d)d)d*d*d+| _        d,d,d-d-d.d.d/d/d0| _        t)          t+          | j                                                  t+          | j                                                  z             | _        d1d2id2d3| _        h d4| _        d5 | j        | j        | j        | j	        | j        | j        | j        | j        | j        | j        | j        fD             | _        d6d7h| _        d S )8N>   r   ohzeroc                     i | ]\  }}||	S r/   r/   )r0   inames      r*   
<dictcomp>z4EnglishNumberNormalizer.__init__.<locals>.<dictcomp>p   s.     
 
 
4 !
 
 
r,   )onetwothreefourfivesixseveneightnineteneleventwelvethirteenfourteenfifteensixteen	seventeeneighteennineteen   )startc                 4    i | ]\  }}|d k    rdn|dz   |dfS )rd   sixesr   r/   r0   r]   values      r*   r^   z4EnglishNumberNormalizer.__init__.<locals>.<dictcomp>x   s?     
 
 
GRtUtu}}GG$*ucl
 
 
r,   )r   r   )rr   st)   nd)   rd)   r   )   r   )zerothfirstsecondthirdfifthtwelfthc                 v    i | ]6\  }}|d k    |dk    |dk    ||                     d          rdndz   |df7S )r{   r}   r~   thr   )endswithrv   s      r*   r^   z4EnglishNumberNormalizer.__init__.<locals>.<dictcomp>   s]       D%199! t}}S11;t<udm0;r,         (   2   <   F   P   Z   )twentythirtyfortyfiftysixtyseventyeightyninetyc                 F    i | ]\  }}|                     d d          |dfS )yiesr   replacerv   s      r*   r^   z4EnglishNumberNormalizer.__init__.<locals>.<dictcomp>   s0    hhh{tUDLLe44uclhhhr,   c                 F    i | ]\  }}|                     d d          |dfS )r   iethr   r   rv   s      r*   r^   z4EnglishNumberNormalizer.__init__.<locals>.<dictcomp>   s1    kkk+$PUT\\#v66kkkr,   d     i@B i ʚ;l    J)l     I5 l     NZol     @=7M.cl      B3v^!< l      P ~cegl       73Me'l       (l
F3YHqS )hundredthousandmillionbilliontrillionquadrillionquintillion
sextillion
septillion	octillion	nonillion	decillionc                 $    i | ]\  }}|d z   |d fS r;   r/   rv   s      r*   r^   z4EnglishNumberNormalizer.__init__.<locals>.<dictcomp>   s&    "h"h"he4#:s|"h"h"hr,   c                 $    i | ]\  }}|d z   |d fS )r   r/   rv   s      r*   r^   z4EnglishNumberNormalizer.__init__.<locals>.<dictcomp>   s&    #k#k#k;4D4K%#k#k#kr,   -+)minusnegativepluspositive   £u   €$   ¢)poundpoundseuroeurosdollardollarscentcentsr   %)perpercent>   andpointdoubletriplec                     h | ]	}|D ]}|
S r/   r/   )r0   mappingkeys      r*   	<setcomp>z3EnglishNumberNormalizer.__init__.<locals>.<setcomp>   sA     
 
 
 
 
  
 
 
 
r,   r_   ones)superrD   zeros	enumerater   itemsones_pluralones_ordinalones_suffixedtenstens_pluraltens_ordinaltens_suffixedmultipliersmultipliers_pluralmultipliers_ordinalmultipliers_suffixeddecimalspreceding_prefixersfollowing_prefixerssetlistvaluesprefixes	suffixersspecialswordsliteral_words)rC   	__class__s    r*   rD   z EnglishNumberNormalizer.__init__k   sq   (((

 
$ G  G  G  
 
 
	
 
VZV_VeVeVgVg
 
 
  !
 
 #'9??#4#4  
 G 0FD4EF 	
 	
	 ihVZV_VeVeVgVghhhkkY]YbYhYhYjYjkkkF 0FD4EF  $)047;>BF
 
 #i"htO_OeOeOgOg"h"h"h#k#kRVRbRhRhRjRj#k#k#k $[t'>$[$BZ$[!=$)=di=$*= 	$
 $
  	$
 	$
  D!9!@!@!B!BCCd4KcKjKjKlKlFmFmmnnC=
 
 =<<
 
 
	"	" )((
 
 

" $V_r,   r   returnc              #     K   d d d}dt           fd}dt          t           t          f         ffd}t          |          dk    rd S t	          |          D ]\  }}|dk    r||dz
           nd }|t          |          dz
  k    r||dz            nd }|rd}A|d uot          j        d|          }	|d         | j        v }
|
r
|dd          n|}t          j        d|          r ||          }|t          d	          Wt          t                     r5
                    d
          r t                    t          |          z    |          V  |
r|d         n|j        dk    r	|j        !|%|| j        vr |          V   ||          V  L|| j        v rt          pd          dz   k|| j        v r| j        |         }|t          t                     s	|| j        v rL|| j        v r"|dk     rd d         t          |          z   t                    t          |          z   |dk     r1dz  dk    r|z  t                    t          |          z   (dz  dk    r|z  8t                    t          |          z   Y|| j        v rn| j        |         \  }} |t          |          |z             V  n9t          t                     s	|| j        v rf|| j        v r/|dk     r) |d d         t          |          z   |z             V  n |t                    t          |          z   |z             V  n|dk     rXdz  dk    r! |t          |z             |z             V  n |t                    t          |          z   |z             V  nWdz  dk    r! |t          |z             |z             V  n- |t                    t          |          z   |z             V  d || j        v rz| j        |         }|t          t                     r!t                    t          |          z   #dz  dk    r|z  3t                    t          |          z   T|| j        v r| j        |         \  }} |t          |          |z             V  t          t                     r/ |t                    t          |          z   |z             V  ҉dz  dk    r" |t          |z             |z             V   |t                    t          |          z   |z             V  ,|| j        v r| j        |         }|Ht          t                     sdk    r; |          }|||z  nd }||j        dk    r	|j         |          V  |dz  dz  }dz  }|||z  z   || j        v r| j        |         \  }} |t          |          |z             V  nt          t                     ro |          }|||z  nd }|.|j        dk    r# |t          |j                  |z             V  n] |          V   |t          |          |z             V  n2dz  dz  }dz  }|||z  z    |t                    |z             V  d || j        v r8 |          V  || j        v s|	r| j        |          ||          V  || j        v r-| j        |          |          V   ||          V  || j        v r| j        |         }t          |t0                    rG||v r' |t                    ||         z             V  d}w |          V   ||          V   |t                    |z             V   ||          V  || j        v r|| j        vr |	s |          V   ||          V  |dk    r'|| j        vr |          V   ||          V  !|dk    s|dk    r}|| j        v s	|| j        v rM|dk    rdnd}| j                            |d          }t          pd          t          |          |z  z   d} |          V   ||          V  |dk    r!|| j        v s|	rt          pd          d
z   t          d|           t          d|            |          V  d S d S )NFr   c                 D    	 t          |           S # t          $ r Y d S w xY wr.   )r   
ValueErrorr;   s    r*   to_fractionz:EnglishNumberNormalizer.process_words.<locals>.to_fraction   s5    {{"   tts    
resultc                 <    t          |           } | z   } d d | S r.   )rU   )r   prefixrw   s    r*   outputz5EnglishNumberNormalizer.process_words.<locals>.output   s,    [[F!&EFMr,   r   rr   z^\d+(\.\d+)?$zConverting the fraction failed.r   0
   r   r   Tr   r   r   ry   r{   r   zUnexpected token: )rU   r   intlenr   rJ   matchr   r   
isinstancer   denominator	numeratorr   r   r   r   r   r   r   r   r   r   r   dictr   getr   )rC   r   skipr   r   r\   currentprevnextnext_is_numeric
has_prefixcurrent_without_prefixfr   suffixr   
multiplierpbeforeresidualrepeatsr   rw   s                        @@r*   process_wordsz%EnglishNumberNormalizer.process_words   s      $+/	3 	 	 	 		5c? 	 	 	 	 	 	 	 u::??F#E** C	A C	AJAw#$665Q<<tD#$E

Q#6#65Q<<DD "$.S28<Ld3S3SO t}4J4>%KWQRR[[G"x(*@AA yAK 6779$%EFFF$!%-- ,%..2E2E , #E

S\\ 9 $fUmm+++'1=v=A%%KEE2EE
**$ &--'''fWoo%%%%DJ&&EKR((3.DI%%y)= EEs++ 7tty/@/@ty((TBYY %crc
SYY 6 #E

SYY 6BYYrzQ #E

SYY 6s{a'' #E

SYY 6D...#1':f= &TV!3444444s++ Ftty/@/@ty((TBYY$fU3B3Z#d))%;f%DEEEEEE$fSZZ#d))%;f%DEEEEEEBYYrzQ$fS%6%6%?@@@@@@$fSZZ#d))%;f%DEEEEEEs{a''$fS%6%6%?@@@@@@$fSZZ#d))%;f%DEEEEEDI%%y)= EEs++ 7JJT2EEs{a'' #E

SYY 6D...#1':f= &TV!3444444s++ F &Uc$ii!7&!@AAAAAAs{a''$fS%6%6%?@@@@@@$fSZZ#d))%;f%DEEEEEED,,,!-g6
=&EEs++ ;uzz#E**A*+-JTA}!);); !$fUmm+++ *"d]T1F$t|H"X
%::EED555%)%>w%G"
F= &Z6!9::::::s++ 6#E**A*+-JTA}!););$fS%5%5%>??????$fUmm+++$fS__v%=>>>>>>"d]T1F$t|H"X
%::E &Uf!455555D444$ &--'''4:%%%!5g>FF &//))))D444$!5g>F &--'''' &//))))DN**$!^G4F!&$// :6>>"(&UfTl)B"C"CCCC#'DD"(&--///"(&//1111$fSZZ&%8999999 &//))))DM))tz))/)($fUmm+++ &//))))%%4#333 ,"(&--///$fWoo---((Gx,?,?ty((DDJ,>,>'.(':':!!#y}}T155 #EKR 0 03t99w3F F# ,"(&--///$fWoo----''t},,, #EKR 0 03 6 %%C'%C%CDDD !!?g!?!?@@@&-- r,   r   c                    g }t          j        d|          }t          |          D ]\  }}t          |                                          dk    r+|t          |          dz
  k    r|                    |           W|                    |           |                    d          d         }|| j        v s	|| j        v r|                    d           |                    d           d		                    |          }t          j
        d
d|          }t          j
        dd|          }t          j
        dd|          }|S )Nz\band\s+a\s+half\br   rr   ry   )maxsplitr   z
point fivez
and a halfr$   z([a-z])([0-9])z\1 \2z([0-9])([a-z])z([0-9])\s+(st|nd|rd|th|s)\b\1\2)rJ   splitr   r   stripappendrsplitr   r   r5   rK   )rC   r   resultssegmentsr\   segment	last_words          r*   
preprocessz"EnglishNumberNormalizer.preprocess  sA   81155#H-- 	1 	1JAw7==??##q((CMMA%%%w''''w'''#NNAN66r:	--d>N1N1NNN<0000NN<0000HHW F$h22F$h22 F17A>>r,   c                     dt           fd}dt           fd}t          j        d||          }t          j        d||          }t          j        dd|          }|S )Nmc                     	 |                      d          }|                      d          }t          |                      d                    }| | d|dS # t          $ r
 | j        cY S w xY w)Nrr   ry   r{   r   02d)groupr   r   string)r  currencyintegerr   s       r*   combine_centsz:EnglishNumberNormalizer.postprocess.<locals>.combine_cents  s|     771::''!**AGGAJJ"9G99e9999      x s   AA A,+A,c                 |    	 dt          |                     d                     S # t          $ r
 | j        cY S w xY w)Nr   rr   )r   r  r   r  )r  s    r*   extract_centsz:EnglishNumberNormalizer.postprocess.<locals>.extract_cents  sK     -C

OO---      x s   $' ;;u,   ([€£$])([0-9]+) (?:and )?¢([0-9]{1,2})\bu   [€£$]0.([0-9]{1,2})\bz	\b1(s?)\bzone\1)r   rJ   rK   )rC   r   r   r"  s       r*   postprocessz#EnglishNumberNormalizer.postprocess  s}    	 U 	  	  	  	 	 U 	  	  	  	  FBMSTUUF.qAA F<1--r,   c                     |                      |          }d                    d |                     |                                          D                       }|                     |          }|S )Nr$   c              3      K   | ]}||V  	d S r.   r/   )r0   words     r*   r2   z3EnglishNumberNormalizer.__call__.<locals>.<genexpr>  s'      XXdtGWTGWGWGWGWXXr,   )r  r5   r  r  r#  rO   s     r*   rP   z EnglishNumberNormalizer.__call__  sa    OOAHHXXd&8&8&C&CXXXXXQr,   )rQ   rR   rS   __doc__rD   r   rU   r   r  r  r#  rP   __classcell__)r   s   @r*   rW   rW   `   s         h- h- h- h- h-T] 49 ] # ]  ]  ]  ] ~C    :S    2#        r,   rW   c                   $    e Zd ZdZd ZdefdZdS )EnglishSpellingNormalizerz~
    Applies British-American spelling mappings as listed in [1].

    [1] https://www.tysto.com/uk-us-spelling-list.html
    c                     || _         d S r.   )r   rC   english_spelling_mappings     r*   rD   z"EnglishSpellingNormalizer.__init__  s    /r,   r   c                 j     d                      fd|                                D                       S )Nr$   c              3   N   K   | ]}j                             ||          V   d S r.   )r   r   )r0   r&  rC   s     r*   r2   z5EnglishSpellingNormalizer.__call__.<locals>.<genexpr>  s5      KK((t44KKKKKKr,   )r5   r  rO   s   ` r*   rP   z"EnglishSpellingNormalizer.__call__  s2    xxKKKKKKKKKKr,   N)rQ   rR   rS   r'  rD   rU   rP   r/   r,   r*   r*  r*    sO         0 0 0L# L L L L L Lr,   r*  c                        e Zd Zd ZdefdZdS )EnglishTextNormalizerc                 d   d| _         i dddddddd	d
ddddddddddddddddddddddd d!d"i d#d$d%d&d'd(d)d*d+d,d-d.d/d0d1d2d3d4d5d6d7d8d9d:d;d<d=d>d?d@dAdBdCdDdEdFdGdHdIdJdKdLdMdNdOdPdLdQdRdS| _        t                      | _        t	          |          | _        d S )TNz\b(hmm|mm|mhm|mmm|uh|um)\bz	\bwon't\bzwill notz	\bcan't\bzcan notz	\blet's\bzlet usz	\bain't\baintz	\by'all\bzyou allz	\bwanna\bzwant toz	\bgotta\bzgot toz	\bgonna\bzgoing toz\bi'ma\bzi am going toz\bimma\bz
\bwoulda\bz
would havez
\bcoulda\bz
could havez\bshoulda\bzshould havez	\bma'am\bmadamz\bmr\bzmister z\bmrs\bzmissus z\bst\bzsaint z\bdr\bzdoctor z\bprof\bz
professor z\bcapt\bzcaptain z\bgov\bz	governor z\bald\bz	alderman z\bgen\bzgeneral z\bsen\bzsenator z\brep\bzrepresentative z\bpres\bz
president z\brev\bz	reverend z\bhon\bz
honorable z\basst\bz
assistant z	\bassoc\bz
associate z\blt\bzlieutenant z\bcol\bzcolonel z\bjr\bzjunior z\bsr\bzsenior zesquire z	 had beenz	 has beenz	 had gonez	 has gonez	 had donez has gotz notz arez isz wouldz willz havez am)z\besq\bz	'd been\bz	's been\bz	'd gone\bz	's gone\bz	'd done\bz's got\bzn't\bz're\bz's\bz'd\bz'll\bz't\bz've\bz'm\b)ignore_patterns	replacersrW   standardize_numbersr*  standardize_spellingsr,  s     r*   rD   zEnglishTextNormalizer.__init__  s&   <6
*6
 )6
 (	6

 &6
 )6
 )6
 (6
 *6
 6
 6
 <6
 <6
 M6
 '6
" y#6
$ 	%6
& x'6
 6
( y)6
* +6
, -6
. /6
0 16
2 
36
4 
56
6 )76
8 96
: ;6
< =6
> ?6
@ ,A6
B }C6
D 
E6
F yG6
H yI6
 6
J #%%%%%#k6
 6
 6
n $;#<#< %>?W%X%X"""r,   r   c                    |                                 }t          j        dd|          }t          j        dd|          }t          j        | j        d|          }t          j        dd|          }| j                                        D ]\  }}t          j        |||          }t          j        dd|          }t          j        dd	|          }t          |d
          }|                     |          }|                     |          }t          j        dd	|          }t          j        dd|          }t          j        dd|          }|S )NrF   r   rG   z\s+''z	(\d),(\d)r  z\.([^0-9]|$)z \1u
   .%$¢€£)r)   u   [.$¢€£]([^0-9])z	([^0-9])%z\1 rH   r$   )	rI   rJ   rK   r5  r6  r   r7   r7  r8  )rC   r   patternreplacements       r*   rP   zEnglishTextNormalizer.__call__=  s5   GGIIF'Q//F?B**F4'Q//F7C##$(N$8$8$:$: 	0 	0 G[wQ//AAF<!,,F?FA..)!,???$$Q''&&q)) F)6155F<++F63""r,   N)rQ   rR   rS   rD   rU   rP   r/   r,   r*   r1  r1     sB        :Y :Y :Yx#      r,   r1  )r   )rJ   r&   collections.abcr   	fractionsr   r   typingr   r   rL   r%   rU   r7   r<   r>   rW   r*  r1  r/   r,   r*   <module>r@     sB    
			     $ $ $ $ $ $             " " " " " " " " $$ 	# 	#	
 	$ 	$ 	$ 
4 	# 	# 	# 	# 	$ 	$ 	#  	#! (S SS S S S S.pc p p p p       &O O O O O O O OdL L L L L L L LU U U U U U U U U Ur,   