
    Pix2                         d Z ddlZddlZddlZddlZddlmZ ddlmZm	Z	 ddl
mZ ddlmZ dej        ej        ef         dej        eddf         fd	Z G d
 d          Z G d d          ZdS )z
Tokenizer of Unicode characters, grapheme clusters and tailored grapheme clusters
(of orthographies) given an orthography profile.
    N)reader)nfdgrapheme_pattern)errors)Profilepreturnc              #   @  K   t          j        |                               d          5 }|                                D ]D}|                                }|r,|                    d          st          j        d|          V  E	 d d d            d S # 1 swxY w Y   d S )Nzutf-8)encoding#NFD)pathlibPathopen	readlinesstrip
startswithunicodedata	normalize)r   fplines      f/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/segments/tokenizer.py	iterlinesr      s      	a		w		/	/ 92LLNN 	9 	9D::<<D 9DOOC00 9!+E488888	99 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9s   ABBBc                   V    e Zd ZdZdej        eef         fdZedd            Z	d Z
dS )	Ruleszo
    Rules are given in tuple format, comma delimited.
    Regular expressions are given in Python syntax.
    rulesc                 (    d |D             | _         d S )Nc                 @    g | ]\  }}t          j        |          |fS  )regexcompile).0rulereplacements      r   
<listcomp>z"Rules.__init__.<locals>.<listcomp>   s,    YYY>OdKd++[9YYY    )_rules)selfr   s     r   __init__zRules.__init__   s    YYSXYYYr&   r	   c                 t     | t          t          t          t          |                                         S N)listr   r   )clsfnames     r   	from_filezRules.from_file!   s/    sDYu%5%5 6 6778899r&   c                 L    | j         D ]\  }}|                    ||          }|S r+   )r'   sub)r(   sr#   r$   s       r   applyzRules.apply%   s2    !% 	) 	)D+a((AAr&   N)r	   r   )__name__
__module____qualname____doc__typingTuplestrr)   classmethodr/   r3   r   r&   r   r   r      sx         Zv|CH5 Z Z Z Z : : : [:    r&   r   c                      e Zd ZdZddej        ej        ej        fdej	        e
gej        e
         f         dej	        e
gej        e
         f         dej	        e
gej        e
         f         fdZej        dddd	d
fde
de
dej        ej        d                  dedej        d         de
fdZdde
fdZd Zej        ej        fdZd Zd ZdS )	TokenizeruW  
    Class for Unicode character and grapheme tokenization.

    This class provides extended functionality for
    orthography-specific tokenization with orthography profiles.

    Parameters
    ----------

    profile : string or pathlib.Path or Profile instance (default = None)
        Specifies an orthography profile to use.

    rules : string (default = None)
        Filename of a rules file.

    Notes
    -----
    The tokenizer can be used for pure Unicode character and grapheme
    tokenization, i.e. it uses the Unicode standard grapheme parsing rules, as
    implemented in the Python regex package by Matthew Barnett, to do basic tokenization
    with the "\X" grapheme regular expression match. This grapheme match
    combines one or more Combining Diacritical Marks to their base character.
    These are called "grapheme clusters" in Unicode parlance. With these functions
    the Tokenizer is meant to do basic rudimentary parsing for things like generating
    unigram models (segments and their counts) from input data.

    When a profile is passed, the Tokenizer reads the orthography profile and calls a helper
    class to build a tree data structure, which stores the possible Unicode
    character combinations that are specified in the orthography profile
    (i.e. tailored grapheme clusters) that appear in the data source.

    For example, an orthography profile might specify that in source X
    <uu> is a single grapheme (Unicode parlance: tailored grapheme) and
    therefore it should be chunked as so. Given an orthography profile and
    some data to tokenize, the process would look like this:

    input string example: uubo uubo
    output string example: uu b o # uu b o

    >>> prf = Profile({'Grapheme': 'uu'}, {'Grapheme': 'b'}, {'Grapheme': 'o'})
    >>> t = Tokenizer(profile=prf)
    >>> t('uubo uubo')
    'uu b o # uu b o'

    See also the test orthography profile and rules in the test directory.

    An additional method "combine_modifiers" handles the case where there are
    Unicode Spacing Modifier Letters, which are not explicitly
    combined to their base character in the Unicode Standard. These graphemes
    are called "Tailored grapheme clusters" in Unicode. For more information
    see the Unicode Standard Annex #29: Unicode Text Segmentation:

    * http://www.unicode.org/reports/tr29/

    Additionally, the Tokenizer provides functionality to transform graphemes
    into associated character(s) specified in additional columns in the orthography
    profile. A dictionary is created that keeps a mapping between source-specific
    graphemes and their counterparts (e.g. an IPA column in the orthography profile).

    Lastly, the Tokenizer can be used to transform text as specified in an
    orthography rules file. These transformations are specified in a separate
    file from the orthography profile (that specifics the document specific graphemes,
    and possibly their IPA counterparts) and the orthography rules should
    be applied to the output of a grapheme tokenization.

    In an orthography rules file, rules are given in order as regular
    expressions, e.g. this rule replaces a vowel followed by an <n>
    followed by <space> followed by a second vowel with first vowel
    <space> <n> <space> second vowel, e.g.::

        $ (a|á|e|é|i|í|o|ó|u|ú)(n)(\s)(a|á|e|é|i|í|o|ó|u|ú), \1 \2 \4

    Nerrors_stricterrors_replaceerrors_ignorec                    d | _         t          |t                    r|| _         n|t          j        |          | _         |sO| j         rH| j         j        r<| j         j        j        | j         j        j        dz   z  }|                                r|}|rt                              |          nd | _	        |||d| _
        d S )Nz.rules)strictreplaceignore)op
isinstancer   r/   r.   parentstemexistsr   r'   _errors)r(   profiler   r>   r?   r@   r'   s          r   r)   zTokenizer.__init__u   s     gw'' 	1DGG '00DG 	 	TW] 	W])TW]-?(-JKF}} 05?eooe,,,4#%#
 
r&   F  # rC   stringcolumnform)NFCNFKCr   NFKDipar   )rC   rB   rD   r	   c           
         
 g }|                                 D ]}	|rI|                                                              t	          |	                                         M j        r7|                                         |	| j        |                              |                                         t	          |	                                fd
|                    
fd|D                       S )a  
        The main task of a Tokenizer is tokenizing! This is what happens when called.

        This function determines what to do given any combination
        of orthography profile and rules or not orthography profile
        or rules.

        Parameters
        ----------
        string : str
            The input string to be tokenized.

        column : str (default = "graphemes")
            The column label for the transformation, if specified.

        form : None or unicode normalization form
            Normalize return value if form is not None.

        ipa : bool
            Tokenize IPA (work in progress)

        Returns
        -------
        result : str
            Result of the tokenization.

        )rO   errorc                                          |                                           }j        rj                            |          n|}rt	          j        |          n|S r+   )joinr   r'   r3   r   r   )wordresrP   segment_separatorr(   s     r   ppzTokenizer.__call__.<locals>.pp   s`    #((..4466C,0K@$+##C(((SC7;D;(s333Dr&   c              3   .   K   | ]} |          V  d S r+   r   )r"   rY   r\   s     r   	<genexpr>z%Tokenizer.__call__.<locals>.<genexpr>   s+      774bbhh777777r&   )	splitappendcombine_modifiersgrapheme_clustersr   rE   	transformrJ   rX   )r(   rN   rO   rP   rT   r[   	separatorr   rZ   rY   r\   s   `  ` `    @r   __call__zTokenizer.__call__   s+   F LLNN 	B 	BD B

411$2H2HT2S2STTUUUU7 BJJtF$,vBVWWY Y Y Y JJt55c$ii@@AAAA	E 	E 	E 	E 	E 	E 	E
 ~~77773777777r&   c                     |                     fdt          |                                          D                       S )a  
        Given a string as input, return a space-delimited string of Unicode characters
        (code points rendered as glyphs).
        Parameters
        ----------
        string : str
            A Unicode string to be tokenized into graphemes.
        Returns
        -------
        result : str
            String returned is space-delimited on Unicode characters and contains "#" to
            mark word boundaries.
            The string is in NFD.
        Notes
        -----
        Input is first normalized according to Normalization Ford D(ecomposition).
        String returned contains "#" to mark word boundaries.
        c              3   B   K   | ]}                     |          V  d S r+   )rX   )r"   rY   r[   s     r   r^   z'Tokenizer.characters.<locals>.<genexpr>   s2      [[t/44T::[[[[[[r&   )rX   r   r_   )r(   rN   r[   rd   s     ` r   
characterszTokenizer.characters   s>    & ~~[[[[s6{{GXGXGZGZ[[[[[[r&   c                 *    t          j        |          S )a  
        See: Unicode Standard Annex #29: UNICODE TEXT SEGMENTATION
        http://www.unicode.org/reports/tr29/

        Given a string as input, return a list of Unicode graphemes using the
        "\X" regular expression.

        Parameters
        ----------
        word : str
            A Unicode string to be tokenized into graphemes.

        Returns
        -------
        result : list
            List of Unicode graphemes in NFD.

        )r   findallr(   rY   s     r   rb   zTokenizer.grapheme_clusters   s    (  '---r&   c                 ,   | j         s
J d            |t          j        k    r0|| j         j        vr"t	          d                    |                    | j         j                            ||          }|t          j        k    r|S g }|D ]}	 | j         j        |         |         }n&# t          $ r  | j
        d         |          }Y nw xY w|Gt          |t          t          f          r|                    |           v|                    |           |S )a  
        Transform a string's graphemes into the mappings given in a different column
        in the orthography profile.

        Parameters
        ----------
        word : str
            The input string to be tokenized.

        column : str (default = "Grapheme")
            The label of the column to transform to. Default it to tokenize with
            orthography profile.

        Returns
        -------
        result : list of lists
            Result of the transformation.

        z3method can only be called with orthography profile.z Column {0} not found in profile.rC   )rE   r   GRAPHEME_COLcolumn_labels
ValueErrorformattreeparse	graphemesKeyErrorrJ   rF   tupler,   extendr`   )r(   rY   rO   rV   outtokentargets          r   rc   zTokenizer.transform   s-   ( wMMMMMwW)))fDG<Q.Q.Q?FFvNNOOOw|!!$..W)))K 		' 		'E8*51&9 8 8 80i0778!fudm44 'JJv&&&&JJv&&&
s   B$$ CCc                 H    | j         r| j                             |          n|S )aS  
        Function to tokenize input string and return output of str with ortho rules
        applied.

        Parameters
        ----------
        word : str
            The input string to be tokenized.

        Returns
        -------
        result : str
            Result of the orthography rules applied to the input str.

        )r'   r3   rk   s     r   r   zTokenizer.rules  s&      +/+?t{  &&&4?r&   c                 .   g }d}t          |          }t          |          D ]Q}|dz  }t          |          dk    rVt          j        |          dk    r>t	          |          dvr-t          |          dk    r||z   }|dk    r||d         z   |d<   qt          |          dk    r$t	          |          dv r|r||d         z   |d<   d}t          |          dk    r|t          j        |          dk    rdt          |          dk    r|                    |           d}t          j        |d         d                   dk    r||z   |d         z   |d<   d}7|                    ||z              d}S|ddd         }d}g }|t          |          k     r}t	          ||         d                   d	v r-|                    ||         ||dz            z              |d
z  }n |                    ||                    |dz  }|t          |          k     }|S )al  
        Given a string that is space-delimited on Unicode grapheme clusters,
        group Unicode modifier letters with their preceding base characters,
        deal with tie bars, etc.

        Parameters
        ----------
        string : str
            A Unicode string tokenized into grapheme clusters to be tokenized into simple
            IPA.

            Lm)i  i  r   SkN)ia  i\     )lenreversedr   categoryordr`   )	r(   rs   resulttempcountgraphemesegmentsirs	            r   ra   zTokenizer.combine_modifiers&  sE    I ++ 	 	HQJE8}}!!k&:8&D&D&L&LMMZ77C	NNQ<N<N$ A::!%r
!2F2J 8}}!!s8}}
'B'B'B%r
2r
 8}}!!k&:8&D&D&L&Lv;;!##MM(+++D"+F2JqM::dBB%-_vbz%Ar
! MM(T/***DD $$B$<#h--8A;r?##z11!xA6777Q!%%%Q #h-- r&   )rL   rM   )r4   r5   r6   r7   r   rB   rC   rD   r8   Callabler:   Optionalr)   r   rm   Literalboolre   rh   rb   rc   r   ra   r   r&   r   r=   r=   +   s       H HT OU}PVP^OU}
 
 !'vs7K0K L
 "(#8L1L!M	

 !'vs7K0K L
 
 
 
0  '3W["#& IR38 383838 v~6R'ST38 	38  (EF38 X[38 38 38 38j\ \S \ \ \ \*. . ., &-%9 ' ' ' 'R@ @ @$= = = = =r&   r=   )r7   r8   r   r   r    csvw.dsvr   segments.utilr   r   r   r   segments.profiler   Unionr   r:   	Generatorr   r   r=   r   r&   r   <module>r      s)                 / / / / / / / /       $ $ $ $ $ $9glC/0 9V5Ec4QUo5V 9 9 9 9       $x x x x x x x x x xr&   