
    PiC                         d dl mZmZmZmZmZmZ d dlZd dl	m
Z
 dZ G d de
          Zdeeef         fdZd	eed
f         deeeef                  fdZdedefdZdedeeeef                  fdZdS )    )AnyDictListMappingSetTupleN)BaseTokenizerz</w>c            	           e Zd ZdZddededefdZded	ee         fd
Z	dee         d	efdZ
	 ddeeef         ded	eeef         fdZded	efdZdS )CLIPTokenizerar  
    Text tokenizer for CLIP.

    Based on the official implementation here:
    https://github.com/openai/CLIP/blob/main/clip/simple_tokenizer.py

    Args:
        path (str): the path to the CLIP merges file
        max_seq_len (int): the context length (all CLIP models use 77)
        truncate (bool): whether to truncate the text when longer than max_seq_len
    M   Tpathmax_seq_lentruncatec                 0   || _         || _        t                      | _        d | j                                        D             | _        t          |          }t          | j                                                  }|	                    d |D                        |	                    d |D                        |	                    ddg           d t          |          D             | _        d | j                                        D             | _        d t          |          D             | _        t          j        d	t          j                  | _        | j        d         | _        | j        d         | _        | j        | _        ddd
| _        d S )Nc                     i | ]\  }}||	S  r   .0kvs      t/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/torchtune/models/clip/_tokenizer.py
<dictcomp>z*CLIPTokenizer.__init__.<locals>.<dictcomp>!   s    HHHdaQHHH    c                 "    g | ]}|t           z   S r   )WORD_BOUNDARY)r   r   s     r   
<listcomp>z*CLIPTokenizer.__init__.<locals>.<listcomp>&   s    777Aa-'777r   c                 8    g | ]}d                      |          S ) )join)r   merges     r   r   z*CLIPTokenizer.__init__.<locals>.<listcomp>'   s"    999bggenn999r   <|startoftext|><|endoftext|>c                     i | ]\  }}||	S r   r   )r   iwords      r   r   z*CLIPTokenizer.__init__.<locals>.<dictcomp>*   s    @@@GAta@@@r   c                     i | ]\  }}||	S r   r   r   s      r   r   z*CLIPTokenizer.__init__.<locals>.<dictcomp>+   s    >>>A1>>>r   c                     i | ]\  }}||	S r   r   )r   r$   r    s      r   r   z*CLIPTokenizer.__init__.<locals>.<dictcomp>,   s    EEExq%%EEEr   z[<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+)r!   r"   )r   r   _bytes_to_unicodebyte_encoderitemsbyte_decoder_load_mergeslistvaluesextend	enumerateencoderdecoder	bpe_ranksrecompile
IGNORECASEpat	sot_token	eot_token	pad_tokencache)selfr   r   r   mergesvocabs         r   __init__zCLIPTokenizer.__init__   s   & -//HHd.?.E.E.G.GHHHd##T&--//007777788899&999:::'9:::@@y/?/?@@@>>););)=)=>>>EE9V3D3DEEE:nM
 

 &78o6  1,
 



r   textreturnc                 v    t          |                                          } j        g}t          j         j        |          D ]}d                     fd|                    d          D                       }|                     fd 	                    |          
                    d          D                        t          |           j        k    r n|                     j                   t          |           j        k    r* j        s
J d            |d j                 } j        |d<   |S )	z
        Given a string, return the encoded list of token ids.

        Args:
            text (str): The text to encode.

        Returns:
            List[int]: The encoded list of token ids.
        r   c              3   2   K   | ]}j         |         V  d S N)r)   )r   br<   s     r   	<genexpr>z'CLIPTokenizer.encode.<locals>.<genexpr>J   s+      PPQD-a0PPPPPPr   utf-8c              3   2   K   | ]}j         |         V  d S rD   )r1   )r   	bpe_tokenr<   s     r   rF   z'CLIPTokenizer.encode.<locals>.<genexpr>K   s;        ,5Y'     r    zWTokenized text is larger than the maximum sequence length but truncate is set to False.N)_clean_textlowerr8   r4   findallr7   r   encoder/   _bpesplitlenr   appendr9   r   )r<   r@   tokenstokens   `   r   rO   zCLIPTokenizer.encode<   s[    4  &&((.!Z$// 	 	EGGPPPP%,,w:O:OPPPPPEMM    9=59I9I9O9OPS9T9T      6{{d... /dn%%%v;;)))=  , = .d../FF2Jr   rT   c                      d                      fd|D                       }t           fd|D                                           dd                              t          d          S )z
        Given a list of token ids, return the decoded text, optionally including special tokens.

        Args:
            tokens (List[int]): The list of token ids to decode.

        Returns:
            str: The decoded text.
        r   c                 *    g | ]}j         |         S r   )r2   )r   rU   r<   s     r   r   z(CLIPTokenizer.decode.<locals>.<listcomp>f   s     @@@U+@@@r   c                 *    g | ]}j         |         S r   )r+   )r   cr<   s     r   r   z(CLIPTokenizer.decode.<locals>.<listcomp>h   s!    :::t(+:::r   rG   replace)errorsrJ   )r   	bytearraydecoderZ   r   )r<   rT   r@   s   `  r   r]   zCLIPTokenizer.decode\   sp     ww@@@@@@@AA::::T:::;;VGIV..W]C((	
r   Fsample	inferencec                 `    |                     d          }|                     |          |d<   |S )a]  
        Tokenize the "text" field in the sample.

        Args:
            sample (Mapping[str, Any]): A sample with a "text" field containing a string to tokenize
            inference (bool): Unused by this tokenizer

        Returns:
            Mapping[str, Any]: The sample with added "tokens" field and the "messages" field removed.
        r@   rT   )poprO   )r<   r^   r_   r@   s       r   __call__zCLIPTokenizer.__call__m   s0     zz&!!;;t,,xr   rU   c                     | j         v r j         |         S t          |          dk     r
|t          z   S t          |dd                   |d         t          z   fz   }t	          |          }	 t          | fd          }| j        vrn6|\  }}g }d}|t          |          k     r	 |                    ||          }	|                    |||	                    |	}n-# t          $ r  |                    ||d                    Y nw xY w||         |k    rC|t          |          dz
  k     r-||dz            |k    r|
                    ||z              |dz  }n |
                    ||                    |dz  }|t          |          k     t          |          }t          |          dk    rnt	          |          }Ud	                    |          }| j         |<   |S )
z@
        Performs byte-pair encoding on a single token.
           NrK   Tc                 T    j                             | t          d                    S )Ninf)r3   getfloat)pairr<   s    r   <lambda>z$CLIPTokenizer._bpe.<locals>.<lambda>   s     1C1CD%PU,,1W1W r   )keyr      rJ   )r;   rR   r   tuple
_get_pairsminr3   indexr/   
ValueErrorrS   r   )
r<   rU   r%   pairsbigramfirstsecondnew_wordr$   js
   `         r   rP   zCLIPTokenizer._bpe~   s    DJ:e$$u::>>=(( U3B3Z  E"I$=#?? 4  $	%$W$W$W$WXXXFT^++ #ME6HAc$ii--

5!,,AOOD1I...AA!   OODH---E
 7e##CIIM(9(9d1q5kV>S>SOOEFN333FAAOODG,,,FA# c$ii--$ ??D 4yyA~~ t$$EI$	%L xx~~ 
5s   *5C   'D
	D
N)r   T)F)__name__
__module____qualname____doc__strintboolr?   r   rO   r]   r   r   rb   rP   r   r   r   r   r      s       
 

 
S 
s 
4 
 
 
 
@3 49    @
T#Y 
3 
 
 
 
$ <A c3h'48	c	   ":# :# : : : : : :r   r   rA   c            	      \   t          t          t          d          t          d          dz                       t          t          t          d          t          d          dz                       z   t          t          t          d          t          d          dz                       z   } | dd         }d	}t          d
          D ]8}|| vr2|                     |           |                    d
|z              |dz  }9d |D             }t	          t          | |                    S )zQ
    Returns list of utf-8 byte and a corresponding list of unicode strings.
    !~rl      ¡   ¬   ®   ÿNr      c                 ,    g | ]}t          |          S r   )chr)r   ns     r   r   z%_bytes_to_unicode.<locals>.<listcomp>   s    			Q#a&&			r   )r-   rangeordrS   dictzip)bscsr   rE   s       r   r(   r(      s   
 	U3s88SXX\**++
uSYYD		A..
/
/	0
uSYYD		A..
/
/	0 
 
AAAB	A4[[  B;;IIaLLLIIdQhFA		"			BBr   r%   .c                 ~    t                      }| d         }| dd         D ]}|                    ||f           |}|S )z
    Return set of symbol pairs in a word.
    Word is represented as tuple of symbols (symbols being variable-length strings).
    r   rl   N)setadd)r%   rr   	prev_charchars       r   rn   rn      sP    
 EEEQIQRR  		9d#$$$		Lr   r@   c                 .    |                      dd          S )zI
    Minimal version of CLIP's text cleaning via the `ftfy` package.
    u   ’')rZ   )r@   s    r   rL   rL      s     <<s###r   r   c                 T   g }t          | d          5 }t          |          D ]k\  }}|                                }|dk    r|                    d          s|s7|                    t          |                                                     l	 d d d            n# 1 swxY w Y   |S )NrG   )encodingr   z	#version:)openr0   strip
startswithrS   rm   rQ   )r   r=   fr$   lines        r   r,   r,      s    F	dW	%	%	% / || 	/ 	/GAt::<<DQ4??;77MM%

--....		// / / / / / / / / / / / / / / Ms   A<BB!$B!)typingr   r   r   r   r   r   regexr4   .torchtune.modules.transforms.tokenizers._utilsr	   r   r   r}   r|   r(   rn   rL   r,   r   r   r   <module>r      sB   8 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7     H H H H H Hi i i i iM i i iX4S>    (
U38_ 
U38_)= 
 
 
 
$c $c $ $ $ $s tE#s(O4      r   