
     `i,                     p    d Z ddlZddlmZmZ ddlmZ ddlmZ ddl	m
Z
 g d	Z G d
 de          ZdgZdS )z
Processor class for EVOLLA.
    N)OptionalUnion   )BatchFeature)ProcessorMixin   )AutoTokenizer)aa_seqfoldseekmsac            
       4    e Zd ZdZddgZdgZdZdZdZd fd		Z	dd
Z
	 ddefdZ	 	 	 	 ddeeee         ef                  deeeee                  ee         f                  dee         dee         fdZd Zd Zd Zd Z fdZe fd            Z xZS )EvollaProcessoran  
    Constructs a EVOLLA processor which wraps a LLama tokenizer and SaProt tokenizer (EsmTokenizer) into a single processor.

    [`EvollaProcessor`] offers all the functionalities of [`EsmTokenizer`] and [`LlamaTokenizerFast`]. See the
    docstring of [`~EvollaProcessor.__call__`] and [`~EvollaProcessor.decode`] for more information.

    Args:
        protein_tokenizer (`EsmTokenizer`):
            An instance of [`EsmTokenizer`]. The protein tokenizer is a required input.
        tokenizer (`LlamaTokenizerFast`, *optional*):
            An instance of [`LlamaTokenizerFast`]. The tokenizer is a required input.
        protein_max_length (`int`, *optional*, defaults to 1024):
            The maximum length of the sequence to be generated.
        text_max_length (`int`, *optional*, defaults to 512):
            The maximum length of the text to be generated.
    protein_tokenizer	tokenizersequence_max_lengthr	   N      c                     |t          d          |t          d          t                                          ||           d| j        _        || _        || _        d S )Nz+You need to specify an `protein_tokenizer`.z"You need to specify a `tokenizer`.z<|reserved_special_token_0|>)
ValueErrorsuper__init__r   	pad_tokenprotein_max_lengthtext_max_length)selfr   r   r   r   kwargs	__class__s         /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/evolla/processing_evolla.pyr   zEvollaProcessor.__init__;   sh    $JKKKABBB*I666#A "4.    c                 *   g }|D ]n}|                     d          }|                     d          }d                    d t          ||          D                       }|                    |           o| j                            |dd|d          }|S )Nr
   r    c                 f    g | ].\  }}|                                 |                                z   /S  )upperlower).0sfs      r   
<listcomp>z4EvollaProcessor.process_proteins.<locals>.<listcomp>L   s1    "["["[TQ17799qwwyy#8"["["[r   ptT)return_tensors
truncation
max_lengthpadding)getjoinzipappendr   batch_encode_plus)	r   proteinsr   sa_sequencesproteinr
   r   sa_sequence	sa_tokenss	            r   process_proteinsz EvollaProcessor.process_proteinsG   s     	- 	-G[[**F{{:..H''"["[SQYEZEZ"["["[\\K,,,,*<<$K]gk = 
 
	 r   r   c                     g }|D ]4}| j                             |dd          }|                    |           5|                      |dddd|          }|S )NFT)tokenizeadd_generation_promptr*   longest)add_special_tokensr+   r.   r,   r-   )r   apply_chat_templater2   )r   textsr   promptsmessagespromptprompt_inputss          r   process_textzEvollaProcessor.process_textT   s    
  	# 	#H^77&* 8  F
 NN6""""$& ' 
 
 r   r4   messages_listr   c                    ||t          d          ||n| j        }||n| j        }t          |t                    r|g}t          |t
          t          f          r%t          |d         t
          t          f          s|g}t          |t
          t          f          r(t          d |D                       st          d          t          |t
          t          f          rFt          d |D                       s-t          dd                    t                     d	|           t          |t
          t          f          r|D ]}t          |t
          t          f          s t          d
t          |           d          t          d |D                       st          d          t          d |D                       st          d |D                       rt          d|           n t          dt          |           d          |                     ||          }|                     ||          }t          |d         |d         |d         |d         d          S )av  This method takes batched or non-batched proteins and messages_list and converts them into format that can be used by
        the model.

        Args:
            proteins (`Union[List[dict], dict]`):
                A list of dictionaries or a single dictionary containing the following keys:
                    - `"aa_seq"` (`str`) -- The amino acid sequence of the protein.
                    - `"foldseek"` (`str`) -- The foldseek string of the protein.
            messages_list (`Union[List[List[dict]], List[dict]]`):
                A list of lists of dictionaries or a list of dictionaries containing the following keys:
                    - `"role"` (`str`) -- The role of the message.
                    - `"content"` (`str`) -- The content of the message.
            protein_max_length (`int`, *optional*, defaults to 1024):
                The maximum length of the sequence to be generated.
            text_max_length (`int`, *optional*, defaults to 512):
                The maximum length of the text.

        Return:
            a dict with following keys:
                - `protein_input_ids` (`torch.Tensor` of shape `(batch_size, sequence_length)`) -- The input IDs for the protein sequence.
                - `protein_attention_mask` (`torch.Tensor` of shape `(batch_size, sequence_length)`) -- The attention mask for the protein sequence.
                - `text_input_ids` (`torch.Tensor` of shape `(batch_size, sequence_length)`) -- The input IDs for the text sequence.
                - `text_attention_mask` (`torch.Tensor` of shape `(batch_size, sequence_length)`) -- The attention mask for the text sequence.
        Nz3You need to specify `messages_list` and `proteins`.r   c              3   @   K   | ]}t          |t                    V  d S N
isinstancedictr&   ps     r   	<genexpr>z+EvollaProcessor.__call__.<locals>.<genexpr>   s-      :a:aST:a;N;N:a:a:a:a:a:ar   zUThe proteins should be a list of dictionaries, but not all elements are dictionaries.c              3   l   K   | ]/}t          d  |                                D                       V  0dS )c              3   (   K   | ]}|t           v V  d S rI   )PROTEIN_VALID_KEYS)r&   ks     r   rO   z5EvollaProcessor.__call__.<locals>.<genexpr>.<genexpr>   s(      ::A''::::::r   N)allkeysrM   s     r   rO   z+EvollaProcessor.__call__.<locals>.<genexpr>   sR       ;
 ;
?@C:::::::;
 ;
 ;
 ;
 ;
 ;
r   z2There should be a list of dictionaries with keys: z, z for each protein.But got: z;Each messages in messages_list should be a list instead of .c              3   @   K   | ]}t          |t                    V  d S rI   rJ   r&   ms     r   rO   z+EvollaProcessor.__call__.<locals>.<genexpr>   s,      AA1:a..AAAAAAr   zfEach message in messages_list should be a list of dictionaries, but not all elements are dictionaries.c              3   `   K   | ])}t          |                                          d k    V  *dS )r   N)lenrU   rX   s     r   rO   z+EvollaProcessor.__call__.<locals>.<genexpr>   s5      <<as16688}})<<<<<<r   c              3   d   K   | ]+}t          |                                          d dhk    V  ,dS )rolecontentN)setrU   rX   s     r   rO   z+EvollaProcessor.__call__.<locals>.<genexpr>   sS       D D=>CMMfi%88D D D D D Dr   zlEach message in messages_list should be a list of dictionaries with two keys: 'role' and 'content'.But got: zFThe messages_list should be a list of lists of dictionaries, but it's 	input_idsattention_mask)protein_input_idsprotein_attention_maskr`   ra   )data)r   r   r   rK   rL   listtuplerT   r0   rR   typeanyr9   rE   r   )	r   r4   rF   r   r   r   rB   r8   text_tokenss	            r   __call__zEvollaProcessor.__call__l   s   B }4RSSS3E3Q//W[Wn-<-H//dNb h%% 	" zHmdE]33 	,J}UVGWZ^`eYf<g<g 	,*OMhu.. 	vs:a:aX`:a:a:a7a7a 	vtuuuhu.. 	s ;
 ;
DL;
 ;
 ;
 8
 8
 	 '99/00' '$' '   mdE]33 	)  !(T5M:: v$%tcghpcqcq%t%t%tuuuAAAAAAA $ A   <<8<<<<<  D DBJD D D A A  %/$,/ /   oY]^kYlYlooo   ))(4FGG	''GG%.{%;*34D*E(5"-.>"?	 
 
 
 	
r   c                 &     | j         j        |i |S rI   )r   batch_decoder   argsr   s      r   rl   zEvollaProcessor.batch_decode   s    *t~*D;F;;;r   c                 &     | j         j        |i |S rI   )r   decoderm   s      r   rp   zEvollaProcessor.decode   s    $t~$d5f555r   c                 &     | j         j        |i |S rI   )r   rl   rm   s      r   protein_batch_decodez$EvollaProcessor.protein_batch_decode   s    2t%2DCFCCCr   c                 &     | j         j        |i |S rI   )r   rp   rm   s      r   protein_decodezEvollaProcessor.protein_decode   s    ,t%,d=f===r   c                 ~   | j                             t          j                            || j                             d| j        v }|r| j                            d          nd }|r|| j                            d            t                      j        |fi |}|r|| j        
                    |d           |S )Nr   )r   save_pretrainedospathr0   protein_tokenizer_dir_name
attributesindexremover   insert)r   save_directoryr   protein_tokenizer_presentprotein_tokenizer_indexoutputsr   s         r   rv   zEvollaProcessor.save_pretrained   s    ..rw||NDLk/l/lmmm %84?$J!Pi"s$/"7"78K"L"L"Los$ 	8)@)LO""#6777)%'').CCFCC$ 	Q)@)LO""#:<OPPPr   c                      t                      j        |fi |}t          |t                    r|d         }t	          j        || j                  }||_        |S )Nr   )	subfolder)r   from_pretrainedrK   rf   r	   ry   r   )clspretrained_model_name_or_pathr   	processorr   r   s        r   r   zEvollaProcessor.from_pretrained   so    +EGG+,ITTVTT	 i'' 	%!!I)9)S5S
 
 
 '8	#r   )Nr   r   )r   )r   )NNNN)__name__
__module____qualname____doc__rz   valid_kwargsprotein_tokenizer_classtokenizer_classry   r   r9   intrE   r   r   re   rL   rj   rl   rp   rr   rt   rv   classmethodr   __classcell__)r   s   @r   r   r       s        " &{3J)*L .%O!4
/ 
/ 
/ 
/ 
/ 
/      #     4 7;GK,0)-W
 W
5dT!123W
  d4:&6T
&B CDW
 %SM	W

 "#W
 W
 W
 W
r< < <6 6 6D D D> > >
    (     [    r   r   )r   rw   typingr   r   feature_extraction_utilsr   processing_utilsr   autor	   rR   r   __all__r#   r   r   <module>r      s     
			 " " " " " " " " 4 4 4 4 4 4      !           322 T T T T Tn T T Tn 
r   