
     `ii                        d Z ddlZddlmZmZmZ ddlZddlmZ ddlm	Z	m
Z
mZ ddlmZ ddlmZmZmZmZmZ dd	lmZmZ dd
lmZ ddlmZmZ ddlmZmZmZm Z  ddl!m"Z"m#Z# ddl$m%Z%  e j&        e'          Z(d Z)d Z*d Z+d Z,d Z- G d dej        j.                  Z/ G d dej.                  Z0 G d dej.                  Z1	 	 dHdej.        dej2        dej2        dej2        d eej2                 d!e3d"e3d#eej2                 d$ee         fd%Z4 G d& d'ej.                  Z5 G d( d)ej.                  Z6 G d* d+ej.                  Z7 G d, d-ej.                  Z8 G d. d/ej.                  Z9 G d0 d1e          Z: G d2 d3ej.                  Z; G d4 d5ej.                  Z<e G d6 d7e                      Z=e G d8 d9e=                      Z>e G d: d;e=                      Z? G d< d=ej.                  Z@ ed>?           G d@ dAe=                      ZAe G dB dCe=                      ZB G dD dEej.                  ZCdF ZDg dGZEdS )IzPyTorch ESM model.    N)CallableOptionalUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )GradientCheckpointingLayer)"BaseModelOutputWithCrossAttentions,BaseModelOutputWithPoolingAndCrossAttentionsMaskedLMOutputSequenceClassifierOutputTokenClassifierOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack) find_pruneable_heads_and_indicesprune_linear_layer)TransformersKwargsauto_docstringcan_return_tuplelogging)OutputRecordercheck_model_inputs   )	EsmConfigc                 h    |                      dd          \  }}t          j        | |fd          S )N   dim)chunktorchcat)xx1x2s      x/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/esm/modeling_esm.pyrotate_halfr*   ,   s6    WWQBWFB9rc2YB''''    c                     |d d d d d | j         d         d d f         }|d d d d d | j         d         d d f         }| |z  t          |           |z  z   S )N)shaper*   )r&   cossins      r)   apply_rotary_pos_embr1   1   sn    
aaaMagbkM111$
%C
aaaMagbkM111$
%CGA,--r+   c                 f    | dz  dt          j        | t          j        d          z            z   z  S )zo
    This is the gelu implementation from the original ESM repo. Using F.gelu yields subtly wrong results.
    g      ?      ?g       @)r$   erfmathsqrtr&   s    r)   gelur8   8   s/     s7cEIa$)C..&8999::r+   c                 4    | |                      dd          z   S )zJMake layer symmetric in final two dimensions, used for contact prediction.r    r-   )	transposer7   s    r)   
symmetrizer;   ?   s    q{{2r""""r+   c                     |                      dd          }|                      dd          }|                      dd          }||z  }|                    |           | |z
  }|S )z=Perform average product correct, used for contact prediction.r    T)keepdimsr-   )r    r-   )sumdiv_)r&   a1a2a12avg
normalizeds         r)   average_product_correctrE   D   sh    	
rD	!	!B	
rD	!	!B
%%4%
(
(C
r'CHHSMMMSJr+   c                        e Zd ZU dZej        ed<   def fdZddZ	dej        dej        d	e
ej        ej        f         fd
Z xZS )RotaryEmbeddingz
    Rotary position embeddings based on those in
    [RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer). Query and keys are transformed by rotation
    matrices which depend on their relative positions.
    inv_freqr"   c                    t                                                       ddt          j        d|dt          j                                                  |z  z  z  }|                     d|           d | _        d | _        d | _	        d S )Nr3   i'  r   r   dtyperH   )
super__init__r$   arangeint64floatregister_buffer_seq_len_cached_cos_cached_sin_cached)selfr"   rH   	__class__s      r)   rM   zRotaryEmbedding.__init__Y   s    %ELC%+$N$N$N$T$T$V$VY\$\]^Z222#r+   r   c                 2   |j         |         }|| j        k    s| j        j        |j        k    r|| _        t	          j        |j         |         |j                                      | j                  }t	          j        || j                  }t	          j	        ||fd          
                    |j                  }|                                d d d d d d f         | _        |                                d d d d d d f         | _        | j        | j        fS )Ndevicer    r!   )r.   rR   rS   rY   r$   rN   type_asrH   outerr%   tor/   r0   rT   )rU   r&   seq_dimensionseq_lentfreqsembs          r)   _update_cos_sin_tablesz&RotaryEmbedding._update_cos_sin_tablesc   s    '-( d***d.>.E.Q.Q#*D QW]3AHEEEMMdm\\AK4=11E)UEN33366qx@@C"wwyytQQQ)9:D"wwyytQQQ)9:D!111r+   qkreturnc                    |                      |d          \  | _        | _        t          || j        | j                                      |j                  t          || j        | j                                      |j                  fS )Nr-   )r]   rJ   )rb   rS   rT   r1   r\   rK   )rU   rc   rd   s      r)   forwardzRotaryEmbedding.forwards   s    -1-H-HZ\-H-]-]*$* !D$4d6FGGJJQRQXJYY D$4d6FGGJJQRQXJYY
 	
r+   )r   )__name__
__module____qualname____doc__r$   Tensor__annotations__intrM   rb   tuplerg   __classcell__rV   s   @r)   rG   rG   P   s           l C            2 2 2 2 
 
%, 
5u|A[;\ 
 
 
 
 
 
 
 
r+   rG   c                   8     e Zd ZdZ	 	 ddedef fdZd Z xZS )	EsmContactPredictionHeadzWPerforms symmetrization, apc, and computes a logistic regression on the output featuresTr   in_featureseos_idxc                     t                                                       || _        || _        t	          j        |d|          | _        t	          j                    | _        d S )Nr   )	rL   rM   rt   ru   r   Linear
regressionSigmoid
activation)rU   rt   biasru   rV   s       r)   rM   z!EsmContactPredictionHead.__init__   sP     	&)KD99*,,r+   c                    |                     | j                                      |          }|                    d          |                    d          z  }||d d d d d d d d f         z  }|dd dd df         }|ddd dd f         }|                                \  }}}}}|                    |||z  ||          }|                    | j        j        j                  }t          t          |                    }|                    dddd          }|                     |                     |                              d                    S )Nr   r   .r    r   r
   )neru   r\   	unsqueezesizeviewrx   weightrY   rE   r;   permuterz   squeeze)	rU   tokens
attentionseos_mask
batch_sizelayersheadsseqlen_s	            r)   rg   z EsmContactPredictionHead.forward   sP   99T\**--j99%%a((8+=+=a+@+@@(111dD!!!QQQ+>"??
SbS#2#.
QRR,
/9/@/@,
FE61__Z%PP
  ]]O")
 

 -Z
-C-CDD
''1a33
tz::BB1EEFFFr+   )Tr   )rh   ri   rj   rk   rn   rM   rg   rp   rq   s   @r)   rs   rs   |   sx        aa
 	
' 
'
' 	
' 
' 
' 
' 
' 
'G G G G G G Gr+   rs   c                   8     e Zd ZdZ fdZ	 	 	 	 ddZd Z xZS )EsmEmbeddingszV
    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
    c                    t                                                       t          j        |j        |j        |j                  | _        |j        r&t          j	        |j        |j
                  | _        nd | _        t          j        |j                  | _        t          |dd          | _        |                     dt%          j        |j                                      d          d           |j        | _        | j        dk    r+t          j        |j        |j        | j                  | _        |j        | _        |j        | _        d S )	N)padding_idxepsposition_embedding_typeabsoluteposition_ids)r   r    F)
persistent)rL   rM   r   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingsemb_layer_norm_before	LayerNormlayer_norm_eps
layer_normDropouthidden_dropout_probdropoutgetattrr   rQ   r$   rN   max_position_embeddingsexpandr   position_embeddingstoken_dropoutmask_token_idrU   configrV   s     r)   rM   zEsmEmbeddings.__init__   s:   !|F,=v?Q_e_rsss' 	# l6+=6CXYYYDOO"DOz&"<=='.v7PR\']']$EL)GHHOOPWXXej 	 	
 	
 	
 ".':55')|.0BPTP`( ( (D$ $1#1r+   Nc                    |-|t          || j                  }n|                     |          }||                     |          }|}| j        r||                    || j        k                        d          d          }d}||                    d          n|j	        d         }|| j        k                        d          
                                |z  }|d|z
  z  d|z
  d d d d f         z                      |j                  }| j        dk    r|                     |          }	||	z   }| j        |                     |          }|0||                    d          z                      |j                  }|S )Nr            gQ?r   r   )"create_position_ids_from_input_idsr   &create_position_ids_from_inputs_embedsr   r   masked_fillr   r~   r>   r.   rP   r\   rK   r   r   r   )
rU   	input_idsattention_maskr   inputs_embeds
embeddingsmask_ratio_trainsrc_lengthsmask_ratio_observedr   s
             r)   rg   zEsmEmbeddings.forward   s    $A)TM]^^#JJ=YY  00;;M #
  	)"7#//d>P1P0[0[\^0_0_adeeJ)4B4N.,,R000T]TcdeTfK#,0B#B"G"G"K"K"Q"Q"S"SVa"a$,<(<=EXAXZ[Z[Z[]acgZg@hhll  J ':55"&":":<"H"H#&99J?&44J%$~'?'?'C'CCGG
HXYYJ r+   c                    |                                 dd         }|d         }t          j        | j        dz   || j        z   dz   t          j        |j                  }|                    d                              |          S )z
        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

        Args:
            inputs_embeds: torch.Tensor

        Returns: torch.Tensor
        Nr    r   rK   rY   r   )r   r$   rN   r   longrY   r~   r   )rU   r   input_shapesequence_lengthr   s        r)   r   z4EsmEmbeddings.create_position_ids_from_inputs_embeds   s     $((**3B3/%a.|q /D4D"Dq"HPUPZcpcw
 
 
 %%a((//<<<r+   NNNN)rh   ri   rj   rk   rM   rg   r   rp   rq   s   @r)   r   r      st         2 2 2 2 22 / / / /b= = = = = = =r+   r   r   modulequerykeyvaluer   scalingr   	head_maskkwargsc                    t          j        ||                    dd                    |z  }	t          | d          r.| j        dv r$|j        d         }
t          j        |
t           j        |	j                  	                    dd          }t          j        |
t           j        |	j                  	                    dd          }||z
  }| 
                    || j        z   dz
            }|                    |j                  }| j        d	k    rt          j        d
||          }n<| j        dk    r1t          j        d
||          }t          j        d||          }||z   }|	|z   }	|$|d d d d d d d |j        d         f         }|	|z   }	t          j                            |	dt           j                                      |j                  }	t          j                            |	|| j                  }	||	|z  }	t          j        |	|          }|                    dd                                          }||	fS )Nr   r
   r   relative_keyrelative_key_queryr   r    r   rJ   r   zbhld,lrd->bhlrr   zbhrd,lrd->bhlrr-   )r"   rK   )ptraining)r$   matmulr:   hasattrr   r.   rN   r   rY   r   distance_embeddingr   r\   rK   einsumr   
functionalsoftmaxfloat32r   r   
contiguous)r   r   r   r   r   r   r   r   r   attn_weights
seq_lengthposition_ids_lposition_ids_rdistancepositional_embeddingrelative_position_scoresrelative_position_scores_queryrelative_position_scores_keycausal_maskattn_outputs                       r)   eager_attention_forwardr      sI    <s}}Q':':;;gELv011 ?f6T Y 7 7 [^
j
<K^___ddegijkkj
<K^___ddefhjkk!N2%88FDb9bef9fgg366U[6II)^;;',|4DeMa'b'b$$+/CCC-2\:JESg-h-h*+0<8H#Oc+d+d('EHd'd$#&>>!$QQQ111o	"o%=>#k1=((2U](SSVVW\WbccL=((6?([[L#i/,|U33K''1--88::K$$r+   c                        e Zd Zd fd	Z	 	 	 	 ddej        deej                 deej                 deej                 deej                 d	ee	         d
e
ej                 fdZ xZS )EsmSelfAttentionNFc                    t                                                       || _        |j        |j        z  dk    r0t          |d          s t          d|j         d|j         d          |j        | _        t          |j        |j        z            | _        | j        | j        z  | _	        t          j        |j        | j	                  | _        t          j        |j        | j	                  | _        t          j        |j        | j	                  | _        |j        | _        |pt#          |dd          | _        d | _        | j        dk    s| j        d	k    r7|j        | _        t          j        d
|j        z  dz
  | j                  | _        n%| j        dk    rt/          | j                  | _        d| _        |j        | _        || _        | j        o| | _        d S )Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()r   r   r   r   r   r   rotaryr!   r3   )rL   rM   r   r   num_attention_headsr   
ValueErrorrn   attention_head_sizeall_head_sizer   rw   r   r   r   attention_probs_dropout_probr   r   r   rotary_embeddingsr   r   r   rG   r   
is_decoder	layer_idx	is_causal)rU   r   r   r   is_cross_attentionrV   s        r)   rM   zEsmSelfAttention.__init__1  s    ::a??PVXhHiHi?8F$6 8 8 48 8 8  
 $*#= #&v'9F<V'V#W#W !58PPYv143EFF
9V/1CDDYv143EFF
:'> (
'-zC
 C
$ "&'>99T=Y]q=q=q+1+ID(&(l1v7U3UXY3Y[_[s&t&tD##)X55%49Q%R%R%RD" +"C1C-Cr+   hidden_statesr   r   encoder_hidden_statesencoder_attention_maskr   re   c                    |j         d d         \  }}||d| j        f}	|                     |                              |	                              dd          }
|d u}|r|n|}|r|n|}|                     |                              |	                              dd          }|                     |                              |	                              dd          }|
| j        dz  z  }
| j        dk    r|                     |
|          \  }
}t          }| j
        j        dk    rE| j        dv r%t          d| j
        j         d	| j         d
          t          | j
        j                 } || |
|||f| j        sdn| j        | j        |d|\  }}|                    ||d                                          }||fS )Nr    r   r   g      r   eagerr   zESM z attention does not support z^ embeddings. Set attention explicitly to 'eager' with `model.set_attn_implementation('eager')`r   )r   r   r   )r.   r   r   r   r:   r   r   r   r   r   r   _attn_implementationr   r   r   r   r   reshaper   )rU   r   r   r   r   r   r   r   r   hidden_shapequery_layerr   current_states	key_layervalue_layerattention_interfacer   r   s                     r)   rg   zEsmSelfAttention.forwardS  s    "/!4SbS!9
J"JD4LMjj//44\BBLLQPQRR2$>2DW..-3EY//>HH^,,11,??II!QOO	jj0055lCCMMaQRSS "D$<d$BB'833%)%;%;K%S%S"K(?;+w66+/UUU h4;; h hY]Yu h h h   #:$+:Z"[$7$7
%
  $}>CC$,L
%
 
%
 
%
 
%
!\ "))*j"EEPPRRL((r+   )NNFr   )rh   ri   rj   rM   r$   rl   r   FloatTensorr   r   ro   rg   rp   rq   s   @r)   r   r   0  s         D  D  D  D  D  DJ 7;15=A>B3) 3)|3) !!233) E-.	3)
  ((9:3) !)): ;3) +,3) 
u|	3) 3) 3) 3) 3) 3) 3) 3)r+   r   c                   $     e Zd Z fdZd Z xZS )EsmSelfOutputc                     t                                                       t          j        |j        |j                  | _        t          j        |j                  | _        d S N)	rL   rM   r   rw   r   denser   r   r   r   s     r)   rM   zEsmSelfOutput.__init__  sJ    Yv163EFF
z&"<==r+   c                 d    |                      |          }|                     |          }||z   }|S r   r   r   rU   r   input_tensors      r)   rg   zEsmSelfOutput.forward  4    

=11]33%4r+   rh   ri   rj   rM   rg   rp   rq   s   @r)   r   r     G        > > > > >
      r+   r   c                   H     e Zd Zd fd	Zd Z	 	 	 	 ddee         fdZ xZS )	EsmAttentionNFc                    t                                                       t          |||          | _        t	          |          | _        t                      | _        t          j	        |j
        |j                  | _	        d S )N)r   r   r   )rL   rM   r   rU   r   outputsetpruned_headsr   r   r   r   )rU   r   r   r   rV   s       r)   rM   zEsmAttention.__init__  sk    $VyUghhh	#F++EEf&8f>STTTr+   c                    t          |          dk    rd S t          || j        j        | j        j        | j                  \  }}t          | j        j        |          | j        _        t          | j        j        |          | j        _        t          | j        j	        |          | j        _	        t          | j
        j        |d          | j
        _        | j        j        t          |          z
  | j        _        | j        j        | j        j        z  | j        _        | j                            |          | _        d S )Nr   r   r!   )lenr   rU   r   r   r  r   r   r   r   r
  r   r   union)rU   r   indexs      r)   prune_headszEsmAttention.prune_heads  s    u::??F7490$)2OQUQb
 
u
 -TY_eDD	*49=%@@	,TY_eDD	.t{/@%QOOO )-	(EE

(R	%"&)"?$)B_"_	 -33E::r+   r   c                     |                      |          } | j        |f||||d|\  }}	|                     ||          }|S )Nr   r   r   r   )r   rU   r
  )
rU   r   r   r   r   r   r   hidden_states_lnr   r   s
             r)   rg   zEsmAttention.forward  sk      >>-88"
)"7#9
 
 
 
Q kk+}==r+   )NFr   )	rh   ri   rj   rM   r  r   r   rg   rp   rq   s   @r)   r  r    s        U U U U U U; ; ;* "#  +,       r+   r  c                   B     e Zd Z fdZdej        dej        fdZ xZS )EsmIntermediatec                     t                                                       t          j        |j        |j                  | _        d S r   )rL   rM   r   rw   r   intermediate_sizer   r   s     r)   rM   zEsmIntermediate.__init__  s6    Yv163KLL


r+   r   re   c                 N    |                      |          }t          |          }|S r   )r   r8   )rU   r   s     r)   rg   zEsmIntermediate.forward  s&    

=11]++r+   rh   ri   rj   rM   r$   rl   rg   rp   rq   s   @r)   r  r    sc        M M M M MU\ el        r+   r  c                   $     e Zd Z fdZd Z xZS )	EsmOutputc                     t                                                       t          j        |j        |j                  | _        t          j        |j                  | _	        d S r   )
rL   rM   r   rw   r  r   r   r   r   r   r   s     r)   rM   zEsmOutput.__init__  sJ    Yv79KLL
z&"<==r+   c                 d    |                      |          }|                     |          }||z   }|S r   r  r  s      r)   rg   zEsmOutput.forward  r  r+   r  rq   s   @r)   r  r    r  r+   r  c                   F     e Zd Z fdZ	 	 	 	 ddee         fdZd Z xZS )EsmLayerc                    t                                                       |j        | _        d| _        t	          |          | _        |j        | _        |j        | _        | j        r/| j        st          |  d          t	          |d          | _	        t          |          | _        t          |          | _        t          j        |j        |j                  | _        d S )Nr   z> should be used as a decoder model if cross attention is addedT)r   r   )rL   rM   chunk_size_feed_forwardseq_len_dimr  	attentionr   add_cross_attentionRuntimeErrorcrossattentionr  intermediater  r
  r   r   r   r   r   s     r)   rM   zEsmLayer.__init__  s    '-'E$%f-- +#)#= # 	P? l"d#j#j#jkkk".v$"O"O"OD+F33''f&8f>STTTr+   Nr   c                      | j         |f||d|}| j        r8|6t          | d          st          d|  d           | j        |f||||d|}|                     |          }|S )N)r   r   r'  z'If `encoder_hidden_states` are passed, z` has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`r  )r$  r   r   AttributeErrorr'  feed_forward_chunk)	rU   r   r   r   r   r   r   attention_outputlayer_outputs	            r)   rg   zEsmLayer.forward  s     *4>
)
 
 	
 
 ? 	4@4!122 $`d ` ` `  
  3t2  -#&;'=        ../?@@r+   c                     |                      |          }|                     |          }|                     ||          }|S r   )r   r(  r
  )rU   r,  attention_output_lnintermediate_outputr-  s        r)   r+  zEsmLayer.feed_forward_chunk  sE    "nn-=>>"//0CDD{{#68HIIr+   r   )	rh   ri   rj   rM   r   r   rg   r+  rp   rq   s   @r)   r   r     s        U U U U U$ "#! ! +,! ! ! !F      r+   r   c                   P     e Zd Z fdZe	 	 	 	 ddee         fd            Z xZS )
EsmEncoderc                    t                                                       | _        t          j        fdt          j                  D                       | _        t          j        j	        j
                  | _        d| _        d S )Nc                 .    g | ]}t                    S  )r   ).0r   r   s     r)   
<listcomp>z'EsmEncoder.__init__.<locals>.<listcomp>  s!    #^#^#^HV$4$4#^#^#^r+   r   F)rL   rM   r   r   
ModuleListrangenum_hidden_layerslayerr   r   r   emb_layer_norm_aftergradient_checkpointingr   s    `r)   rM   zEsmEncoder.__init__  s}    ]#^#^#^#^eFD\>]>]#^#^#^__
$&L1CI^$_$_$_!&+###r+   Nr   c           	          t          | j                  D ]\  }}|||         nd }	 ||f||	||d|} | j        r|                     |          }t          |          S )Nr  )last_hidden_state)	enumerater;  r<  r   )
rU   r   r   r   r   r   r   ilayer_modulelayer_head_masks
             r)   rg   zEsmEncoder.forward   s      )44 		 		OA|.7.CillO(L-)&;'=   MM $ 	E 55mDDM1MRRRRr+   r   )	rh   ri   rj   rM   r   r   r   rg   rp   rq   s   @r)   r2  r2    s        , , , , ,  "#S S +,S S S S S S S Sr+   r2  c                   B     e Zd Z fdZdej        dej        fdZ xZS )	EsmPoolerc                     t                                                       t          j        |j        |j                  | _        t          j                    | _        d S r   )rL   rM   r   rw   r   r   Tanhrz   r   s     r)   rM   zEsmPooler.__init__=  sC    Yv163EFF
'))r+   r   re   c                 r    |d d df         }|                      |          }|                     |          }|S Nr   )r   rz   )rU   r   first_token_tensorpooled_outputs       r)   rg   zEsmPooler.forwardB  s@     +111a40

#56666r+   r  rq   s   @r)   rE  rE  <  s^        $ $ $ $ $
U\ el        r+   rE  c                       e Zd ZU eed<   dZdZdZg dZdgZ	dZ
dZdZdZe eedd	          g eedd
	          gdZd Zd ZdS )EsmPreTrainedModelr   esmTF)r   #EsmFoldTriangularSelfAttentionBlockr   zposition_embeddings.weightr   r$  )r  
layer_namer'  )r   r   cross_attentionsc                    t          |t          j                  rT|j        j                            d| j        j                   |j         |j        j        	                                 dS dS t          |t          j
                  r_|j        j                            d| j        j                   |j        +|j        j        |j                 	                                 dS dS t          |t          j                  r?|j        j        	                                 |j        j                            d           dS t          |t                    r |j        j        	                                 dS dS )zInitialize the weightsr   )meanstdNr3   )
isinstancer   rw   r   datanormal_r   initializer_ranger{   zero_r   r   r   fill_	EsmLMHead)rU   r   s     r)   _init_weightsz EsmPreTrainedModel._init_weightsa  sX   fbi(( 	% M&&CT[5R&SSS{& &&((((( '&-- 	%M&&CT[5R&SSS!-"6#56<<>>>>> .--- 	%K""$$$M$$S)))))	** 	%K""$$$$$	% 	%r+   c                     d S r   r5  rU   s    r)   get_output_embeddingsz(EsmPreTrainedModel.get_output_embeddingss  s	     tr+   N)rh   ri   rj   r   rm   base_model_prefixsupports_gradient_checkpointingaccepts_loss_kwargs_no_split_modules"_keys_to_ignore_on_load_unexpected_supports_flash_attn_supports_sdpa_supports_flex_attn_supports_attention_backendr   r   r   _can_record_outputsr\  r_  r5  r+   r)   rM  rM  K  s         &*#\\\*F)G&N"& "%~&6aKXXXYN+1AQRRR
 % % %$    r+   rM  c                   d    e Zd ZdZd fd	Zd Zd Zd Zee		 	 	 	 	 	 	 dde
ej                 d	e
ej                 d
e
ej                 de
ej                 de
ej                 de
ej                 de
ej                 dee         deeej                 ef         fd                        Zd Z xZS )EsmModela  

    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
    cross-attention is added between the self-attention layers, following the architecture described in [Attention is
    all you need](https://huggingface.co/papers/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
    Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.

    To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
    to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and
    `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.
    Tc                 F   t                                          |           || _        t          |          | _        t          |          | _        |rt          |          nd| _        t          |j
        |j        z  d          | _        |                                  dS )zv
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        NT)rt   r{   )rL   rM   r   r   r   r2  encoderrE  poolerrs   r:  r   contact_head	post_init)rU   r   add_pooling_layerrV   s      r)   rM   zEsmModel.__init__  s    
 	   '//!&))+<Fi'''$4063MMTX
 
 

 	r+   c                     | j         j        S r   r   r   r^  s    r)   get_input_embeddingszEsmModel.get_input_embeddings  s    ..r+   c                     || j         _        d S r   rs  )rU   r   s     r)   set_input_embeddingszEsmModel.set_input_embeddings  s    */'''r+   c                     |                                 D ]/\  }}| j        j        |         j                            |           0dS )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsrm  r;  r$  r  )rU   heads_to_pruner;  r   s       r)   _prune_headszEsmModel._prune_heads  sU    
 +0022 	C 	CLE5Lu%/;;EBBBB	C 	Cr+   Nr   r   r   r   r   r   r   r   re   c                    |du |duz  rt          d          ||                     ||          }| j        j        dk    rJ|j        dd         \  }	}
|t          j        |	|
f|j                  }|                     ||	|
f          }| j        j	        rQ|O|
                                \  }}}||f}|t          j        ||j                  }|                     |          }nd}|                     || j        j                  } | j        |f||||d|}|d	         }| j        |                     |          nd}t!          ||
          S )aV  
        input_ids (`torch.LongTensor` of shape `((batch_size, sequence_length))`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        position_ids (`torch.LongTensor` of shape `((batch_size, sequence_length))`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `((batch_size, sequence_length), hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        Nz:You must specify exactly one of input_ids or inputs_embeds)r   r   flash_attention_2r    rX   )r   r  r   )r?  pooler_output)r   r   r   r   r.   r$   onesrY   get_extended_attention_maskr   r   invert_attention_maskget_head_maskr:  rm  rn  r   )rU   r   r   r   r   r   r   r   r   r   r   encoder_batch_sizeencoder_sequence_lengthr   encoder_hidden_shapeencoder_extended_attention_maskencoder_outputssequence_outputrK  s                      r)   rg   zEsmModel.forward  s   > -t";< 	[YZZZ  OO#) ,  M
 ;+/BBB%2%8"%="J
%!&j*-E}Oc!d!d!d+/+K+KZ,D ,L , ,N ;! 	3&;&G=R=W=W=Y=Y: 7$68O#P %-).4HQ^Qe)f)f)f&.2.H.HI_.`.`++.2+ &&y$+2OPP	&$,
)"7#B
 
 
 
 *!,8<8OO444UY;-'
 
 
 	
r+   c                 z    | ||dd          j         }t          j        |d          }||                    d                              d                              d          z  }||                    d                              d                              d          z  }|                     ||          S )NT)r   return_dictoutput_attentionsr   r!   r   r
      )r   r$   stackr~   ro  )rU   r   r   attnss       r)   predict_contactszEsmModel.predict_contacts  s    VN`deeepEq)))
 	))!,,66q99CCAFFF))!,,66q99CCAFFF  ///r+   )T)NNNNNNN)rh   ri   rj   rk   rM   rt  rv  rz  r   r   r   r$   rl   r   r   r   ro   r   rg   r  rp   rq   s   @r)   rk  rk  y  s}       
 
     (/ / /0 0 0C C C  -115/3,0048<9=O
 O
EL)O
 !.O
 u|,	O

 EL)O
  -O
  (5O
 !) 6O
 +,O
 
uU\"$PP	QO
 O
 O
 ^ O
b	0 	0 	0 	0 	0 	0 	0r+   rk  c                   d    e Zd ZdgZ fdZd Zd Zee	 	 	 	 	 	 	 	 dde	e
j                 de	e
j                 de	e
j                 d	e	e
j                 d
e	e
j                 de	e
j                 de	e
j                 de	e
j                 dee         deeef         fd                        Zd Z xZS )EsmForMaskedLMzlm_head.decoder.weightc                 0   t                                          |           |j        rt                              d           t          |d          | _        t          |          | _        | 	                                 | 
                                 d S )NzjIf you want to use `EsmForMaskedLM` make sure `config.is_decoder=False` for bi-directional self-attention.Frq  )rL   rM   r   loggerwarningrk  rN  r[  lm_headinit_weightsrp  r   s     r)   rM   zEsmForMaskedLM.__init__  s        	NN1  
 Fe<<< ((r+   c                     | j         j        S r   r  decoderr^  s    r)   r_  z$EsmForMaskedLM.get_output_embeddings  s    |##r+   c                     || j         _        d S r   r  )rU   new_embeddingss     r)   set_output_embeddingsz$EsmForMaskedLM.set_output_embeddings  s    -r+   Nr   r   r   r   r   r   r   labelsr   re   c	           
      r    | j         |f||||||d|	}
|
d         }|                     |          }d}|et                      }|                    |j                  } ||                    d| j        j                  |                    d                    }t          |||
j	        |
j
                  S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        )r   r   r   r   r   r   r   Nr    losslogitsr   r   )rN  r  r   r\   rY   r   r   r   r   r   r   )rU   r   r   r   r   r   r   r   r  r   outputsr  prediction_scoresmasked_lm_lossloss_fcts                  r)   rg   zEsmForMaskedLM.forward"  s    * $(	
)%'"7#9	
 	
 	
 	
 "!* LL99'))HYY0788F%X&7&<&<RAW&X&XZ`ZeZefhZiZijjN$!/)	
 
 
 	
r+   c                 :    | j                             ||          S )N)r   )rN  r  )rU   r   r   s      r)   r  zEsmForMaskedLM.predict_contactsR  s    x(((OOOr+   )NNNNNNNN)rh   ri   rj   _tied_weights_keysrM   r_  r  r   r   r   r$   
LongTensorrl   r   r   r   r   ro   r   rg   r  rp   rq   s   @r)   r  r    so       23     $ $ $. . .  151537,059=A9=-1,
 ,
E,-,
 !.,
 u/0	,

 EL),
   12,
  ((9:,
 !) 6,
 )*,
 +,,
 
un$	%,
 ,
 ,
 ^ ,
\P P P P P P Pr+   r  c                   (     e Zd ZdZ fdZd Z xZS )r[  z&ESM Head for masked language modeling.c                    t                                                       t          j        |j        |j                  | _        t          j        |j        |j                  | _        t          j        |j        |j	        d          | _
        t          j        t          j        |j	                            | _        d S )Nr   F)r{   )rL   rM   r   rw   r   r   r   r   r   r   r  	Parameterr$   zerosr{   r   s     r)   rM   zEsmLMHead.__init__Y  s    Yv163EFF
,v'9v?TUUUy!3V5FUSSSLV->!?!?@@			r+   c                     |                      |          }t          |          }|                     |          }|                     |          | j        z   }|S r   )r   r8   r   r  r{   rU   featuresr   r&   s       r)   rg   zEsmLMHead.forwarda  sL    JJx  GGOOA LLOOdi'r+   rh   ri   rj   rk   rM   rg   rp   rq   s   @r)   r[  r[  V  sR        00A A A A A      r+   r[  z
    ESM Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
    output) e.g. for GLUE tasks.
    )custom_introc                       e Zd Z fdZee	 	 	 	 	 	 ddeej                 deej	                 deej                 deej	                 deej
                 deej                 d	ee         d
eeef         fd                        Z xZS )EsmForSequenceClassificationc                    t                                          |           |j        | _        || _        t	          |d          | _        t          |          | _        |                                  | 	                                 d S NFr  )
rL   rM   
num_labelsr   rk  rN  EsmClassificationHead
classifierr  rp  r   s     r)   rM   z%EsmForSequenceClassification.__init__r  sw        +Fe<<</77r+   Nr   r   r   r   r   r  r   re   c                     | j         |f||||d|}|d         }	|                     |	          }
d}|t|                    |
j                  }| j        j        f| j        dk    rd| j        _        nN| j        dk    r7|j        t          j	        k    s|j        t          j
        k    rd| j        _        nd| j        _        | j        j        dk    rWt                      }| j        dk    r1 ||
                                |                                          }n ||
|          }n| j        j        dk    rGt                      } ||
                    d| j                  |                    d                    }n*| j        j        dk    rt                      } ||
|          }t!          ||
|j        |j        	          S )
a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        r   r   r   r   r   Nr   rx   single_label_classificationmulti_label_classificationr    r  )rN  r  r\   rY   r   problem_typer  rK   r$   r   rn   r	   r   r   r   r   r   r   r   rU   r   r   r   r   r   r  r   r  r  r  r  r  s                r)   rg   z$EsmForSequenceClassification.forward~  s   & $(
)%'
 
 
 
 "!*11YYv}--F{'/?a''/;DK,,_q((flej.H.HFL\a\eLeLe/LDK,,/KDK,{'<77"99?a''#8FNN$4$4fnn6F6FGGDD#8FF33DD)-JJJ+--xB @ @&++b//RR)-III,..x//'!/)	
 
 
 	
r+   NNNNNN)rh   ri   rj   rM   r   r   r   r$   r  rl   r   r   r   r   ro   r   rg   rp   rq   s   @r)   r  r  k  s	       
 
 
 
 
  151537,059-1:
 :
E,-:
 !.:
 u/0	:

 EL):
   12:
 )*:
 +,:
 
u..	/:
 :
 :
 ^ :
 :
 :
 :
 :
r+   r  c                       e Zd Z fdZee	 	 	 	 	 	 ddeej                 deej	                 deej                 deej	                 deej
                 deej                 d	ee         d
eeef         fd                        Z xZS )EsmForTokenClassificationc                 b   t                                          |           |j        | _        t          |d          | _        t          j        |j                  | _        t          j	        |j
        |j                  | _        |                                  |                                  d S r  )rL   rM   r  rk  rN  r   r   r   r   rw   r   r  r  rp  r   s     r)   rM   z"EsmForTokenClassification.__init__  s        +Fe<<<z&"<==)F$68IJJr+   Nr   r   r   r   r   r  r   re   c                     | j         |f||||d|}|d         }	|                     |	          }	|                     |	          }
d}|`t                      }|                    |
j                  } ||
                    d| j                  |                    d                    }t          ||
|j	        |j
                  S )z
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        r  r   Nr    r  )rN  r   r  r   r\   rY   r   r  r   r   r   r  s                r)   rg   z!EsmForTokenClassification.forward  s    " $(
)%'
 
 
 
 "!*,,7711'))HYYv}--F8FKKDO<<fkk"ooNND$!/)	
 
 
 	
r+   r  )rh   ri   rj   rM   r   r   r   r$   r  rl   r   r   r   r   ro   r   rg   rp   rq   s   @r)   r  r    s	       
 
 
 
 
  151537,059-1)
 )
E,-)
 !.)
 u/0	)

 EL))
   12)
 )*)
 +,)
 
u++	,)
 )
 )
 ^ )
 )
 )
 )
 )
r+   r  c                   (     e Zd ZdZ fdZd Z xZS )r  z-Head for sentence-level classification tasks.c                    t                                                       t          j        |j        |j                  | _        t          j        |j                  | _        t          j        |j        |j	                  | _
        d S r   )rL   rM   r   rw   r   r   r   r   r   r  out_projr   s     r)   rM   zEsmClassificationHead.__init__  sc    Yv163EFF
z&"<==	&"4f6GHHr+   c                     |d d dd d f         }|                      |          }|                     |          }t          j        |          }|                      |          }|                     |          }|S rI  )r   r   r$   tanhr  r  s       r)   rg   zEsmClassificationHead.forward  sj    QQQ111WLLOOJJqMMJqMMLLOOMM!r+   r  rq   s   @r)   r  r    sR        77I I I I I      r+   r  c                     |                      |                                          }t          j        |d                              |          |z  }|                                |z   S )a  
    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
    are ignored. This is modified from fairseq's `utils.make_positions`.

    Args:
        x: torch.Tensor x:

    Returns: torch.Tensor
    r   r!   )r}   rn   r$   cumsumrZ   r   )r   r   maskincremental_indicess       r)   r   r     s`     <<$$((**D,t333;;DAADH##%%33r+   )r  r  r  rk  rM  )r   N)Frk   r5   typingr   r   r   r$   r   torch.nnr   r   r	   modeling_layersr   modeling_outputsr   r   r   r   r   modeling_utilsr   r   processing_utilsr   pytorch_utilsr   r   utilsr   r   r   r   utils.genericr   r   configuration_esmr   
get_loggerrh   r  r*   r1   r8   r;   rE   ModulerG   rs   r   rl   rP   r   r   r   r  r  r  r   r2  rE  rM  rk  r  r[  r  r  r  r   __all__r5  r+   r)   <module>r     s#       , , , , , , , , , ,        A A A A A A A A A A 9 9 9 9 9 9              G F F F F F F F & & & & & & Q Q Q Q Q Q Q Q R R R R R R R R R R R R ? ? ? ? ? ? ? ? ( ( ( ( ( ( 
	H	%	%( ( (
. . .; ; ;# # #
	 	 	)
 )
 )
 )
 )
eho )
 )
 )
X G  G  G  G  Gry  G  G  GF\= \= \= \= \=BI \= \= \=L (,/% /%I/%</% 
/% <	/%
 U\*/% /% /% %/% '(/% /% /% /%dV) V) V) V) V)ry V) V) V)r
 
 
 
 
BI 
 
 
- - - - -29 - - -`    bi   
 
 
 
 
	 
 
 
7 7 7 7 7) 7 7 7t S  S  S  S  S  S  S  SH    	    * * * * * * * *Z K0 K0 K0 K0 K0! K0 K0 K0\ JP JP JP JP JP' JP JP JPZ    	   *   I
 I
 I
 I
 I
#5 I
 I
 I
X 8
 8
 8
 8
 8
 2 8
 8
 8
v    BI   &4 4 4   r+   